Example #1
    # called once per environment step: linearly anneal epsilon for epsilon-greedy exploration
    def on_step(self):
        k = max(self.eps_decay - self.num_timesteps, 0) / self.eps_decay
        self.eps = self.eps_final + k * (self.eps_init - self.eps_final)
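        # Quick sanity check of the linear schedule above (hypothetical values,
        # not from the original source): with eps_init=1.0, eps_final=0.05 and
        # eps_decay=10_000, eps is 1.0 at step 0, 0.525 at step 5_000, and
        # stays at 0.05 from step 10_000 onwards.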

    # Used for inference and evaluation of the model; returns the selected action(s)
    def predict(self, x, deterministic=True):
        out = self.qnet(x)
        if deterministic:
            out = torch.max(out, dim=1)[1]
        else:
            # with probability self.eps take a random action, otherwise the greedy one
            eps = torch.rand_like(out[:, 0])
            eps = (eps < self.eps).float()
            out = eps * torch.rand_like(out).max(dim=1)[1] +\
                (1 - eps) * out.max(dim=1)[1]
        return out.long().cpu().numpy()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=3e-4)
        return optimizer


if __name__ == '__main__':
    model = Model(env='CartPole-v1', eval_env='CartPole-v1')

    trainer = pl.Trainer(max_epochs=20, gradient_clip_val=0.5)
    trainer.fit(model)

    rewards, lengths = model.evaluate(num_eval_episodes=10, render=True)
    print(np.mean(rewards), np.mean(lengths))
def main(args):
    # We first set the Random Seed for Everything.
    # Recommended to use seed=42 as that will give best results according to Douglas Adams
    pl.seed_everything(args.seed)

    # Data Augmentation Pipeline
    # Uses HSV color jitter to focus network on textural rather than color features.
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomHorizontalFlip(),  # Do Random Flip
            transforms.RandomRotation(
                degrees=360),  # Rotate the image as orientation doesn't matter
            transforms.RandomCrop(
                (512, 512)),  # Crop to a smaller size randomly
            transforms.ColorJitter(hue=0.5),  # Color Jitter the hue
            transforms.Resize(
                (224, 224)),  # Resize to Resnet or VGG input Size
            transforms.Normalize(
                mean=MEAN_TRAIN, std=STD_TRAIN
            )  # Normalize to Imagenet Pixel Value Distribution
        ]),
        'val':
        transforms.Compose([
            transforms.CenterCrop((512, 512)),  # Center Crop
            transforms.Resize((224, 224)),  # Resize
            transforms.Normalize(
                mean=MEAN_TRAIN,
                std=STD_TRAIN)  # Normalize to pretrained imagenet weights
        ])
    }
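    # Note: torchvision's Normalize operates on tensors; if ThinSectionDataset yields
    # PIL images rather than tensors, a transforms.ToTensor() would normally be placed
    # before Normalize (an assumption about the dataset's output type, not a change made here).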

    # Set base working path
    base_path = Path("..")

    # Initialize the training and Validation Datasets
    train_dataset_base = ThinSectionDataset(base_path,
                                            args.labelset,
                                            preload_images=True,
                                            transform=data_transforms['train'],
                                            train=True,
                                            seed=args.seed)

    val_dataset = ThinSectionDataset(base_path,
                                     args.labelset,
                                     preload_images=True,
                                     transform=data_transforms['val'],
                                     train=False,
                                     seed=args.seed)

    train_loader = DataLoader(train_dataset_base,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=args.pin_memory)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            pin_memory=args.pin_memory)

    if args.plot:
        visualize_batch(train_loader)
        visualize_batch(val_loader)

    # Use the Weights And Biases Logger
    wandb_logger = WandbLogger(name='lukas-mosser',
                               project='neural-rock',
                               entity='ccg')
    wandb_logger.experiment.config.update(args)

    # Checkpoint Models based on Validation F1 score
    checkpointer = ModelCheckpoint(monitor="val/f1", verbose=True, mode="max")

    # Initialize the Pytorch Lightning Trainer
    trainer = pl.Trainer(gpus=-1,
                         max_epochs=None,
                         logger=[wandb_logger],
                         callbacks=[checkpointer],
                         log_every_n_steps=args.log_every_n_steps,
                         distributed_backend=args.distributed_backend,
                         max_steps=args.steps,
                         benchmark=args.benchmark)

    # Make the Model
    if args.model == 'vgg':
        feature_extractor, classifier = make_vgg11_model(
            num_classes=train_dataset_base.num_classes, dropout=args.dropout)

    elif args.model == 'resnet':
        feature_extractor, classifier = make_resnet18_model(
            num_classes=train_dataset_base.num_classes, dropout=args.dropout)

    # Initialize the Neural Rock Model with the feature extractor and classifier
    model = NeuralRockModel(
        feature_extractor,
        classifier,
        train_dataset_base.num_classes,
        freeze_feature_extractor=args.freeze_feature_extractor)

    # Run actual training
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)


def infer_image_size(image_size):
    image_size = image_size.split(",")

    if len(image_size) > 2:
        image_size = image_size[:2]

    if len(image_size) == 1:
        image_size.append(image_size[0])

    image_size = list(map(int, image_size))
    return image_size
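
# Illustrative behaviour of infer_image_size (derived from the code above, not from the source docs):
#   infer_image_size("512")       -> [512, 512]   (a single value is duplicated)
#   infer_image_size("640,480")   -> [640, 480]
#   infer_image_size("640,480,3") -> [640, 480]   (extra values are dropped)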


if __name__ == "__main__":
    args = parse_args()

    test_dataset = TestDataset(root=args.root, image_size=infer_image_size(args.image_size))
    test_dataloader = DataLoader(
        test_dataset, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True, shuffle=False
    )

    model = RetinaNet.from_checkpoint(checkpoint_path=args.weights, name=args.name)
    logger = pl.loggers.TensorBoardLogger(args.output_dir, name="predictions")
    trainer = pl.Trainer(gpus=args.gpus, logger=logger)

    trainer.test(model, test_dataloaders=test_dataloader)
Example #4
        return [optim], [scheduler]


# %%
# Train the MoCo model
# --------------------
#
# We can instantiate the model and train it using the
# lightning trainer.

# use a GPU if available
gpus = 1 if torch.cuda.is_available() else 0

model = MocoModel()
trainer = pl.Trainer(max_epochs=max_epochs,
                     gpus=gpus,
                     progress_bar_refresh_rate=100)
trainer.fit(model, dataloader_train_moco)

# %%
# Train the Classifier
model.eval()
classifier = Classifier(model.backbone)
trainer = pl.Trainer(max_epochs=max_epochs,
                     gpus=gpus,
                     progress_bar_refresh_rate=100)
trainer.fit(classifier, dataloader_train_classifier, dataloader_test)

# %%
# Check out the TensorBoard logs while the model is training.
#
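# For example, one could launch TensorBoard from a terminal with
# `tensorboard --logdir lightning_logs` (the default log directory used by the
# Lightning trainer when no explicit logger is configured; path is an assumption here).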
Example #5
    def train(
        self,
        train_data: Union[str, TokenDataset],
        output_dir: str = "trained_model",
        fp16: bool = False,
        fp16_opt_level: str = "O1",
        n_gpu: int = -1,
        tpu_cores: int = 0,
        max_grad_norm: float = 0.5,
        gradient_accumulation_steps: int = 1,
        seed: int = None,
        learning_rate: float = 1e-3,
        weight_decay: float = 0.05,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        num_steps: int = 5000,
        save_every: int = 1000,
        generate_every: int = 1000,
        n_generate: int = 1,
        loggers: List = None,
        batch_size: int = 1,
        num_workers: int = None,
        benchmark: bool = True,
        avg_loss_smoothing: float = 0.01,
        save_gdrive: bool = False,
        run_id: str = f"ATG_{datetime.utcnow():%Y%m%d_%H%M%S}",
        progress_bar_refresh_rate: int = 20,
        freeze_layers: bool = False,
        num_layers_freeze: int = None,
        use_deepspeed: bool = False,
        **kwargs,
    ) -> None:
        """
        Trains/finetunes the model on the provided file/dataset using pytorch-lightning.

        :param train_data: Either a TokenDataset containing the samples to be trained, or
        a string containing the text to be trained (shortcut instead of dataset)
        :param output_dir: A string indicating where to store the resulting
        model file folder.
        :param fp16: Boolean whether to use fp16, assuming using a compatible GPU/TPU.
        :param fp16_opt_level: Option level for FP16/APEX training.
        :param n_gpu: Number of GPU to use (-1 implies all available GPUs)
        :param tpu_cores: Number of TPU cores to use (should be a multiple of 8)
        :param max_grad_norm: Maximum gradient normalization
        :param gradient_accumulation_steps: Number of gradient acc steps
        :param seed: Integer representing the training seed.
        :param learning_rate: Training learning rate for the default AdamW optimizer.
        :param weight_decay: Weight decay for the default AdamW optimizer.
        :param warmup_steps: Warmup steps for the default AdamW optimizer.
        :param num_steps: Number of samples through the dataset.
        :param save_every: Number of steps for each time to save the model to disk
        :param generate_every: Number of steps for each time to generate sample text
        :param n_generate: Number of texts to generate when generate_every occurs.
        :param loggers: pytorch-lightning logger(s) to log results.
        :param batch_size: Number of input samples per batch
        :param num_workers: Number of DataLoader workers
        :param benchmark: If using GPU, whether to use cudnn.benchmark.
        :param avg_loss_smoothing: Smoothing factor for Avg loss in progress bar
        :param save_gdrive: If using Colab, whether to copy the trained model
        to Google Drive at each save_every
        :param run_id: Run identifier; used for save_gdrive
        :param progress_bar_refresh_rate: How often to update
        the progress bar while training.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if save_gdrive:
            assert (
                "google.colab" in sys.modules
            ), "You must be in Colaboratory to copy to your Google Drive"
            create_gdrive_folder(run_id)

        self.model = self.model.train()
        is_gpu_used = torch.cuda.is_available() and n_gpu != 0

        if isinstance(train_data, str):
            block_size = model_max_length(self.model.config)
            logger.info(
                f"Loading text from {train_data} with generation length of {block_size}."
            )
            train_data = TokenDataset(
                tokenizer=self.tokenizer,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                file_path=train_data,
                block_size=block_size,
                **kwargs,
            )

        if freeze_layers or self.openai_tf_gpt2 == "1558M":
            logger.info("Layer freezing enabled for model training.")
            freeze_layers = True
            if num_layers_freeze:
                assert (
                    num_layers_freeze < self.model.config.n_layer
                ), "You are freezing more Transformer layers than in the model."

        if num_workers is None:
            # Use all CPU cores as workers if not training on CPU
            if is_gpu_used or tpu_cores > 0:
                num_workers = os.cpu_count()
            # If training on the CPU, use half the CPUs
            else:
                num_workers = int(os.cpu_count() / 2)

        hparams = dict(
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            warmup_steps=warmup_steps,
            batch_size=batch_size,
            num_steps=num_steps,
            pin_memory=is_gpu_used,
            num_workers=num_workers,
            save_every=save_every,
            generate_every=generate_every,
            use_tpu=tpu_cores > 0,
        )

        # Wrap the model in a pytorch-lightning module
        train_model = ATGTransformer(self.model, train_data, hparams, self.tokenizer)

        # Begin training
        if seed:
            set_seed(seed)

        if os.path.exists(output_dir) and "pytorch_model.bin" in os.listdir(output_dir):
            logger.warning(
                f"pytorch_model.bin already exists in /{output_dir} and will be overwritten!"
            )

        # if a GPU was requested but CUDA is unavailable, fall back to the CPU
        if not is_gpu_used:
            n_gpu = 0

        # force single-GPU on Windows
        if platform.system() == "Windows" and is_gpu_used and n_gpu != 1:
            logger.warning(
                "Windows does not support multi-GPU training. Setting to 1 GPU."
            )
            n_gpu = 1

        # use the DeepSpeed plugin if installed and specified
        deepspeed_plugin = None
        if is_gpu_used and use_deepspeed:
            deepspeed_plugin = DeepSpeedPlugin()
            logger.info("Using DeepSpeed training.")
            if not fp16:
                logger.info("Setting FP16 to True for DeepSpeed ZeRO Training.")
                fp16 = True

        train_params = dict(
            accumulate_grad_batches=gradient_accumulation_steps,
            gpus=n_gpu,
            max_steps=num_steps,
            gradient_clip_val=max_grad_norm,
            checkpoint_callback=False,
            logger=loggers if loggers else False,
            weights_summary=None,
            progress_bar_refresh_rate=progress_bar_refresh_rate,  # ignored
            callbacks=[
                ATGProgressBar(
                    save_every,
                    generate_every,
                    output_dir,
                    n_generate,
                    is_gpu_used,
                    avg_loss_smoothing,
                    run_id,
                    save_gdrive,
                    progress_bar_refresh_rate,
                    freeze_layers,
                    num_layers_freeze,
                )
            ],
            plugins=deepspeed_plugin,
        )

        if fp16:
            train_params["precision"] = 16 if fp16 else 32
            train_params["amp_level"] = fp16_opt_level

        if tpu_cores > 0:
            train_params["tpu_cores"] = tpu_cores
            train_params["gpus"] = 0
            n_gpu = 0

        # benchmark gives a boost for GPUs if input size is constant,
        # which will always be the case with aitextgen training
        if is_gpu_used and benchmark:
            train_params["benchmark"] = True

        if n_gpu > 1:
            train_params["distributed_backend"] = "ddp"

        trainer = pl.Trainer(**train_params)
        trainer.fit(train_model)

        logger.info(f"Saving trained model pytorch_model.bin to /{output_dir}")

        self.model.save_pretrained(output_dir)

        if save_gdrive:
            for pt_file in ["pytorch_model.bin", "config.json"]:
                shutil.copyfile(
                    os.path.join(output_dir, pt_file),
                    os.path.join("/content/drive/My Drive/", run_id, pt_file),
                )

        if seed:
            reset_seed()
Example #6
def main():
    pl.seed_everything(42)  # set seed

    # Argument Setting -------------------------------------------------------------------------------------------------
    parser = argparse.ArgumentParser()

    # mode specific --------------------------------------------------------------------------------
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to train text classifier.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to predict on real dataset.")

    # model specific -------------------------------------------------------------------------------
    parser.add_argument("--text_reader",
                        help="bert, kobert, koelectra, others, ...",
                        default="bert")

    # experiment settings --------------------------------------------------------------------------
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")  # bert has 512 tokens.
    parser.add_argument("--batch_size",
                        help="batch_size",
                        default=32,
                        type=int)
    parser.add_argument("--gpu_id", help="gpu device id", default="0")

    parser = pl.Trainer.add_argparse_args(parser)
    parser = NER.add_model_specific_args(parser)
    args = parser.parse_args()
    # ------------------------------------------------------------------------------------------------------------------

    # Dataset ----------------------------------------------------------------------------------------------------------
    from dataset import NER_Data_Module
    dm = NER_Data_Module("ner", args.text_reader, args.max_seq_length,
                         args.batch_size)
    dm.prepare_data()
    # ------------------------------------------------------------------------------------------------------------------

    # Model Checkpoint -------------------------------------------------------------------------------------------------
    from pytorch_lightning.callbacks import ModelCheckpoint
    model_name = '{}'.format(args.text_reader)
    model_folder = './model/{}/{}'.format("ner", model_name)
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=model_folder,
        filename='{epoch:02d}-{val_loss:.2f}')
    # ------------------------------------------------------------------------------------------------------------------

    # Early Stopping ---------------------------------------------------------------------------------------------------
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        patience=3,
                                        verbose=True)
    # ------------------------------------------------------------------------------------------------------------------

    # Trainer ----------------------------------------------------------------------------------------------------------
    trainer = pl.Trainer(
        gpus=args.gpu_id
        if platform.system() != 'Windows' else 1,  # <-- for dev. pc
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback])
    # ------------------------------------------------------------------------------------------------------------------

    # Do train !
    if args.do_train:
        model = NER("ner", args.text_reader, dm.num_labels, dm.label_vocab)
        trainer.fit(model, dm)

    # Do predict !
    if args.do_predict:
        model_files = glob(os.path.join(model_folder, '*.ckpt'))
        best_fn = model_files[-1]
        model = NER.load_from_checkpoint(best_fn)
        trainer.test(model, test_dataloaders=[dm.test_dataloader()])
Example #7
    if args.cfg is None:
        raise Exception('Cfg not specified')

    cfg_name = args.cfg.split('/')[-1]

    module = LyftModule(args.cfg)
    if args.resume is not None:
        print("loading from", args.resume)
        module = LyftModule.load_from_checkpoint(args.resume)
    default_root_dir = '/var/data/hdd1/{}/lyft_checkpoints/'.format(cfg_name)

    checkpoint_callback = ModelCheckpoint(
        filepath=default_root_dir,
        # filepath='/var/data/lyft_checkpoints/',
        save_top_k=5,
        verbose=True,
        monitor='avg_val_loss',
        mode='min',
        prefix='_')

    early_stop = EarlyStopping(monitor='avg_val_loss',
                               verbose=True,
                               patience=10,
                               mode='min')
    print('using default root dir', default_root_dir)
    trainer = pl.Trainer(gpus=1,
                         max_epochs=cfg['train_params']['epochs'],
                         default_root_dir=default_root_dir,
                         callbacks=[early_stop, checkpoint_callback])
    trainer.fit(module)
Example #8
def _integration(data_with_covariates,
                 tmp_path,
                 gpus,
                 data_loader_kwargs=None,  # avoid a mutable default argument
                 train_only=False,
                 **kwargs):
    data_loader_default_kwargs = dict(
        target="target",
        time_varying_known_reals=["price_actual"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["agency"],
        add_relative_time_idx=True,
    )
    data_loader_default_kwargs.update(data_loader_kwargs or {})
    dataloaders_with_covariates = make_dataloaders(
        data_with_covariates, **data_loader_default_kwargs)
    train_dataloader = dataloaders_with_covariates["train"]
    val_dataloader = dataloaders_with_covariates["val"]
    test_dataloader = dataloaders_with_covariates["test"]
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=1e-4,
                                        patience=1,
                                        verbose=False,
                                        mode="min",
                                        strict=False)

    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(
        max_epochs=3,
        gpus=gpus,
        gradient_clip_val=0.1,
        callbacks=[early_stop_callback],
        enable_checkpointing=True,
        default_root_dir=tmp_path,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        logger=logger,
    )

    net = DecoderMLP.from_dataset(train_dataloader.dataset,
                                  learning_rate=0.015,
                                  log_gradient_flow=True,
                                  log_interval=1000,
                                  hidden_size=10,
                                  **kwargs)
    net.size()
    try:
        if train_only:
            trainer.fit(net, train_dataloaders=train_dataloader)
        else:
            trainer.fit(
                net,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader,
            )
        # check loading
        net = DecoderMLP.load_from_checkpoint(
            trainer.checkpoint_callback.best_model_path)

        # check prediction
        net.predict(val_dataloader,
                    fast_dev_run=True,
                    return_index=True,
                    return_decoder_lengths=True)
        # check test dataloader
        test_outputs = trainer.test(net, dataloaders=test_dataloader)
        assert len(test_outputs) > 0
    finally:
        shutil.rmtree(tmp_path, ignore_errors=True)

    net.predict(val_dataloader,
                fast_dev_run=True,
                return_index=True,
                return_decoder_lengths=True)
Example #9
else:
    resume_from = None

lit = LitModel(opt)
# warning grad_clip_mode is ignored.
trainer = pl.Trainer(
    callbacks=[
        OnEpochStartCallback(),
        pl.callbacks.lr_logger.LearningRateLogger()
    ],
    default_root_dir=opt.checkpoint_path,
    resume_from_checkpoint=resume_from,
    distributed_backend='ddp',
    check_val_every_n_epoch=1,
    max_epochs=opt.max_epochs,
    gradient_clip_val=opt.grad_clip_value,
    gpus=torch.cuda.device_count(),
    checkpoint_callback=checkpoint_callback,
    log_gpu_memory='min_max',
    log_save_interval=opt.losses_log_every,
    profiler=True,
    row_log_interval=10,  # legacy PL argument: how often to add logging rows
    num_sanity_val_steps=0,
    # limit_train_batches=500,
    # progress_bar_refresh_rate=0,
    # fast_dev_run=True,
)

if os.getenv('EVALUATE', '0') == '1':
    trainer.test(lit)
else:
    trainer.fit(lit)
def main():

    # ------------
    # args
    # ------------
    torch.manual_seed(0)
    pl.seed_everything(0)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
    parser.add_argument('--batch-size', type=int, dest='batch_size', default=50, help='mini batch size for training')
    parser.add_argument('--epoch', type=int, dest='epoch', default=10, help='epoch size for training')
    parser.add_argument('--learning-rate', type=float, dest='learning_rate', default=0.001, help='learning rate')
    parser.add_argument('--momentum', type=float, dest='momentum', default=0.9, help='momentum')
    parser.add_argument('--model-name', type=str, dest='model_name', default='resnet', help='Fine-tuning model name')
    parser.add_argument('--optimizer', type=str, dest='optimizer', default='SGD', help='Optimizer to use for training.')
    parser.add_argument('--criterion', type=str, dest='criterion', default='cross_entropy', help='Loss Function to use for training.')
    parser.add_argument('--feature_extract', type=bool, dest='feature_extract', default=True, help='Flag for feature extracting. When False, we finetune the whole model, when True we only update the reshaped layer params')

    args = parser.parse_args()

    args.num_workers=8

    data_folder = args.data_folder
    print('training dataset is stored here:', data_folder)

    input_size = 224
    if args.model_name == "inception":
        input_size = 299
    # ---------------------------
    # Azure Machine Learning
    # 1) get Azure ML run context and log hyperparameters
    # ---------------------------
    run = Run.get_context()
    run.log('model_name', args.model_name)
    run.log('optimizer', args.optimizer)
    run.log('criterion', args.criterion)

    run.log('lr', float(args.learning_rate))
    run.log('momentum', float(args.momentum))

    # For your tagging
#    run.tag('description', 'xxx')

    # ------------
    # data
    # ------------

    transform = transforms.Compose([
                    # Augmentation
    #                transforms.RandomHorizontalFlip(),
    #                transforms.RandomVerticalFlip(),
                    transforms.RandomAffine(degrees=[-10, 10], translate=(0.1, 0.1), scale=(0.5, 1.5)),
                    transforms.RandomRotation(degrees=10),
                    # Resize
                    transforms.Resize(int(input_size * 1.3)),
                    transforms.CenterCrop(input_size),
                    # Tensor
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
    ]
    )

    dataset = torchvision.datasets.ImageFolder(args.data_folder, transform)
    args.num_classes = len(dataset.classes)

    n_train = int(len(dataset) * 0.7)
    n_val = int(len(dataset) * 0.15)
    n_test = len(dataset) - n_train - n_val

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_val, n_test])
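    # e.g. with a hypothetical folder of 1000 images: n_train=700, n_val=150, n_test=150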

    train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size, shuffle=True, drop_last=True, num_workers=args.num_workers)
    val_loader = torch.utils.data.DataLoader(val_dataset, args.batch_size, num_workers=args.num_workers)
    test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size)

    # Initialize the model for this run
    model_ft, input_size = initialize_model(args.model_name, args.num_classes, feature_extract=args.feature_extract , use_pretrained=True)
    model = FineTurningModel(args, model_ft)

    # GPU Configuration
    num_gpu = torch.cuda.device_count()
    print('num_gpu:', num_gpu)

    accelerator = None
    if num_gpu > 1:
        accelerator='ddp' # only for Single Machine

    # ------------
    # training
    # ------------
    trainer = pl.Trainer(max_epochs=args.epoch, gpus=num_gpu, accelerator=accelerator)
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # Test (Not Validation)
    # ------------
    test_result = trainer.test(test_dataloaders=test_loader)
    test_result

    run.log('test_acc', [res["test_acc"] for res in test_result][0])
    run.log('test_loss', [res["test_loss"] for res in test_result][0])
    run.log('test_acc_epoch', [res["test_acc_epoch"] for res in test_result][0])
    run.log('test_loss_epoch', [res["test_loss_epoch"] for res in test_result][0])

    # ------------
    # save model
    # ------------
    outputdir = './outputs/model'
    os.makedirs(outputdir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(outputdir, 'model.dict'))
    torch.save(model, os.path.join(outputdir, 'model.pt'))
Example #11
    cfg = parse_config(args.cfg_file)
    data_cfg = get_data_info(cfg['data'])
    cfg['data'] = data_cfg
    args.cfg = cfg
    ckpt_fd = "{}".format(args.output_directory) + "/{epoch:02d}_{train_mAP:.3f}_{val_mAP:.3f}"
    ckpt_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
        filepath=ckpt_fd,
        verbose=True, save_top_k=-1
    )
    es_cb = pl.callbacks.EarlyStopping("val_mAP", mode="max", verbose=True, patience=10)

    mixer = mixers.BackgroundAddMixer()

    args.tr_mixer = mixers.UseMixerWithProb(mixer, args.mixer_prob)

    tr_tfs = get_transforms_fsd_chunks(True, 101)
    val_tfs = get_transforms_fsd_chunks(False, 101)

    args.tr_tfs = tr_tfs
    args.val_tfs = val_tfs

    net = FSD50k_Lightning(args)
    precision = 16 if args.fp16 else 32
    trainer = pl.Trainer(gpus=args.gpus, max_epochs=args.epochs,
                         precision=precision, accelerator="dp",
                         num_sanity_val_steps=4170,
                         callbacks=[ckpt_callback, es_cb],
                         resume_from_checkpoint=args.resume_from,
                         logger=TensorBoardLogger(args.log_directory))
    trainer.fit(net)
def main():
    dm = DataModule(training_path=cfg['path']['training_path'],
                    validation_path=cfg['path']['validation_path'],
                    test_path=cfg['path']['test_path'],
                    num_workers=cfg['num_workers'],
                    size=cfg['size'],
                    batch_size=cfg['batch_size'],
                    means=cfg['means'],
                    std=cfg['std'])
    model = CustomTrainClass(model_train=cfg['model_train'],
                             num_classes=cfg['num_classes'],
                             diffaug_activate=cfg['diffaug_activate'],
                             policy=cfg['policy'],
                             aug=cfg['aug'])

    # skipping validation with limit_val_batches=0
    if not cfg['use_amp']:
        trainer = pl.Trainer(
            num_sanity_val_steps=0,
            stochastic_weight_avg=cfg['use_swa'],
            log_every_n_steps=50,
            resume_from_checkpoint=cfg['path']['checkpoint_path'],
            check_val_every_n_epoch=9999999,
            logger=None,
            gpus=cfg['gpus'],
            max_epochs=cfg['max_epochs'],
            progress_bar_refresh_rate=cfg['progress_bar_refresh_rate'],
            default_root_dir=cfg['default_root_dir'],
            callbacks=[
                CheckpointEveryNSteps(
                    save_step_frequency=cfg['save_step_frequency'],
                    save_path=cfg['path']['checkpoint_save_path'])
            ])
    if cfg['use_amp']:
        trainer = pl.Trainer(
            num_sanity_val_steps=0,
            stochastic_weight_avg=cfg['use_swa'],
            log_every_n_steps=50,
            resume_from_checkpoint=cfg['path']['checkpoint_path'],
            check_val_every_n_epoch=9999999,
            logger=None,
            gpus=cfg['gpus'],
            precision=16,
            amp_level='O1',
            max_epochs=cfg['max_epochs'],
            progress_bar_refresh_rate=cfg['progress_bar_refresh_rate'],
            default_root_dir=cfg['default_root_dir'],
            callbacks=[
                CheckpointEveryNSteps(
                    save_step_frequency=cfg['save_step_frequency'],
                    save_path=cfg['path']['checkpoint_save_path'])
            ])

    if cfg['path']['pretrain']:
        import torch
        model.netD.load_state_dict(torch.load(cfg['path']['pretrain']),
                                   strict=False)
        print("Pretrain pth loaded!")

    #############################################
    # Loading a Model
    #############################################
    # For resuming training
    if cfg['path']['checkpoint_path'] is not None:
        # load from checkpoint (optional) (using a model as pretrain and disregarding other parameters)
        #model = model.load_from_checkpoint(checkpoint_path)  # start training from checkpoint; warning: apparently global_step is reset to zero and validation images get overwritten, so you may need to add a manual offset

        # continue training with checkpoint (does restore values) (optional)
        # https://github.com/PyTorchLightning/pytorch-lightning/issues/2613
        # https://pytorch-lightning.readthedocs.io/en/0.6.0/pytorch_lightning.trainer.training_io.html
        # https://github.com/PyTorchLightning/pytorch-lightning/issues/4333
        # dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'callbacks', 'optimizer_states', 'lr_schedulers', 'state_dict', 'hparams_name', 'hyper_parameters'])

        # To use DDP for local multi-GPU training, you need to add find_unused_parameters=True inside the DDP command
        model = model.load_from_checkpoint(cfg['path']['checkpoint_path'])
        #trainer = pl.Trainer(resume_from_checkpoint=checkpoint_path, logger=None, gpus=cfg['gpus'], max_epochs=cfg['datasets']['train']['max_epochs'], progress_bar_refresh_rate=cfg['progress_bar_refresh_rate'], default_root_dir=cfg['default_root_dir'], callbacks=[CheckpointEveryNSteps(save_step_frequency=cfg['datasets']['train']['save_step_frequency'], save_path = cfg['path']['checkpoint_save_path'])])
        checkpoint = torch.load(cfg['path']['checkpoint_path'])
        trainer.checkpoint_connector.restore(checkpoint, on_gpu=True)
        trainer.checkpoint_connector.restore_training_state(checkpoint)
        pl.Trainer.global_step = checkpoint['global_step']
        pl.Trainer.epoch = checkpoint['epoch']
        print("Checkpoint was loaded successfully.")

    #############################################

    trainer.fit(model, dm)
Example #13
    monitor="val_loss", 
    mode="min", 
    save_top_k=5)

    train_params = dict(
        accumulate_grad_batches=args.gradient_accumulation_steps,
        gpus=args.gpus,
        max_epochs=args.num_train_epochs,
        #early_stop_callback=True,
        gradient_clip_val=args.max_grad_norm,
        checkpoint_callback=checkpoint_callback,
        callbacks=[LoggingCallback()],
    )

    if args.n_gpu > 1:
        train_params["distributed_backend"] = "dp"

    #tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path)

    # initialize model
    model = SeqGenSQL(args)

    # restore full training state
    # trainer = pl.Trainer(resume_from_checkpoint='t5_checkpoints/epoch=15.ckpt', gpus=1, )
    # multi GPUs: 
    #trainer = pl.Trainer(resume_from_checkpoint='t5_checkpoints/base_gated_e03_0.2470.ckpt', **train_params)

    trainer = pl.Trainer(**train_params)

    # Train
    trainer.fit(model) 
Example #14
    a = SSAD(confs)

    checkpoint_dir = os.path.join(confs["log_dir"], 'checkpoints/')
    checkpoint = ModelCheckpoint(checkpoint_dir,
                                 monitor='val_loss',
                                 mode='min',
                                 verbose=True,
                                 save_top_k=5)

    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        patience=20,
                                        verbose=True,
                                        mode='min')

    with open(os.path.join(confs["log_dir"], "confs.yml"), "w") as f:
        yaml.dump(confs, f)

    logger = TensorBoardLogger(os.path.dirname(confs["log_dir"]),
                               confs["log_dir"].split("/")[-1])

    trainer = pl.Trainer(
        max_nb_epochs=confs["training"]["n_epochs"],
        gpus=confs["gpus"],
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=confs["training"]["accumulate_batches"],
        early_stop_callback=early_stop_callback,
        logger=logger,
        gradient_clip=bool(confs["training"]["gradient_clip"]),
        gradient_clip_val=confs["training"]["gradient_clip"])
    trainer.fit(a)
Example #15
def evaluate(args):
    """Evaluates a detection model based on a configuration

    :param args: argparse arguments specifying input configuration
        and output folder
    """

    # create data paths dict
    data_paths = dict()
    data_paths['train_images'] = args.train_images
    data_paths['train_labels'] = args.train_labels
    data_paths['test_images'] = args.test_images
    data_paths['test_labels'] = args.test_labels

    resume_ckpt = args.ckpt_path
    batch_size = args.batch_size
    minibatch_size = args.minibatch_size
    sampler = args.sampler
    lr = args.lr
    epochs = args.epochs
    no_classes = args.classes
    class_weights = args.weights
    input_size = args.input_size
    anchors = args.anchors
    output = args.output
    transform_norm_parameters = args.transform_norm_parameters

    logger = pl_loggers.TensorBoardLogger(
        '{}/custom_ssd_ckpt/logs'.format(output))

    # add background class
    no_classes = no_classes + 1

    # initialize model
    model, bbox_encoder = create_detection_model(anchors,
                                                 input_size=input_size,
                                                 no_classes=no_classes)

    loss = ODLoss(minibatch_size, sampler, class_weights)

    # initialize detection model
    train_model = None
    if os.path.isdir(resume_ckpt):
        tmodel_params = dict()
        tmodel_params['model'] = model
        tmodel_params['input_size'] = input_size
        tmodel_params['lr'] = lr
        tmodel_params['epochs'] = epochs
        tmodel_params['loss'] = loss

        resume_ckpt = io_ops.get_best_ckpt(resume_ckpt)
        train_model = DetectionModel.load_from_checkpoint(
            resume_ckpt, **tmodel_params)
    else:
        print("Checkpoint path is not a directory")
        exit(-1)

    # create checkpoint-creation callback
    checkpoint_callback = ModelCheckpoint(monitor='test_loss',
                                          save_top_k=3,
                                          save_last=True,
                                          mode='min')

    # initialize trainer
    trainer = pl.Trainer(logger=logger,
                         max_epochs=epochs,
                         gpus=1,
                         num_sanity_val_steps=0,
                         resume_from_checkpoint=resume_ckpt,
                         limit_train_batches=0,
                         limit_val_batches=0,
                         weights_save_path='{}/custom_rpn_ckpt'.format(output),
                         weights_summary='full',
                         callbacks=[checkpoint_callback])

    # initialize data module
    data_module = DataModule(batch_size, input_size, data_paths, no_classes,
                             bbox_encoder, transform_norm_parameters)

    # train model
    trainer.fit(train_model, data_module)

    # test model
    trainer.test(test_dataloaders=data_module.test_dataloader())

    version_number = trainer.logger.version
    version_path = '{}/custom_ssd_ckpt/default/version_{}'.format(
        output, version_number)

    if not os.path.exists(version_path):
        os.mkdir(version_path)

    print('Version is: {}'.format(version_path))

    # save all metrics in json
    preds = train_model.get_test_preds()

    io_ops.save_dict(preds, os.path.join(version_path, 'test_preds.json'))

    # coco evaluator
    cocoGt = COCO(data_paths['test_labels'])
    cocoDt = cocoGt.loadRes(os.path.join(version_path, 'test_preds.json'))

    annType = 'bbox'

    imgIds = sorted(cocoGt.getImgIds())

    cocoEval = COCOeval(cocoGt, cocoDt, annType)
    cocoEval.params.imgIds = imgIds
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
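    # By pycocotools convention, cocoEval.stats[0] is AP @ IoU=0.50:0.95 and
    # cocoEval.stats[1] is AP @ IoU=0.50, hence the metric names logged below.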

    train_model.log('mAP IoU=0.50:0.95', round(cocoEval.stats[0] * 1, 2))
    train_model.log('mAP IoU=0.50', round(cocoEval.stats[1] * 1, 2))
Example #16
                                     batch_size=64,
                                     num_workers=4)

    try:
        path = config["checkpoint"]
        experiment = Experiment.load_from_checkpoint(path)
    except KeyError:
        model_hpparams = model_config(config)
        print(model_hpparams)
        experiment = Experiment(**model_hpparams)

    trainer_config = config["trainer"]
    logger = TensorBoardLogger(prefix)
    if trainer_config == "tune":
        trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                             logger=logger,
                             callbacks=[checkpoint_callback],
                             auto_lr_find=True)
        trainer.tune(experiment, train_dataloader, validate_dataloader)
    else:
        try:
            path = config["checkpoint"]
            trainer = pl.Trainer(resume_from_checkpoint=path,
                                 gpus=1 if torch.cuda.is_available() else 0,
                                 logger=logger,
                                 callbacks=[checkpoint_callback],
                                 max_epochs=4000)
        except KeyError:
            trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                                 logger=logger,
                                 callbacks=[checkpoint_callback],
                                 max_epochs=4000)
Example #17
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)
    train_config["batch_size"] = 256
    # train_config["accumulate_grad_batches"] = 12
    train_config["gradient_clip_val"] = 1.5
    train_config["learning_rate"] = 1e-4

    pl.seed_everything(42)

    wandb_logger = WandbLogger(name='PAN13_12-24-24_KUEP',
                               project='AVDV_PAN13')
    wandb_save(wandb_logger, train_config)

    model = LightningLongformerCLS(train_config)
    # model = LightningLongformerCLS.load_from_checkpoint("AVDV/2npel9bz/checkpoints/epoch=7-step=2639.ckpt", config=train_config)

    cp_valloss = ModelCheckpoint(save_top_k=5, monitor='val_loss', mode='min')
    trainer = pl.Trainer(
        max_epochs=train_config["epochs"],
        # accumulate_grad_batches=train_config["accumulate_grad_batches"],
        gradient_clip_val=train_config["gradient_clip_val"],
        gpus=[5],
        num_nodes=1,
        # accelerator='ddp',
        amp_backend='native',
        precision=16,
        logger=wandb_logger,
        log_every_n_steps=1,
        val_check_interval=0.5,
        limit_val_batches=40,
        checkpoint_callback=cp_valloss)

    trainer.fit(model)
Example #19
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def train_dataloader(self):
        train_ds = ImageListDs(images=train.images.values,
                               labels=train.labels.values,
                               aug=train_aug)
        n = int(len(train_ds) / 10)
        sampler = RandomSampler(data_source=train_ds)
        train_loader = DataLoader(train_ds,
                                  shuffle=True,
                                  num_workers=12,
                                  batch_size=BATCH_SIZE)
        return train_loader

    def val_dataloader(self):
        valid_ds = ImageListDs(images=valid.images.values,
                               labels=valid.labels.values,
                               aug=valid_aug)
        valid_loader = DataLoader(valid_ds,
                                  shuffle=False,
                                  num_workers=12,
                                  batch_size=BATCH_SIZE)
        return valid_loader


torch.multiprocessing.freeze_support()
deep_fake_module = DeepFakeModule()

trainer = pl.Trainer(gpus=2, distributed_backend='dp', max_epochs=2)
trainer.fit(deep_fake_module)
    @torch.no_grad()
    def get_resnet_layers(self, x):
        activations = self.resnet_extractor(x)
        # activation_transform = {
        #     'early': nn.AvgPool2d(kernel_size=(2, 2), stride=(2, 2)),
        #     'middle': lambda x: x,
        #     'deep': nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
        # }
        activation_transform = {
            'early':
            lambda x: x,
            'middle':
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
            'deep':
            nn.Upsample(scale_factor=4, mode="bilinear", align_corners=True),
        }
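        # Assuming a standard torchvision ResNet backbone, layer2/3/4 have output
        # strides of 8/16/32, so upsampling layer3 by 2x and layer4 by 4x brings all
        # three feature maps to layer2's spatial resolution before they are returned.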
        return {
            'early': activation_transform['early'](activations["layer2"]),
            'middle': activation_transform['middle'](activations["layer3"]),
            'deep': activation_transform['deep'](activations["layer4"])
        }


if __name__ == "__main__":
    autoencoder = CorrespondenceEncoder()
    tb_logger = TensorBoardLogger('tb_logs',
                                  name='correspondence_encoder_lr1e3')
    trainer = pytorch_lightning.Trainer(
        logger=tb_logger, gpus=1 if torch.cuda.is_available() else None)
    dm = CorrespondenceDataModule()
    trainer.fit(autoencoder, dm)
Example #21
    tst = tvt_dataset(dataset.source_tst, dataset.wizard_tst,
                      dataset.target_tst)

    trn_loader = DataLoader(train,
                            batch_size=hparams.batch_size,
                            collate_fn=pad_sequence)
    val_loader = DataLoader(val,
                            batch_size=hparams.batch_size,
                            collate_fn=pad_sequence)
    tst_loader = DataLoader(tst,
                            batch_size=hparams.batch_size,
                            collate_fn=pad_sequence)

    model = Task(hparams, dataset)

    logger = TensorBoardLogger('lightning_logs', name='sync')
    trainer = pl.Trainer(
        progress_bar_refresh_rate=10,
        logger=logger,
        max_epochs=hparams.max_epoch,
        auto_lr_find=True,
        # gpus = '0', #hparams.cuda,
        # accelerator = 'ddp',
        gradient_clip_val=hparams.clip,
    )

    trainer.fit(model, trn_loader, val_loader)
    trainer.test(model, tst_loader, ckpt_path=None)    # test with the current in-memory weights
    trainer.test(model, tst_loader, ckpt_path='best')  # test with the best checkpoint
    print(f'Random seed = {hparams.seed}.')
Example #22
    def test_trainer_loggers(self, cleanup_local_folder, tmp_path):
        """ Test that a trainer with logger errors out with a number of arguments. Test that it works with
        create_tensorboard_logger set to False
        """
        test_trainer = pl.Trainer()  # Should create logger and modelcheckpoint

        with pytest.raises(LoggerMisconfigurationError
                           ):  # Fails because exp_manager defaults to trainer
            exp_manager(test_trainer, {"exp_dir": str(tmp_path)})
        with pytest.raises(LoggerMisconfigurationError
                           ):  # Fails because exp_manager defaults to trainer
            exp_manager(test_trainer, {"explicit_log_dir": str(tmp_path)})
        with pytest.raises(LoggerMisconfigurationError
                           ):  # Fails because exp_manager defaults to trainer
            exp_manager(test_trainer, {"resume_if_exists": True})

        # Check that exp_manager uses trainer.logger, its exp_dir, name, and version
        log_dir = exp_manager(
            test_trainer, {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False
            })
        assert log_dir.resolve() == Path(
            "./lightning_logs/version_0").resolve()
        assert Path("./lightning_logs").exists()
        assert Path("./lightning_logs/version_0").exists()

        # Check that a trainer without a logger gets a logger attached to it
        test_trainer = pl.Trainer(logger=False)
        log_dir = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": True,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path)
            },
        )
        assert isinstance(test_trainer.logger, pl.loggers.TensorBoardLogger)

        test_trainer = pl.Trainer(logger=False)
        # Check that create_wandb_logger=True errors out unless wandb_logger_kwargs is passed.
        with pytest.raises(ValueError):
            log_dir = exp_manager(
                test_trainer,
                {
                    "create_tensorboard_logger": False,
                    "create_checkpoint_callback": False,
                    "exp_dir": str(tmp_path),
                    "create_wandb_logger": True,
                },
            )
        # Check that a WandbLogger is attached to logger if create_wandb_logger=True and wandb_logger_kwargs has name
        # and project
        log_dir = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path),
                "create_wandb_logger": True,
                "wandb_logger_kwargs": {
                    "name": "",
                    "project": ""
                },
            },
        )
        assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)
Example #23
        return tg.loader.DataLoader(list(self.dataset),
                                    batch_size=self.batch_size,
                                    num_workers=self.num_workers,
                                    pin_memory=False,
                                    shuffle=True)

    def val_dataloader(self):
        return tg.loader.DataLoader(list(self.dataset),
                                    batch_size=self.batch_size,
                                    num_workers=self.num_workers,
                                    pin_memory=False,
                                    shuffle=True)


if __name__ == '__main__':
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    data_dir = os.path.join('GraphCoAttention', 'data')
    wandb.init()
    wandb_logger = WandbLogger(project='flux', log_model='all')
    trainer = pl.Trainer(gpus=[0],
                         max_epochs=2000,
                         check_val_every_n_epoch=500,
                         accumulate_grad_batches=1)
    trainer.fit(
        Learner(data_dir,
                bs=20,
                lr=0.0005,
                n_cycles=30,
                hidden_dim=25,
                n_head=4))
Example #24
    def test_resume(self, tmp_path):
        """ Tests the resume capabilities of exp_manager"""
        test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)

        # Error because explicit_log_dir does not exist
        with pytest.raises(NotFoundError):
            exp_manager(
                test_trainer,
                {
                    "exp_dir": str(tmp_path / "test_resume"),
                    "resume_if_exists": True,
                    "explicit_log_dir": "Does_not_exist",
                },
            )

        # Error because checkpoints folder does not exist
        with pytest.raises(NotFoundError):
            exp_manager(test_trainer, {
                "resume_if_exists": True,
                "exp_dir": str(tmp_path / "test_resume")
            })

        # No error because we tell exp_manager to ignore the NotFoundError
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "exp_dir": str(tmp_path / "test_resume_2"),
                "resume_ignore_no_checkpoint": True,
            },
        )

        test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)
        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints").mkdir(parents=True)
        # Error because checkpoints do not exist in folder
        with pytest.raises(NotFoundError):
            exp_manager(
                test_trainer,
                {
                    "resume_if_exists":
                    True,
                    "explicit_log_dir":
                    str(tmp_path / "test_resume" / "default" / "version_0"),
                },
            )

        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints" / "mymodel--end.ckpt").touch()
        # Error because *end.ckpt is in folder indicating that training has already finished
        with pytest.raises(ValueError):
            exp_manager(
                test_trainer,
                {
                    "resume_if_exists":
                    True,
                    "explicit_log_dir":
                    str(tmp_path / "test_resume" / "default" / "version_0"),
                },
            )

        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints" / "mymodel--end.ckpt").unlink()
        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints" / "mymodel--last.ckpt").touch()
        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints" / "mymodel2--last.ckpt").touch()
        # Error because multiple *last.ckpt is in folder. If more than one, don't know which to restore
        with pytest.raises(ValueError):
            exp_manager(
                test_trainer,
                {
                    "resume_if_exists":
                    True,
                    "explicit_log_dir":
                    str(tmp_path / "test_resume" / "default" / "version_0"),
                },
            )

        # Finally succeed
        Path(tmp_path / "test_resume" / "default" / "version_0" /
             "checkpoints" / "mymodel2--last.ckpt").unlink()
        log_dir = exp_manager(
            test_trainer,
            {
                "resume_if_exists":
                True,
                "explicit_log_dir":
                str(tmp_path / "test_resume" / "default" / "version_0")
            },
        )
        checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" /
                          "checkpoints" / "mymodel--last.ckpt")
        assert Path(test_trainer.resume_from_checkpoint).resolve(
        ) == checkpoint.resolve()

        # Succeed again and make sure that run_0 exists and previous log files were moved
        test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)
        exp_manager(test_trainer, {
            "resume_if_exists": True,
            "explicit_log_dir": str(log_dir)
        })
        checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" /
                          "checkpoints" / "mymodel--last.ckpt")
        assert Path(test_trainer.resume_from_checkpoint).resolve(
        ) == checkpoint.resolve()
        prev_run_dir = Path(tmp_path / "test_resume" / "default" /
                            "version_0" / "run_0")
        assert prev_run_dir.exists()
        prev_log = Path(tmp_path / "test_resume" / "default" / "version_0" /
                        "run_0" / "lightning_logs.txt")
        assert prev_log.exists()
Example #25
        max_seq_length=128,
        num_workers=8,
        num_preprocess_processes=96,
        use_sentence_selection=True,
        best_k_sentences=5,
    )
    # checkpoint_callback = ModelCheckpoint(
    #     dirpath='./result/checkpoints/',
    #     filename='epoch{epoch:02d}',
    #     save_top_k=-1,
    # )
    trainer = pl.Trainer(
        logger=tb_logger,
        gpus=-1 if torch.cuda.is_available() else None,
        # callbacks=[checkpoint_callback],
        amp_backend='native',
        amp_level='O2',
        precision=16,
        accelerator='ddp',
        gradient_clip_val=1.0,
        max_epochs=1,
        plugins='ddp_sharded',
        val_check_interval=0.2,
        # limit_train_batches=0.1,
        # limit_val_batches=0.1,
        # accumulate_grad_batches=2,
    )
    trainer.fit(model, dm)
    torch.save(model.model.state_dict(), 'pytorch_model.bin')
    trainer.test(datamodule=dm)
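
If the exported weights need to be reloaded later, a minimal sketch could look like the following (it assumes the LightningModule is re-instantiated the same way as above and that the inner network lives at `model.model`, as the save call implies):

import torch

# Restore the raw state dict exported after training into a freshly built model.
state_dict = torch.load('pytorch_model.bin', map_location='cpu')
model.model.load_state_dict(state_dict)
model.eval()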
def train_pl():
    # Square linear
    dataset = MinatarDataset(name="dataset_random_3000_bullet_matched.json")
    # dataset = MinatarDataset(name="dataset_random_3000_new_matched.json")
    # dataset = MinatarDataset(name="dataset_random_3000_full_matched.json")
    # dataset = MinatarDataset(name="asterix_dataset_random_3000.json")
    dim_dict = dataset.get_dims()
    env_len = dim_dict["action_len"]
    obj_in_len = dim_dict["obj_len"]
    type_len = dim_dict["type_len"]

    # Prepare the dataloader
    dataset_size = len(dataset)
    train_size = int(dataset_size * 0.8)
    train_set, val_set = torch.utils.data.random_split(
        dataset, [train_size, dataset_size - train_size])
    train_data_loader = DataLoader(
        train_set, batch_size=1, num_workers=8,
        shuffle=True)  # num_workers=8, pin_memory=True,
    val_data_loader = DataLoader(val_set,
                                 batch_size=1,
                                 num_workers=8,
                                 pin_memory=True)

    # Initialize the model
    # model = SetDSPN(
    #     obj_in_len=obj_in_len,
    #     obj_reg_len=2,
    #     obj_type_len=type_len,
    #     env_len=env_len,
    #     latent_dim=64,
    #     out_set_size=3,
    #     n_iters=10,
    #     internal_lr=50,
    #     overall_lr=1e-3,
    #     loss_encoder_weight=1
    # )

    model = SetTransformer(obj_in_len=obj_in_len,
                           obj_reg_len=2,
                           obj_type_len=type_len,
                           env_len=env_len,
                           out_set_size=3,
                           learning_rate=1e-4)

    # Early stop callback
    # early_stop_callback = EarlyStopping(
    #     monitor='val_loss',
    #     min_delta=0.00,
    #     patience=3,
    #     verbose=False,
    #     mode='min'
    # )

    # Native train
    # optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # for i, batch in enumerate(train_data_loader):
    #     print(i)
    #     s, a, sprime, sappear, r = batch
    #     s, a, sappear = s.to(model.device), a.to(model.device), sappear.to(model.device)
    #     pred = model(s, a)
    #     losses = model.loss_fn(pred, sappear)
    #
    #     optimizer.zero_grad()
    #     losses['loss_encoder'].backward()
    #     optimizer.step()
    #     pass

    # Train (precision=16 below assumes at least one CUDA device is available)
    gpus = torch.cuda.device_count()
    trainer = pl.Trainer(
        gpus=min(gpus, 1),  # train on a single GPU when one is available
        precision=16,
        max_epochs=16,
        # check_val_every_n_epoch=4,
        accumulate_grad_batches=64,
        profiler="simple",
        auto_lr_find=True,
        log_every_n_steps=5,
        # callbacks=[early_stop_callback]
    )

    run_lr_finder = False
    if run_lr_finder:
        # Find a good learning rate
        lr_finder = trainer.tuner.lr_find(model,
                                          train_dataloader=train_data_loader,
                                          val_dataloaders=val_data_loader,
                                          max_lr=0.1,
                                          min_lr=1e-5)
        # Inspect the raw results
        print(lr_finder.results)

        # Plot the loss vs. learning-rate curve
        fig = lr_finder.plot(suggest=True)
        fig.show()

        # Pick a point based on the plot, or take the suggestion and apply it
        # (assumes `learning_rate` is the attribute read in configure_optimizers)
        new_lr = lr_finder.suggestion()
        model.learning_rate = new_lr
    else:
        trainer.fit(model, train_data_loader, val_data_loader)

        # Evaluate
        # trainer.test(model, test_dataloaders = val_data_loader)
        evaluate(model=model)
Example #27
0
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
    accelerator: Optional[str] = "ddp",
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        accelerator: Backend for distributed training. Only DDP
            and DDP_CPU are supported.
        eval_only: True if run evaluation only.
    """
    assert num_processes == 1 or num_gpus == 0, \
        "Only set num_processes > 1 when training on CPUs"

    maybe_override_output_dir(cfg, output_dir)

    task = build_task(cfg, task_cls)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)
    trainer_params = {
        # training loop is bounded by max steps, use a large max_epochs to make
        # sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0 else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": accelerator,
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
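
A call site for this entry point might look like the sketch below; the config construction is an assumption (`load_config` and the YAML path are hypothetical), since only the `CfgNode` type is visible here:

cfg = load_config("configs/faster_rcnn.yaml")  # hypothetical helper returning a CfgNode
train_output = main(
    cfg,
    output_dir="./output",
    num_machines=1,
    num_gpus=1,
    num_processes=1,
    accelerator="ddp",
)
print(train_output.tensorboard_log_dir, train_output.accuracy)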
Example #28
0
def train_mnist(config):
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(max_epochs=10, show_progress_bar=False)

    trainer.fit(model)
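
The function expects a plain config dict; a hypothetical invocation might look like this (the keys are assumptions about what `LightningMNISTClassifier` reads):

config = {
    "lr": 1e-3,          # hypothetical hyperparameter keys
    "batch_size": 64,
    "layer_1_size": 128,
    "layer_2_size": 256,
}
train_mnist(config)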
Example #29
0
def main(conf):
    train_set = WhamDataset(
        conf["data"]["train_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        segment=conf["data"]["segment"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )
    val_set = WhamDataset(
        conf["data"]["valid_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )

    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    # Update number of source values (It depends on the task)
    conf["masknet"].update({"n_src": train_set.n_src})

    model = DPRNNTasNet(**conf["filterbank"], **conf["masknet"])
    optimizer = make_optimizer(model.parameters(), **conf["optim"])
    # Define scheduler
    scheduler = None
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                      factor=0.5,
                                      patience=5)
    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    # Define Loss function.
    loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")
    system = System(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(checkpoint_dir,
                                 monitor="val_loss",
                                 mode="min",
                                 save_top_k=5,
                                 verbose=True)
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(
            EarlyStopping(monitor="val_loss",
                          mode="min",
                          patience=30,
                          verbose=True))

    # Don't ask GPU if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "ddp" if torch.cuda.is_available() else None
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        gradient_clip_val=conf["training"]["gradient_clipping"],
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)

    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()

    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
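
To reuse the exported weights elsewhere, a minimal sketch could be the following (it assumes asteroid's serialized format, which `BaseModel.from_pretrained` understands; the path mirrors the save above):

from asteroid.models import DPRNNTasNet

# Load the model saved via serialize() + torch.save above.
model = DPRNNTasNet.from_pretrained(os.path.join(exp_dir, "best_model.pth"))
model.eval()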
        for _, y in dl:
            labels = torch.cat((labels, y))
        counts = np.unique(labels.numpy(), return_counts=True)

        labels = torch.LongTensor([])
        for _, y in dl:
            labels = torch.cat((labels, y))
        weight = torch.ones((7, ))
        label, counts = torch.unique(labels, return_counts=True)
        weight[label] = weight[label] - counts / torch.sum(counts)

        model = MobileNetV2Lightning(num_classes=7,
                                     optimizer_args=optimizer_args,
                                     participant_name='1',
                                     weights=weight)
        trainer = pl.Trainer(max_epochs=3)
        trainer.fit(model, dl)
        models.append(model)

    from mlmi.clustering import flatten_model_parameter

    model_states = [m.state_dict() for m in models]
    keys = list(model_states[0].keys())
    model_parameter = np.array(
        [flatten_model_parameter(m, keys).numpy() for m in model_states],
        dtype=float)

    global_parameter = flatten_model_parameter(server.state_dict(),
                                               keys).cpu().numpy()
    euclidean_dist = np.array([
        ((model_parameter[participant_id] - global_parameter)**2).sum(axis=0)