Example #1
def final_train(p, load=False):
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    train_idx = date[date <= 450].index.values.tolist()
    val_idx = date[date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath='models/full_train',
        monitor="val_auc",
        mode='max',
        save_top_k=1,
        period=10)
    model = Classifier(input_size=input_size,
                       output_size=output_size,
                       params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(dataset,
                                     indexes={
                                         'train': train_idx,
                                         'val': val_idx
                                     },
                                     batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_auc',
                       patience=10,
                       min_delta=0.0005,
                       mode='max')
    trainer = pl.Trainer(max_epochs=500,
                         gpus=1,
                         callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features
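Example #1 relies on a user-defined init_weights helper that is not shown. A minimal sketch of what such a helper could look like, assuming Kaiming initialization keyed on the activation name passed above; the body is an assumption, not the original implementation:

import torch.nn as nn

def init_weights(m, nonlinearity):
    # Hypothetical helper: Kaiming-initialize linear layers for ReLU-family
    # activations and leave every other module type untouched.
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, nonlinearity=nonlinearity)
        if m.bias is not None:
            nn.init.zeros_(m.bias)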
Example #2
def train(cfg):
    early_stop_callback = EarlyStopping(monitor=cfg.train.early_stop.loss,
                                        mode=cfg.train.early_stop.mode,
                                        patience=cfg.train.early_stop.patience)
    writer = MlflowWriter(EXPERIMENT_NAME)
    t5_dialogue_model = T5DialoguePlModel.load_from_checkpoint(
        '../../../outputs/2021-05-14/15-47-46/lightning_logs/version_0/checkpoints/epoch=105-step=25227.ckpt',
        max_epochs=1000,
        strict=False,
        cfg=cfg,
        writer=writer)
    trainer = pl.Trainer(gpus=1,
                         accumulate_grad_batches=8,
                         callbacks=[early_stop_callback])
    trainer.fit(t5_dialogue_model)
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/config.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/hydra.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/overrides.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), 'main.log'))
    writer.set_terminated()
Example #3
def pytorch_model_with_callback(patience):
    mlflow.pytorch.autolog()
    model = IrisClassification()
    early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=patience, verbose=True)

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd(), save_top_k=1, verbose=True, monitor="val_loss", mode="min", prefix=""
    )

    trainer = pl.Trainer(
        max_epochs=NUM_EPOCHS * 2,
        callbacks=[early_stopping],
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)

    client = mlflow.tracking.MlflowClient()
    run = client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)

    return trainer, run
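A hedged usage sketch for the helper above; which metric keys appear depends on the mlflow and pytorch-lightning versions, so the inspection is deliberately generic:

trainer, run = pytorch_model_with_callback(patience=3)
print(run.data.metrics)  # inspect whatever metrics autolog captured for this run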
Example #4
    def test_early_stop_callback(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        from pytorch_lightning.callbacks.early_stopping import EarlyStopping

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            early_stop_callback = EarlyStopping(monitor='val_loss',
                                                min_delta=0.00,
                                                patience=3,
                                                verbose=True,
                                                mode='max')
            callbacks = [early_stop_callback]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #5
def train_ae_model(data_dict):
    p = {
        'dim_1': 675,
        'dim_2': 400,
        'dim_3': 224,
        'hidden': 162,
        'activation': nn.ReLU,
        'dropout': 0.2916447561918717,
        'lr': 0.030272591341587315,
        'recon_loss_factor': 0.4447516076774931,
        'batch_size': 1252,
        'loss_sup_ae': nn.MSELoss,
        'loss_recon': nn.MSELoss,
        'embedding': True
    }
    train_idx = np.where(data_dict['era'] < 110)
    val_idx = np.where(data_dict['era'] > 110)
    p['input_size'] = len(data_dict['features'])
    p['output_size'] = 1
    dataset = utils.FinData(data=data_dict['data'],
                            target=data_dict['target'],
                            era=data_dict['era'])
    dataloaders = utils.create_dataloaders(dataset=dataset,
                                           indexes={
                                               'train': train_idx,
                                               'val': val_idx
                                           },
                                           batch_size=p['batch_size'])
    model = SupAE(p)
    es = EarlyStopping(monitor='val_loss',
                       patience=10,
                       min_delta=0.005,
                       mode='min')
    trainer = pl.Trainer(max_epochs=100, gpus=1, callbacks=[es])
    trainer.fit(model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), f'./saved_models/trained/trained_ae.pth')
    return model
Example #6
def train(
    backbone: str,
    checkpoint_path: Path,
    batch_size: int = 20,
    learning_rate: float = 1e-3,
    gpus: int = 1,
    resume_path: Optional[Path] = None,
    summarize: bool = False,
    auto_scale_batch: Optional[str] = None,
    auto_learning_rate: bool = False,
    stage: str = "train",
):
    Model = ModelClass[backbone]
    model = Model(
        batch_size=batch_size, learning_rate=learning_rate, want_summary=summarize
    )

    data_module = HDRDataModule()

    trainer = pl.Trainer(
        gpus=gpus,
        auto_lr_find=auto_learning_rate,
        auto_scale_batch_size=auto_scale_batch,
        checkpoint_callback=True,
        callbacks=[
            EarlyStopping(monitor="val_loss", patience=15),
            ModelCheckpoint(
                dirpath=Path(checkpoint_path) / backbone,
                save_last=True,
                monitor="val_loss",
            ),
        ],
        resume_from_checkpoint=resume_path,
    )

    if stage == "train":
        trainer.fit(model, datamodule=data_module)

    trainer.test()
Example #7
def main(config):
    logger = config.get_logger('train')
    logger.info(config.log_dir)
    tb_logger = TensorBoardLogger(save_dir=config.log_dir)

    # setup data_loader instances
    data_loader = config.init_obj('data_loader', module_data)

    # get function handles of loss and metrics
    criterion = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build model architecture, then print to console
    model = config.init_obj('arch',
                            module_arch,
                            criterion=criterion,
                            metric_ftns=metrics,
                            config=config)
    logger.info(model)

    early_stop_mode, early_stop_monitor = config['trainer']['monitor'].split(
        ' ')
    early_stop_callback = EarlyStopping(
        monitor=early_stop_monitor,
        min_delta=0.00,
        patience=config['trainer']['early_stop'],
        verbose=False,
        mode=early_stop_mode)
    logger.info(f'Resume from file: {config.resume}')
    trainer = pl.Trainer(
        gpus=config['n_gpu'],
        logger=tb_logger,
        callbacks=[early_stop_callback],
        limit_train_batches=config['trainer']['train_batches'],
        limit_val_batches=config['trainer']['val_batches'],
        limit_test_batches=config['trainer']['test_batches'],
        default_root_dir=config['trainer']['save_dir'],
        resume_from_checkpoint=config.resume)
    trainer.fit(model, data_loader)
Example #8
def train_from_scratch(names_list: List[str], hparams: DotMap):
    dsrc = get_dataset(names_list)
    dls = dsrc.dataloaders(after_item=after_item, before_batch=pad_input_chunk_new, bs=32, n_inp=2)

    # get the model
    model = RNN(hparams, char2tensor=str(dict(dls.numericalize.o2i)), vocab=str(dls.numericalize.vocab))
    checkpoint_callback = ModelCheckpoint(
        dirpath='./checkpoints',
        filename='{epoch}',
        save_top_k=3,
        monitor='val_loss',
        mode='min'
    )
    trainer = pl.Trainer(fast_dev_run=False,
                         auto_lr_find='learning_rate',
                         gpus=1,
                         callbacks=[EarlyStopping(monitor='val_loss', patience=5), checkpoint_callback])

    trainer.fit(model, dls.train, dls.valid)

    return trainer
Example #9
    def __init__(self,
                 multilingualIndex,
                 batch_size=128,
                 nepochs=50,
                 gpus=0,
                 n_jobs=-1,
                 patience=5,
                 stored_path=None):
        """
        Init Bert model
        :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
        indexed by language code.
        :param batch_size: int, number of samples per batch.
        :param nepochs: int, number of max epochs to train the model.
        :param gpus: int, specifies how many GPUs to use per node. If 0, computation takes place on the CPU.
        :param patience: int, number of epochs with no improvements in val-macroF1 before early stopping.
        :param n_jobs: int, number of concurrent workers.
        :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
        """
        super().__init__()
        self.multilingualIndex = multilingualIndex
        self.nepochs = nepochs
        self.gpus = gpus
        self.batch_size = batch_size
        self.n_jobs = n_jobs
        self.stored_path = stored_path
        self.model = self._init_model()
        self.patience = patience
        self.logger = TensorBoardLogger(save_dir='tb_logs',
                                        name='bert',
                                        default_hp_metric=False)
        self.early_stop_callback = EarlyStopping(monitor='val-macroF1',
                                                 min_delta=0.00,
                                                 patience=self.patience,
                                                 verbose=False,
                                                 mode='max')

        # modify EarlyStopping's class-level 'max' operator so that scores >= the best count as an improvement
        self.early_stop_callback.mode_dict['max'] = torch.ge
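The patched comparison above changes how ties are scored. A minimal sketch of the effect, assuming a pytorch_lightning version whose EarlyStopping keeps its operators in mode_dict ({'min': torch.lt, 'max': torch.gt} by default):

import torch

best = torch.tensor(0.80)
current = torch.tensor(0.80)
print(torch.gt(current, best))  # tensor(False): the default 'max' operator counts a tie as no improvement
print(torch.ge(current, best))  # tensor(True): the patched operator lets a tie reset the patience counter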
Example #10
def test_pytorch_with_early_stopping_autolog_log_models_configuration_with(
        log_models, patience):
    mlflow.pytorch.autolog(log_models=log_models)
    model = IrisClassification()
    dm = IrisDataModule()
    dm.prepare_data()
    dm.setup(stage="fit")
    early_stopping = EarlyStopping(monitor="val_loss",
                                   mode="min",
                                   patience=patience,
                                   verbose=True)

    with TempDir() as tmp:
        keyword = "dirpath" if LooseVersion(
            pl.__version__) >= LooseVersion("1.2.0") else "filepath"
        checkpoint_callback = ModelCheckpoint(
            **{keyword: tmp.path()},
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
            prefix="",
        )

        trainer = pl.Trainer(
            max_epochs=NUM_EPOCHS * 2,
            callbacks=[early_stopping],
            checkpoint_callback=checkpoint_callback,
        )
        trainer.fit(model, dm)

        client = mlflow.tracking.MlflowClient()
        run = client.get_run(
            client.list_run_infos(experiment_id="0")[0].run_id)
    run_id = run.info.run_id
    client = mlflow.tracking.MlflowClient()
    artifacts = [f.path for f in client.list_artifacts(run_id)]
    assert ("restored_model_checkpoint" in artifacts) == log_models
Example #11
def get_callbacks(config, dm):
    #callbacks

    early_stopping = EarlyStopping(monitor='_valid_level0Accuracy',
                                   mode="max",
                                   patience=10,
                                   verbose=True,
                                   check_finite=True)

    checkpoint_callback = ModelCheckpoint(
        monitor='_val_loss',
        dirpath=config.PATH_CHECKPOINT,
        filename='-{epoch:02d}-{val_loss:.6f}',
        mode="min",
        save_last=True,
        save_top_k=3,
    )
    learning_rate_monitor = LearningRateMonitor(logging_interval="epoch")

    accuracytest = AccuracyEnd(dm.test_dataloader())
    plt_latent_space = PlotLatentSpace(dm.test_dataloader())
    freeze_layers_name = config.freeze_layers_name
    freeze_layer_enum = FreezeLayersAvailable[freeze_layers_name.lower()]
    if freeze_layer_enum == FreezeLayersAvailable.none:
        callbacks = [
            accuracytest,
            learning_rate_monitor,
            early_stopping,
            plt_latent_space,
        ]
    else:
        freeze_layers = FreezeLayers(freeze_layer_enum)
        callbacks = [
            accuracytest, learning_rate_monitor, early_stopping, freeze_layers,
            plt_latent_space
        ]

    return callbacks
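A hypothetical call site for get_callbacks; config, dm, and model are assumed to be the same objects the snippet above works with:

import pytorch_lightning as pl

trainer = pl.Trainer(gpus=1, max_epochs=100, callbacks=get_callbacks(config, dm))
trainer.fit(model, dm)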
Example #12
    def test_early_stop_callback(self):
        self.skipTest('There is a deadlock bug for early stop call back. ' +
                      'Will add this test back when it is solved.')

        from pytorch_lightning.callbacks.early_stopping import EarlyStopping

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            early_stop_callback = EarlyStopping(monitor='val_loss',
                                                min_delta=0.00,
                                                patience=3,
                                                verbose=True,
                                                mode='max')
            callbacks = [early_stop_callback]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #13
def pytorch_model_with_callback(patience):
    mlflow.pytorch.autolog()
    model = IrisClassification()
    dm = IrisDataModule()
    dm.prepare_data()
    dm.setup(stage="fit")
    early_stopping = EarlyStopping(
        monitor="val_loss",
        mode="min",
        min_delta=99999999,  # forces early stopping
        patience=patience,
        verbose=True,
    )

    with TempDir() as tmp:
        keyword = "dirpath" if LooseVersion(
            pl.__version__) >= LooseVersion("1.2.0") else "filepath"
        checkpoint_callback = ModelCheckpoint(
            **{keyword: tmp.path()},
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
            prefix="",
        )

        trainer = pl.Trainer(
            max_epochs=NUM_EPOCHS * 2,
            callbacks=[early_stopping],
            checkpoint_callback=checkpoint_callback,
        )
        trainer.fit(model, dm)

        client = mlflow.tracking.MlflowClient()
        run = client.get_run(
            client.list_run_infos(experiment_id="0")[0].run_id)

    return trainer, run
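The min_delta=99999999 trick above works because EarlyStopping only counts an improvement when the monitored value beats the best score by at least min_delta, so an impossibly large threshold guarantees stopping once patience epochs pass. A simplified sketch of the mode='min' test:

import torch

min_delta = 99999999.0
best, current = torch.tensor(0.50), torch.tensor(0.01)
improved = torch.lt(current, best - min_delta)  # simplified form of the mode='min' improvement check
print(improved)  # tensor(False): nothing ever improves, so stopping fires after `patience` epochs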
Example #14
def fit_actor_model_to_data(actor_model, transition_tensors: dict,
                            hparams: dict):
    max_epochs = hparams["actor_model"]["max_epochs"]
    batch_size = hparams["actor_model"]["batch_size"]
    patience = hparams["actor_model"]["patience"]

    full_dataset = TransitionDataset(transition_tensors,
                                     ["state", "best_action"])

    train_dataloader, valid_dataloader = get_train_and_valid_dataloaders(
        full_dataset, batch_size)

    callbacks = [
        EarlyStopping(monitor="loss/valid", patience=patience),
    ]

    trainer = pl.Trainer(max_epochs=max_epochs,
                         callbacks=callbacks,
                         gpus=0,
                         checkpoint_callback=False,
                         logger=False)

    trainer.fit(actor_model, train_dataloader, valid_dataloader)
Example #15
    def create_callbacks(self, setting: SettingType) -> List[Callback]:
        """Create the PytorchLightning Callbacks for this Setting.

        These callbacks will get added to the Trainer in `create_trainer`.

        Parameters
        ----------
        setting : SettingType
            The `Setting` on which this Method is going to be applied.

        Returns
        -------
        List[Callback]
            A List of `Callaback` objects to use during training.
        """
        # TODO: Move this to something like a `configure_callbacks` method in the model,
        # once PL adds it.
        # from sequoia.common.callbacks.vae_callback import SaveVaeSamplesCallback
        return [
            EarlyStopping(monitor="val Loss")
            # self.hparams.knn_callback,
            # SaveVaeSamplesCallback(),
        ]
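For context, a hedged sketch of the create_trainer counterpart the docstring mentions; only the method name comes from the source, the body is illustrative:

import pytorch_lightning as pl

def create_trainer(self, setting: SettingType) -> pl.Trainer:
    # Illustrative only: the callbacks built above are handed to the Trainer here.
    return pl.Trainer(max_epochs=10, callbacks=self.create_callbacks(setting))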
Example #16
def train(hparams):
    rdm = RetinalDataModule()

    model = get_model(hparams)
    logger = TensorBoardLogger('logs', name=get_exp_name(hparams), default_hp_metric=False)
    # log hparams to tensorboard
    logger.log_hyperparams(hparams, {
        'train_acc': 0,
        'train_f1': 0,
        'train_loss': 0,
        'valid_acc': 0,
        'valid_f1': 0,
        'valid_loss': 0,
        })
    trainer = pl.Trainer(gpus=1,
                        min_epochs=50,
                        max_epochs=hparams['n_epochs'],
                        logger=logger,
                        callbacks=[
                            EarlyStopping(monitor='valid_loss', patience=10, mode='min'),
                            ModelCheckpoint(monitor='valid_loss')
                        ])
    trainer.fit(model, rdm)
Example #17
                                    weight_decay=0.0002097517651377327)
        return optimizer


transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

dataset = datasets.ImageFolder('./train_sorted', transform)

esc = EarlyStopping(
    min_delta=0.00,
    patience=1,
    verbose=False,
    monitor='val_loss',
    mode='min',
)

# Cross-validation
# batch_size = 128

# n_trainval = int(len(dataset) * 0.8)
# n_test = len(dataset) - n_trainval
# trainval, test = torch.utils.data.random_split(dataset, [n_trainval, n_test])
# test_loader = torch.utils.data.DataLoader(test, batch_size, shuffle=False, num_workers=16)

# net = ClassNet()

# kf = KFold(n_splits=5, shuffle=True)
Example #18
def main(args: DictConfig):
    # Distributed training
    torch.multiprocessing.set_sharing_strategy('file_system')
    if str(args.exp.gpus) == '-1':
        args.exp.gpus = torch.cuda.device_count()

    # Secondary data args
    args.data.setting = 'in-topic' if args.data.test_id is None else 'cross-topic'
    dataset_name = args.data.path.split('/')[1]
    args.data.path = f'{ROOT_PATH}/{args.data.path}'

    # MlFlow Logging
    if args.exp.logging:
        experiment_name = f'{dataset_name}/{args.setting}-{args.data.setting}/{args.exp.task_name}'
        mlf_logger = MLFlowLogger(experiment_name=experiment_name,
                                  tracking_uri=MLFLOW_URI)
        experiment = mlf_logger._mlflow_client.get_experiment_by_name(
            experiment_name)
        if experiment is not None:
            experiment_id = experiment.experiment_id

            if args.exp.check_exisisting_hash:
                args.hash = calculate_hash(args)
                existing_runs = mlf_logger._mlflow_client.search_runs(
                    filter_string=f"params.hash = '{args.hash}'",
                    run_view_type=mlflow.tracking.client.ViewType.ACTIVE_ONLY,
                    experiment_ids=[experiment_id])
                if len(existing_runs) > 0:
                    logger.info('Skipping existing run.')
                    return
                else:
                    logger.info('No runs found - performing one.')

    #     cpnt_path = f'{ROOT_PATH}/mlruns/{experiment_id}/{run_id}/artifacts'
    # else:
    #     cpnt_path = None

    # Load pretrained model and tokenizer
    set_seed(args)
    model = instantiate(args.lightning_module, args=args)
    logger.info(f'Run arguments: \n{args.pretty()}')

    # Early stopping & Checkpointing
    early_stop_callback = EarlyStopping(
        min_delta=0.00,
        patience=args.exp.early_stopping_patience,
        verbose=False,
        mode='min')
    checkpoint_callback = CustomModelCheckpoint(
        model=model,
        verbose=True,
        mode='min',
        save_top_k=1,
        period=0 if args.exp.val_check_interval < 1.0 else 1)
    lr_logging_callback = LearningRateLogger(logging_interval='epoch')

    # Training
    trainer = Trainer(
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.max_epochs,
        gradient_clip_val=args.optimizer.max_grad_norm,
        early_stop_callback=early_stop_callback,
        val_check_interval=args.exp.val_check_interval,
        checkpoint_callback=checkpoint_callback
        if args.exp.checkpoint else None,
        accumulate_grad_batches=args.exp.gradient_accumulation_steps,
        auto_lr_find=args.optimizer.auto_lr_find,
        precision=args.exp.precision,
        distributed_backend='dp',
        callbacks=[lr_logging_callback])
    trainer.fit(model)
    trainer.test(model)

    # Cleaning cache
    torch.cuda.empty_cache()

    # Ending the run
    if args.exp.logging:
        mlf_logger.finalize()
Example #19
def cli_main():

    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH",
                        type=str,
                        help="path to folders with images to train on.")
    parser.add_argument("--VAL_PATH",
                        type=str,
                        default=None,
                        help="path to validation folders with images")
    parser.add_argument(
        "--model",
        type=str,
        help=
        "model to initialize. Can accept model checkpoint or just encoder name from models.py"
    )
    parser.add_argument("--batch_size",
                        default=128,
                        type=int,
                        help="batch size for SSL")
    parser.add_argument("--cpus",
                        default=1,
                        type=int,
                        help="number of cpus to use to fetch data")
    parser.add_argument(
        "--hidden_dim",
        default=128,
        type=int,
        help=
        "hidden dimensions in projection head or classification layer for finetuning"
    )
    parser.add_argument("--epochs",
                        default=400,
                        type=int,
                        help="number of epochs to train model")
    parser.add_argument("--learning_rate",
                        default=1e-3,
                        type=float,
                        help="learning rate for encoder")
    parser.add_argument(
        "--patience",
        default=-1,
        type=int,
        help=
        "automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation based early stopping."
    )
    parser.add_argument(
        "--val_split",
        default=0.2,
        type=float,
        help="percent in validation data. Ignored if VAL_PATH specified")
    parser.add_argument(
        "--withhold_split",
        default=0,
        type=float,
        help=
        "decimal from 0-1 representing how much of the training data to withold from either training or validation. Used for experimenting with labels neeeded"
    )
    parser.add_argument("--gpus",
                        default=1,
                        type=int,
                        help="number of gpus to use for training")
    parser.add_argument("--log_name",
                        type=str,
                        default=None,
                        help="name of model to log on wandb and locally")
    parser.add_argument("--image_size",
                        default=256,
                        type=int,
                        help="height of square image")
    parser.add_argument(
        "--resize",
        default=False,
        type=bool,
        help=
        "Pre-Resize data to right shape to reduce cuda memory requirements of reading large images"
    )
    parser.add_argument("--technique",
                        default=None,
                        type=str,
                        help="SIMCLR, SIMSIAM or CLASSIFIER")
    parser.add_argument("--seed",
                        default=1729,
                        type=int,
                        help="random seed for run for reproducibility")

    #add ability to parse unknown args
    args, _ = parser.parse_known_args()
    technique = supported_techniques[args.technique]
    args, _ = technique.add_model_specific_args(parser).parse_known_args()

    #logging
    wandb_logger = None
    log_name = f'{args.technique}_{args.log_name}.ckpt'
    if args.log_name is not None:
        wandb_logger = WandbLogger(name=log_name, project='Curator')

    #resize images here
    if args.resize:
        #implement resize and modify args.DATA_PATH accordingly
        pass

    #Splitting Data into train and validation
    if not (os.path.isdir(f"{args.DATA_PATH}/train")
            and os.path.isdir(f"{args.DATA_PATH}/val")
            ) and args.val_split != 0 and args.VAL_PATH is None:
        print(
            colored(
                f'Automatically splitting data into train and validation data...',
                'blue'))
        shutil.rmtree(f'./split_data_{log_name[:-5]}', ignore_errors=True)
        splitfolders.ratio(args.DATA_PATH,
                           output=f'./split_data_{log_name[:-5]}',
                           ratio=(1 - args.val_split - args.withhold_split,
                                  args.val_split, args.withhold_split),
                           seed=args.seed)
        args.DATA_PATH = f'./split_data_{log_name[:-5]}/train'
        args.VAL_PATH = f'./split_data_{log_name[:-5]}/val'

    model = load_model(args)
    print(colored("Model architecture successfully loaded", 'blue'))

    cbs = []
    backend = 'ddp'

    if args.patience > 0:
        cb = EarlyStopping('val_loss', patience=args.patience)
        cbs.append(cb)

    trainer = pl.Trainer(
        gpus=args.gpus,
        max_epochs=args.epochs,
        progress_bar_refresh_rate=20,
        callbacks=cbs,
        distributed_backend=backend if args.gpus > 1 else None,
        sync_batchnorm=args.gpus > 1,
        logger=wandb_logger,
        enable_pl_optimizer=True)
    trainer.fit(model)

    Path(f"./models/").mkdir(parents=True, exist_ok=True)
    trainer.save_checkpoint(f"./models/{log_name}")
    print(colored("YOUR MODEL CAN BE ACCESSED AT: ", 'blue'),
          f"./models/{log_name}")
Example #20
def get_early_stopping_callback():
    return EarlyStopping(
        monitor='train_loss',
        patience=40,
        mode='min',
    )
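A hypothetical way to wire the factory above into a run; the model variable is assumed:

import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs=200, callbacks=[get_early_stopping_callback()])
trainer.fit(model)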
Example #21
            pin_memory=True,
            num_workers=4,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            PAWS_X("x-final/ko/dev_2k.tsv", "ko_KR", "ko_KR", 128),
            num_workers=4,
            batch_size=4,
            pin_memory=True,
        )


if __name__ == "__main__":
    # trainer = pl.Trainer(gpus=None)
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[
            EarlyStopping(monitor="val_loss"),
            ModelCheckpoint(
                monitor="val_loss",
                filename="paraphrase_mbart_{epoch:02d}-{val_loss:.2f}",
                save_top_k=1,
                mode="min",
            ),
        ]
    )
    model = BartForSeq2SeqLM("ko_KR", "ko_KR")
    trainer.fit(model)
Example #22
def train_model(args):
    # do not run this test for pytorch lightning below min supported verson
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_ligthning=={}, min support version is {}".format(pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() if len(outputs) > 0 else float('inf')
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epoch_end_counter = 0
            self.train_epoch_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epoch_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epoch_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print("Training ends: "
                  f"epoch_end_counter={self.epoch_end_counter}, "
                  f"train_epoch_end_counter={self.train_epoch_end_counter}, "
                  f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n")
            assert self.train_epoch_end_counter <= epochs
            assert self.epoch_end_counter == self.train_epoch_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # added EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(monitor='val_loss', mode="min",
                                     save_top_k=1, verbose=True))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   verbose=True,
                                   mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple" if args.enable_profiler else None)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred', labelCol='label', metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
Example #23
cp_save_dir = os.path.join(os.getcwd(), "CKP", model_file_name)


logger = TensorBoardLogger(
    save_dir=tb_save_dir,
    name=model_file_name
)

checkpoint_callback = ModelCheckpoint(
    filepath=cp_save_dir,
    save_top_k=1,
    verbose=True,
    monitor='loss_val',
    mode='min'
)

early_stop_callback = EarlyStopping(monitor='loss_val', verbose=True, mode='min')


trainer = pl.Trainer(gpus=1, max_epochs=hparams["max_epochs"], weights_summary=None,
                     logger=logger, checkpoint_callback=checkpoint_callback, callbacks=[early_stop_callback])

trainer.fit(model, train_dataloader, val_dataloader)

print("Best Model Path", checkpoint_callback.best_model_path)
best_model_path = checkpoint_callback.best_model_path

print(trainer.test(model, test_dataloaders=test_dataloader))

model = model.to('cpu')
        features="coco-bottomup-36",
        dir_data=args.dir_data,
        min_ans_occ=9,
    )

    model = SimpleVQAModel(
        answers=vqa_train.answers,
        train_dataset=vqa_train,
        val_dataset=vqa_val,
        dir_data=args.dir_data,
    )
    trainer = pl.Trainer(
        gpus=1,
        default_root_dir=args.root_dir,
        max_epochs=50,
        callbacks=[EarlyStopping(monitor="Accuracy/val_acc_overall")],
    )

    trainer.fit(
        model,
        DataLoader(
            vqa_train,
            batch_size=args.batch_size,
            collate_fn=VQA2.collate_fn,
            num_workers=args.num_workers,
            shuffle=True,
        ),
        DataLoader(
            vqa_val,
            batch_size=args.batch_size,
            collate_fn=VQA2.collate_fn,
Example #25
# the model here should be constructed in the script accordingly to the passed config (including the model type)
# most of the models accept `sample_rate` parameter for encoders, which is important (default is 16000, override)
#model = DCUNet("DCUNet-20", fix_length_mode="trim", sample_rate=SAMPLE_RATE)
model = ConvTasNet(n_src=1)
checkpoint = ModelCheckpoint(
        filename='{epoch:02d}-{val_loss:.2f}',
        monitor="val_loss",
        mode="min",
        save_top_k=5,
        verbose=True
    )

optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=REDUCE_LR_PATIENCE)
early_stopping = EarlyStopping(monitor='val_loss', patience=EARLY_STOP_PATIENCE)

# Probably we also need to subclass `System`, in order to log the target metrics on the validation set (PESQ/STOI)
system = System(model, optimizer, sisdr_loss_wrapper, train_loader, val_loader, scheduler)

# log dir and model name are also part of the config, of course
LOG_DIR = 'logs'
logger = pl_loggers.TensorBoardLogger(LOG_DIR, name='TIMIT-drones-ConvTasNet-random', version=1)

# choose the proper accelerator for JADE, probably `ddp` (also, `auto_select_gpus=True` might be useful)
trainer = Trainer(max_epochs=MAX_EPOCHS, gpus=-1,
                  logger=logger, callbacks=[early_stopping, checkpoint], deterministic=True, gradient_clip_val=5.0,)

trainer.fit(system)

torch.save(model.serialize(), 'conv_tasnet_model.pt')
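One way to act on the "subclass System" comment above, as a minimal sketch: it assumes asteroid's System forwards constructor arguments unchanged and that torchmetrics provides the STOI metric used here; the class name and batch layout are illustrative.

from asteroid.engine.system import System
from torchmetrics.audio import ShortTimeObjectiveIntelligibility

class MetricsSystem(System):
    def __init__(self, *args, sample_rate=16000, **kwargs):
        super().__init__(*args, **kwargs)
        self.stoi = ShortTimeObjectiveIntelligibility(fs=sample_rate)

    def validation_step(self, batch, batch_nb):
        mix, clean = batch
        est = self(mix)
        loss = self.loss_func(est, clean)
        self.log('val_loss', loss)                   # keeps the EarlyStopping/checkpoint monitors intact
        self.log('val_stoi', self.stoi(est, clean))  # extra target metric on the validation set
        return loss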
Example #26
    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.batch_size)


if __name__ == "__main__":
    logger = TensorBoardLogger("lightning_logs", name="image_only")
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=5000,
                                        patience=7,
                                        verbose=False,
                                        mode="min")

    model = LitClassifier()
    trainer = pl.Trainer(gpus=1,
                         logger=logger,
                         early_stop_callback=early_stop_callback)

    lr_finder = trainer.lr_find(model)
    fig = lr_finder.plot(suggest=True, show=True)
    new_lr = lr_finder.suggestion()
    print(new_lr)
    model.hparams.lr = new_lr

    trainer.fit(model)
Example #27
    args = parser.parse_args()
    dict_args = vars(args)

    if "accelerator" in dict_args:
        if dict_args["accelerator"] == "None":
            dict_args["accelerator"] = None

    model = LightningMNISTClassifier(**dict_args)

    dm = MNISTDataModule(**dict_args)
    dm.setup(stage="fit")

    early_stopping = EarlyStopping(
        monitor=dict_args["es_monitor"],
        mode=dict_args["es_mode"],
        verbose=dict_args["es_verbose"],
        patience=dict_args["es_patience"],
    )

    checkpoint_callback = ModelCheckpoint(dirpath=os.getcwd(),
                                          save_top_k=1,
                                          verbose=True,
                                          monitor="val_loss",
                                          mode="min")
    lr_logger = LearningRateMonitor()

    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=[lr_logger, early_stopping, checkpoint_callback],
        checkpoint_callback=True)
    trainer.fit(model, dm)
Example #28
def test_v1_6_0_early_stopping_monitor(tmpdir):
    with pytest.deprecated_call(
            match=
            r"The `EarlyStopping\(monitor\)` argument will be required starting in v1.6."
            " For backward compatibility, setting this to `early_stop_on`."):
        EarlyStopping()
Example #29
    #     'train_batch_size': 64,  # configurable
    #     'eval_batch_size': 64  # configurable
    # })

    args = argparse.Namespace(**args_dict)

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=args.output_dir,
        prefix="checkpoint",
        monitor="val_loss",
        mode="min",
        save_top_k=5)

    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00,
                                        patience=3,
                                        verbose=False,
                                        mode='min')

    # -------------------------- sanity check conll -------------------------- #
    tokenizer = T5Tokenizer.from_pretrained(
        args.tokenizer_name_or_path)  # t5-base | t5-small

    dataset = MyDataset(tokenizer,
                        args.data_dir,
                        'val',
                        max_len=args.max_seq_length)

    print('Length of dataset is {}'.format(len(dataset)))
    data = dataset[0]
    print(tokenizer.decode(data['source_ids'], skip_special_tokens=True))
Example #30
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """
    Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie/movielens/run.py --epochs 20

    Parameters
    ----------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    t = Timer()

    t.timecheck('  1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck('  1.0 complete')

    t.timecheck('  2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck('  2.0 complete')

    t.timecheck('  3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(model=model,
                            gpus=gpus,
                            max_epochs=epochs,
                            deterministic=True,
                            logger=False,
                            checkpoint_callback=False,
                            callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
                            weights_summary='full',
                            terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n  3.0 complete')

    t.timecheck('  4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk], test, model, k=10)
    print(f'AUC:          {auc_score}')
    print(f'MRR:          {mrr_score}')
    print(f'MAP@10:       {mapk_score}')
    t.timecheck('  4.0 complete')

    t.timecheck('  5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck('  5.0 complete')