def final_train(p, load=False):
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    train_idx, val_idx = date[date <= 450].index.values.tolist(), date[
        date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath='models/full_train', monitor="val_auc", mode='max',
        save_top_k=1, period=10)
    model = Classifier(input_size=input_size, output_size=output_size, params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(dataset,
                                     indexes={'train': train_idx, 'val': val_idx},
                                     batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_auc', patience=10,
                       min_delta=0.0005, mode='max')
    trainer = pl.Trainer(max_epochs=500, gpus=1,
                         callbacks=[checkpoint_callback, es], precision=16)
    trainer.fit(model, train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features
def train(cfg):
    early_stop_callback = EarlyStopping(monitor=cfg.train.early_stop.loss,
                                        mode=cfg.train.early_stop.mode,
                                        patience=cfg.train.early_stop.patience)
    writer = MlflowWriter(EXPERIMENT_NAME)
    t5_dialogue_model = T5DialoguePlModel.load_from_checkpoint(
        '../../../outputs/2021-05-14/15-47-46/lightning_logs/version_0/checkpoints/epoch=105-step=25227.ckpt',
        max_epochs=1000, strict=False, cfg=cfg, writer=writer)
    trainer = pl.Trainer(gpus=1, accumulate_grad_batches=8,
                         callbacks=[early_stop_callback])
    trainer.fit(t5_dialogue_model)
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/config.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/hydra.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), '.hydra/overrides.yaml'))
    writer.log_artifact(os.path.join(os.getcwd(), 'main.log'))
    writer.set_terminated()
def pytorch_model_with_callback(patience):
    mlflow.pytorch.autolog()
    model = IrisClassification()
    early_stopping = EarlyStopping(monitor="val_loss", mode="min",
                                   patience=patience, verbose=True)
    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd(), save_top_k=1, verbose=True,
        monitor="val_loss", mode="min", prefix=""
    )
    trainer = pl.Trainer(
        max_epochs=NUM_EPOCHS * 2,
        callbacks=[early_stopping],
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
    client = mlflow.tracking.MlflowClient()
    run = client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
    return trainer, run
def test_early_stop_callback(self):
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping

    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()

        early_stop_callback = EarlyStopping(monitor='val_loss',
                                            min_delta=0.00,
                                            patience=3,
                                            verbose=True,
                                            mode='max')
        callbacks = [early_stop_callback]

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                callbacks=callbacks)

            torch_model = torch_estimator.fit(df)

            # TODO: Find a way to pass log metrics from remote, and assert based on the logger.
            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def train_ae_model(data_dict):
    p = {'dim_1': 675,
         'dim_2': 400,
         'dim_3': 224,
         'hidden': 162,
         'activation': nn.ReLU,
         'dropout': 0.2916447561918717,
         'lr': 0.030272591341587315,
         'recon_loss_factor': 0.4447516076774931,
         'batch_size': 1252,
         'loss_sup_ae': nn.MSELoss,
         'loss_recon': nn.MSELoss,
         'embedding': True}
    train_idx = np.where(data_dict['era'] < 110)
    val_idx = np.where(data_dict['era'] > 110)
    p['input_size'] = len(data_dict['features'])
    p['output_size'] = 1
    dataset = utils.FinData(data=data_dict['data'],
                            target=data_dict['target'],
                            era=data_dict['era'])
    dataloaders = utils.create_dataloaders(dataset=dataset,
                                           indexes={'train': train_idx, 'val': val_idx},
                                           batch_size=p['batch_size'])
    model = SupAE(p)
    es = EarlyStopping(monitor='val_loss', patience=10,
                       min_delta=0.005, mode='min')
    trainer = pl.Trainer(max_epochs=100, gpus=1, callbacks=[es])
    trainer.fit(model, train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), './saved_models/trained/trained_ae.pth')
    return model
def train(
    backbone: str,
    checkpoint_path: Path,
    batch_size: int = 20,
    learning_rate: float = 1e-3,
    gpus: int = 1,
    resume_path: Optional[Path] = None,
    summarize: bool = False,
    auto_scale_batch: Optional[str] = None,
    auto_learning_rate: bool = False,
    stage: str = "train",
):
    Model = ModelClass[backbone]
    model = Model(
        batch_size=batch_size, learning_rate=learning_rate, want_summary=summarize
    )
    data_module = HDRDataModule()
    trainer = pl.Trainer(
        gpus=gpus,
        auto_lr_find=auto_learning_rate,
        auto_scale_batch_size=auto_scale_batch,
        checkpoint_callback=True,
        callbacks=[
            EarlyStopping(monitor="val_loss", patience=15),
            ModelCheckpoint(
                dirpath=Path(checkpoint_path) / backbone,
                save_last=True,
                monitor="val_loss",
            ),
        ],
        resume_from_checkpoint=resume_path,
    )
    if stage == "train":
        trainer.fit(model, datamodule=data_module)
    trainer.test()
def main(config):
    logger = config.get_logger('train')
    logger.info(config.log_dir)
    tb_logger = TensorBoardLogger(save_dir=config.log_dir)

    # setup data_loader instances
    data_loader = config.init_obj('data_loader', module_data)

    # get function handles of loss and metrics
    criterion = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build model architecture, then print to console
    model = config.init_obj('arch', module_arch, criterion=criterion,
                            metric_ftns=metrics, config=config)
    logger.info(model)

    early_stop_mode, early_stop_monitor = config['trainer']['monitor'].split(' ')
    early_stop_callback = EarlyStopping(
        monitor=early_stop_monitor,
        min_delta=0.00,
        patience=config['trainer']['early_stop'],
        verbose=False,
        mode=early_stop_mode)

    logger.info(f'Resume from file: {config.resume}')
    trainer = pl.Trainer(
        gpus=config['n_gpu'],
        logger=tb_logger,
        callbacks=[early_stop_callback],
        limit_train_batches=config['trainer']['train_batches'],
        limit_val_batches=config['trainer']['val_batches'],
        limit_test_batches=config['trainer']['test_batches'],
        default_root_dir=config['trainer']['save_dir'],
        resume_from_checkpoint=config.resume)
    trainer.fit(model, data_loader)
def train_from_scratch(names_list: List[str], hparams: DotMap):
    dsrc = get_dataset(names_list)
    dls = dsrc.dataloaders(after_item=after_item,
                           before_batch=pad_input_chunk_new,
                           bs=32, n_inp=2)
    # get the model
    model = RNN(hparams,
                char2tensor=str(dict(dls.numericalize.o2i)),
                vocab=str(dls.numericalize.vocab))
    checkpoint_callback = ModelCheckpoint(
        dirpath='./checkpoints',
        filename='{epoch}',
        save_top_k=3,
        monitor='val_loss',
        mode='min'
    )
    trainer = pl.Trainer(fast_dev_run=False,
                         auto_lr_find='learning_rate',
                         gpus=1,
                         callbacks=[EarlyStopping(monitor='val_loss', patience=5),
                                    checkpoint_callback])
    trainer.fit(model, dls.train, dls.valid)
    return trainer
def __init__(self, multilingualIndex, batch_size=128, nepochs=50,
             gpus=0, n_jobs=-1, patience=5, stored_path=None):
    """
    Init Bert model.
    :param multilingualIndex: MultilingualIndex, a dictionary of training and test
     documents indexed by language code.
    :param batch_size: int, number of samples per batch.
    :param nepochs: int, maximum number of epochs to train the model.
    :param gpus: int, how many GPUs to use per node. If 0, computation takes place on the CPU.
    :param patience: int, number of epochs with no improvement in val-macroF1 before early stopping.
    :param n_jobs: int, number of concurrent workers.
    :param stored_path: str, path to a pretrained model. If None, the model is trained from scratch.
    """
    super().__init__()
    self.multilingualIndex = multilingualIndex
    self.nepochs = nepochs
    self.gpus = gpus
    self.batch_size = batch_size
    self.n_jobs = n_jobs
    self.stored_path = stored_path
    self.model = self._init_model()
    self.patience = patience
    self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert',
                                    default_hp_metric=False)
    self.early_stop_callback = EarlyStopping(monitor='val-macroF1',
                                             min_delta=0.00,
                                             patience=self.patience,
                                             verbose=False, mode='max')
    # modify EarlyStopping's mode dict so the best score is compared with >=
    self.early_stop_callback.mode_dict['max'] = torch.ge
def test_pytorch_with_early_stopping_autolog_log_models_configuration_with(
        log_models, patience):
    mlflow.pytorch.autolog(log_models=log_models)
    model = IrisClassification()
    dm = IrisDataModule()
    dm.prepare_data()
    dm.setup(stage="fit")
    early_stopping = EarlyStopping(monitor="val_loss", mode="min",
                                   patience=patience, verbose=True)

    with TempDir() as tmp:
        keyword = "dirpath" if LooseVersion(
            pl.__version__) >= LooseVersion("1.2.0") else "filepath"
        checkpoint_callback = ModelCheckpoint(
            **{keyword: tmp.path()},
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
            prefix="",
        )
        trainer = pl.Trainer(
            max_epochs=NUM_EPOCHS * 2,
            callbacks=[early_stopping],
            checkpoint_callback=checkpoint_callback,
        )
        trainer.fit(model, dm)

    client = mlflow.tracking.MlflowClient()
    run = client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
    run_id = run.info.run_id
    client = mlflow.tracking.MlflowClient()
    artifacts = [f.path for f in client.list_artifacts(run_id)]
    assert ("restored_model_checkpoint" in artifacts) == log_models
def get_callbacks(config, dm):
    # callbacks
    early_stopping = EarlyStopping(monitor='_valid_level0Accuracy',
                                   mode="max",
                                   patience=10,
                                   verbose=True,
                                   check_finite=True)
    checkpoint_callback = ModelCheckpoint(
        monitor='_val_loss',
        dirpath=config.PATH_CHECKPOINT,
        filename='-{epoch:02d}-{val_loss:.6f}',
        mode="min",
        save_last=True,
        save_top_k=3,
    )
    learning_rate_monitor = LearningRateMonitor(logging_interval="epoch")
    accuracytest = AccuracyEnd(dm.test_dataloader())
    plt_latent_space = PlotLatentSpace(dm.test_dataloader())
    freeze_layers_name = config.freeze_layers_name
    freeze_layer_enum = FreezeLayersAvailable[freeze_layers_name.lower()]
    if freeze_layer_enum == FreezeLayersAvailable.none:
        callbacks = [
            accuracytest,
            learning_rate_monitor,
            early_stopping,
            plt_latent_space,
        ]
    else:
        freeze_layers = FreezeLayers(freeze_layer_enum)
        callbacks = [
            accuracytest,
            learning_rate_monitor,
            early_stopping,
            freeze_layers,
            plt_latent_space,
        ]
    return callbacks
def test_early_stop_callback(self):
    self.skipTest('There is a deadlock bug for the early stop callback. '
                  'Will add this test back when it is solved.')

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping

    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()

        early_stop_callback = EarlyStopping(monitor='val_loss',
                                            min_delta=0.00,
                                            patience=3,
                                            verbose=True,
                                            mode='max')
        callbacks = [early_stop_callback]

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                callbacks=callbacks)

            torch_model = torch_estimator.fit(df)

            # TODO: Find a way to pass log metrics from remote, and assert based on the logger.
            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def pytorch_model_with_callback(patience):
    mlflow.pytorch.autolog()
    model = IrisClassification()
    dm = IrisDataModule()
    dm.prepare_data()
    dm.setup(stage="fit")
    early_stopping = EarlyStopping(
        monitor="val_loss",
        mode="min",
        min_delta=99999999,  # forces early stopping
        patience=patience,
        verbose=True,
    )

    with TempDir() as tmp:
        keyword = "dirpath" if LooseVersion(
            pl.__version__) >= LooseVersion("1.2.0") else "filepath"
        checkpoint_callback = ModelCheckpoint(
            **{keyword: tmp.path()},
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
            prefix="",
        )
        trainer = pl.Trainer(
            max_epochs=NUM_EPOCHS * 2,
            callbacks=[early_stopping],
            checkpoint_callback=checkpoint_callback,
        )
        trainer.fit(model, dm)

    client = mlflow.tracking.MlflowClient()
    run = client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
    return trainer, run
def fit_actor_model_to_data(actor_model, transition_tensors: dict, hparams: dict):
    max_epochs = hparams["actor_model"]["max_epochs"]
    batch_size = hparams["actor_model"]["batch_size"]
    patience = hparams["actor_model"]["patience"]

    full_dataset = TransitionDataset(transition_tensors, ["state", "best_action"])
    train_dataloader, valid_dataloader = get_train_and_valid_dataloaders(
        full_dataset, batch_size)

    callbacks = [
        EarlyStopping(monitor="loss/valid", patience=patience),
    ]
    trainer = pl.Trainer(max_epochs=max_epochs,
                         callbacks=callbacks,
                         gpus=0,
                         checkpoint_callback=False,
                         logger=False)
    trainer.fit(actor_model, train_dataloader, valid_dataloader)
def create_callbacks(self, setting: SettingType) -> List[Callback]:
    """Create the PyTorch Lightning Callbacks for this Setting.

    These callbacks will get added to the Trainer in `create_trainer`.

    Parameters
    ----------
    setting : SettingType
        The `Setting` on which this Method is going to be applied.

    Returns
    -------
    List[Callback]
        A list of `Callback` objects to use during training.
    """
    # TODO: Move this to something like a `configure_callbacks` method in the model,
    # once PL adds it.
    # from sequoia.common.callbacks.vae_callback import SaveVaeSamplesCallback
    return [
        EarlyStopping(monitor="val Loss")
        # self.hparams.knn_callback,
        # SaveVaeSamplesCallback(),
    ]
def train(hparams):
    rdm = RetinalDataModule()
    model = get_model(hparams)
    logger = TensorBoardLogger('logs', name=get_exp_name(hparams),
                               default_hp_metric=False)
    # log hparams to tensorboard
    logger.log_hyperparams(hparams, {
        'train_acc': 0,
        'train_f1': 0,
        'train_loss': 0,
        'valid_acc': 0,
        'valid_f1': 0,
        'valid_loss': 0,
    })
    trainer = pl.Trainer(gpus=1,
                         min_epochs=50,
                         max_epochs=hparams['n_epochs'],
                         logger=logger,
                         callbacks=[
                             EarlyStopping(monitor='valid_loss', patience=10, mode='min'),
                             ModelCheckpoint(monitor='valid_loss')
                         ])
    trainer.fit(model, rdm)
            weight_decay=0.0002097517651377327)
        return optimizer


transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
dataset = datasets.ImageFolder('./train_sorted', transform)

esc = EarlyStopping(
    min_delta=0.00,
    patience=1,
    verbose=False,
    monitor='val_loss',
    mode='min',
)

# Cross-validation
# batch_size = 128
# n_trainval = int(len(dataset) * 0.8)
# n_test = len(dataset) - n_trainval
# trainval, test = torch.utils.data.random_split(dataset, [n_trainval, n_test])
# test_loader = torch.utils.data.DataLoader(test, batch_size, shuffle=False, num_workers=16)
# net = ClassNet()
# kf = KFold(n_splits=5, shuffle=True)
def main(args: DictConfig):
    # Distributed training
    torch.multiprocessing.set_sharing_strategy('file_system')
    if str(args.exp.gpus) == '-1':
        args.exp.gpus = torch.cuda.device_count()

    # Secondary data args
    args.data.setting = 'in-topic' if args.data.test_id is None else 'cross-topic'
    dataset_name = args.data.path.split('/')[1]
    args.data.path = f'{ROOT_PATH}/{args.data.path}'

    # MLflow logging
    if args.exp.logging:
        experiment_name = f'{dataset_name}/{args.setting}-{args.data.setting}/{args.exp.task_name}'
        mlf_logger = MLFlowLogger(experiment_name=experiment_name,
                                  tracking_uri=MLFLOW_URI)
        experiment = mlf_logger._mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is not None:
            experiment_id = experiment.experiment_id
            if args.exp.check_exisisting_hash:
                args.hash = calculate_hash(args)
                existing_runs = mlf_logger._mlflow_client.search_runs(
                    filter_string=f"params.hash = '{args.hash}'",
                    run_view_type=mlflow.tracking.client.ViewType.ACTIVE_ONLY,
                    experiment_ids=[experiment_id])
                if len(existing_runs) > 0:
                    logger.info('Skipping existing run.')
                    return
                else:
                    logger.info('No runs found - performing one.')
        # cpnt_path = f'{ROOT_PATH}/mlruns/{experiment_id}/{run_id}/artifacts'
        # else:
        #     cpnt_path = None

    # Load pretrained model and tokenizer
    set_seed(args)
    model = instantiate(args.lightning_module, args=args)
    logger.info(f'Run arguments: \n{args.pretty()}')

    # Early stopping & checkpointing
    early_stop_callback = EarlyStopping(min_delta=0.00,
                                        patience=args.exp.early_stopping_patience,
                                        verbose=False,
                                        mode='min')
    checkpoint_callback = CustomModelCheckpoint(
        model=model,
        verbose=True,
        mode='min',
        save_top_k=1,
        period=0 if args.exp.val_check_interval < 1.0 else 1)
    lr_logging_callback = LearningRateLogger(logging_interval='epoch')

    # Training
    trainer = Trainer(
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.max_epochs,
        gradient_clip_val=args.optimizer.max_grad_norm,
        early_stop_callback=early_stop_callback,
        val_check_interval=args.exp.val_check_interval,
        checkpoint_callback=checkpoint_callback if args.exp.checkpoint else None,
        accumulate_grad_batches=args.exp.gradient_accumulation_steps,
        auto_lr_find=args.optimizer.auto_lr_find,
        precision=args.exp.precision,
        distributed_backend='dp',
        callbacks=[lr_logging_callback])
    trainer.fit(model)
    trainer.test(model)

    # Clean the CUDA cache
    torch.cuda.empty_cache()

    # End the run
    if args.exp.logging:
        mlf_logger.finalize()
def cli_main():
    parser = ArgumentParser()
    parser.add_argument("--DATA_PATH", type=str,
                        help="path to folders with images to train on.")
    parser.add_argument("--VAL_PATH", type=str, default=None,
                        help="path to validation folders with images")
    parser.add_argument("--model", type=str,
                        help="model to initialize. Can accept a model checkpoint or just an encoder name from models.py")
    parser.add_argument("--batch_size", default=128, type=int,
                        help="batch size for SSL")
    parser.add_argument("--cpus", default=1, type=int,
                        help="number of cpus to use to fetch data")
    parser.add_argument("--hidden_dim", default=128, type=int,
                        help="hidden dimensions in projection head or classification layer for finetuning")
    parser.add_argument("--epochs", default=400, type=int,
                        help="number of epochs to train model")
    parser.add_argument("--learning_rate", default=1e-3, type=float,
                        help="learning rate for encoder")
    parser.add_argument("--patience", default=-1, type=int,
                        help="automatically cuts off training if validation does not drop for (patience) epochs. Leave blank to have no validation-based early stopping.")
    parser.add_argument("--val_split", default=0.2, type=float,
                        help="percent in validation data. Ignored if VAL_PATH specified")
    parser.add_argument("--withhold_split", default=0, type=float,
                        help="decimal from 0-1 representing how much of the training data to withhold from either training or validation. Used for experimenting with labels needed")
    parser.add_argument("--gpus", default=1, type=int,
                        help="number of gpus to use for training")
    parser.add_argument("--log_name", type=str, default=None,
                        help="name of model to log on wandb and locally")
    parser.add_argument("--image_size", default=256, type=int,
                        help="height of square image")
    parser.add_argument("--resize", default=False, type=bool,
                        help="Pre-resize data to the right shape to reduce cuda memory requirements of reading large images")
    parser.add_argument("--technique", default=None, type=str,
                        help="SIMCLR, SIMSIAM or CLASSIFIER")
    parser.add_argument("--seed", default=1729, type=int,
                        help="random seed for run for reproducibility")

    # add ability to parse unknown args
    args, _ = parser.parse_known_args()
    technique = supported_techniques[args.technique]
    args, _ = technique.add_model_specific_args(parser).parse_known_args()

    # logging
    wandb_logger = None
    log_name = args.technique + '_' + args.log_name + '.ckpt'
    if log_name is not None:
        wandb_logger = WandbLogger(name=log_name, project='Curator')

    # resize images here
    if args.resize:
        # implement resize and modify args.DATA_PATH accordingly
        pass

    # Splitting data into train and validation
    if (not (os.path.isdir(f"{args.DATA_PATH}/train")
             and os.path.isdir(f"{args.DATA_PATH}/val"))
            and args.val_split != 0 and args.VAL_PATH is None):
        print(colored('Automatically splitting data into train and validation data...', 'blue'))
        shutil.rmtree(f'./split_data_{log_name[:-5]}', ignore_errors=True)
        splitfolders.ratio(args.DATA_PATH,
                           output=f'./split_data_{log_name[:-5]}',
                           ratio=(1 - args.val_split - args.withhold_split,
                                  args.val_split, args.withhold_split),
                           seed=args.seed)
        args.DATA_PATH = f'./split_data_{log_name[:-5]}/train'
        args.VAL_PATH = f'./split_data_{log_name[:-5]}/val'

    model = load_model(args)
    print(colored("Model architecture successfully loaded", 'blue'))

    cbs = []
    backend = 'ddp'
    if args.patience > 0:
        cb = EarlyStopping('val_loss', patience=args.patience)
        cbs.append(cb)

    trainer = pl.Trainer(gpus=args.gpus,
                         max_epochs=args.epochs,
                         progress_bar_refresh_rate=20,
                         callbacks=cbs,
                         distributed_backend=f'{backend}' if args.gpus > 1 else None,
                         sync_batchnorm=True if args.gpus > 1 else False,
                         logger=wandb_logger,
                         enable_pl_optimizer=True)
    trainer.fit(model)

    Path("./models/").mkdir(parents=True, exist_ok=True)
    trainer.save_checkpoint(f"./models/{log_name}")
    print(colored("YOUR MODEL CAN BE ACCESSED AT: ", 'blue'), f"./models/{log_name}")
def get_early_stopping_callback():
    return EarlyStopping(
        monitor='train_loss',
        patience=40,
        mode='min',
    )
            pin_memory=True,
            num_workers=4,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            PAWS_X("x-final/ko/dev_2k.tsv", "ko_KR", "ko_KR", 128),
            num_workers=4,
            batch_size=4,
            pin_memory=True,
        )


if __name__ == "__main__":
    # trainer = pl.Trainer(gpus=None)
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[
            EarlyStopping(monitor="val_loss"),
            ModelCheckpoint(
                monitor="val_loss",
                filename="paraphrase_mbart_{epoch:02d}-{val_loss:.2f}",
                save_top_k=1,
                mode="min",
            ),
        ],
    )
    model = BartForSeq2SeqLM("ko_KR", "ko_KR")
    trainer.fit(model)
def train_model(args):
    # do not run this test for pytorch lightning below the min supported version
    import pytorch_lightning as pl
    if LooseVersion(pl.__version__) < LooseVersion(MIN_PL_VERSION):
        print("Skip test for pytorch_lightning=={}, min supported version is {}".format(
            pl.__version__, MIN_PL_VERSION))
        return

    # Initialize SparkSession
    conf = SparkConf().setAppName('pytorch_spark_mnist').set('spark.sql.shuffle.partitions', '16')
    if args.master:
        conf.setMaster(args.master)
    elif args.num_proc:
        conf.setMaster('local[{}]'.format(args.num_proc))
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Setup our store for intermediate data
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(LightningModule):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = x.float().reshape((-1, 1, 28, 28))
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            return F.log_softmax(x, -1)

        def configure_optimizers(self):
            return optim.SGD(self.parameters(), lr=0.01, momentum=0.5)

        def training_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"training data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('train_loss', loss)
            return loss

        def validation_step(self, batch, batch_idx):
            if batch_idx == 0:
                print(f"validation data batch size: {batch['label'].shape}")
            x, y = batch['features'], batch['label']
            y_hat = self(x)
            loss = F.nll_loss(y_hat, y.long())
            self.log('val_loss', loss)

        def validation_epoch_end(self, outputs):
            avg_loss = (torch.stack([x['val_loss'] for x in outputs]).mean()
                        if len(outputs) > 0 else float('inf'))
            self.log('avg_val_loss', avg_loss)

    model = Net()

    # Train a Horovod Spark Estimator on the DataFrame
    backend = SparkBackend(num_proc=args.num_proc,
                           stdout=sys.stdout, stderr=sys.stderr,
                           prefix_output_with_timestamp=True)

    from pytorch_lightning.callbacks import Callback

    epochs = args.epochs

    class MyDummyCallback(Callback):
        def __init__(self):
            self.epcoh_end_counter = 0
            self.train_epcoh_end_counter = 0
            self.validation_epoch_end_counter = 0

        def on_init_start(self, trainer):
            print('Starting to init trainer!')

        def on_init_end(self, trainer):
            print('Trainer is initialized.')

        def on_epoch_end(self, trainer, model):
            print('A train or eval epoch ended.')
            self.epcoh_end_counter += 1

        def on_train_epoch_end(self, trainer, model, unused=None):
            print('A train epoch ended.')
            self.train_epcoh_end_counter += 1

        def on_validation_epoch_end(self, trainer, model, unused=None):
            print('A val epoch ended.')
            self.validation_epoch_end_counter += 1

        def on_train_end(self, trainer, model):
            print("Training ends:"
                  f"epcoh_end_counter={self.epcoh_end_counter}, "
                  f"train_epcoh_end_counter={self.train_epcoh_end_counter}, "
                  f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n")
            assert self.train_epcoh_end_counter <= epochs
            assert self.epcoh_end_counter == \
                self.train_epcoh_end_counter + self.validation_epoch_end_counter

    callbacks = [MyDummyCallback()]

    # added EarlyStopping and ModelCheckpoint
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    callbacks.append(ModelCheckpoint(monitor='val_loss', mode="min",
                                     save_top_k=1, verbose=True))

    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   min_delta=0.001,
                                   patience=3,
                                   verbose=True,
                                   mode='min'))

    torch_estimator = hvd.TorchEstimator(backend=backend,
                                         store=store,
                                         model=model,
                                         input_shapes=[[-1, 1, 28, 28]],
                                         feature_cols=['features'],
                                         label_cols=['label'],
                                         batch_size=args.batch_size,
                                         epochs=args.epochs,
                                         validation=0.1,
                                         verbose=1,
                                         callbacks=callbacks,
                                         profiler="simple" if args.enable_profiler else None)

    torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = torch_model.transform(test_df)

    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
cp_save_dir = os.path.join(os.getcwd(), "CKP", model_file_name)

logger = TensorBoardLogger(save_dir=tb_save_dir, name=model_file_name)

checkpoint_callback = ModelCheckpoint(filepath=cp_save_dir,
                                      save_top_k=1,
                                      verbose=True,
                                      monitor='loss_val',
                                      mode='min')

early_stop_callback = EarlyStopping(monitor='loss_val', verbose=True, mode='min')

trainer = pl.Trainer(gpus=1,
                     max_epochs=hparams["max_epochs"],
                     weights_summary=None,
                     logger=logger,
                     checkpoint_callback=checkpoint_callback,
                     callbacks=[early_stop_callback])
trainer.fit(model, train_dataloader, val_dataloader)

print("Best Model Path", checkpoint_callback.best_model_path)
best_model_path = checkpoint_callback.best_model_path
print(trainer.test(model, test_dataloaders=test_dataloader))

model = model.to('cpu')
features="coco-bottomup-36", dir_data=args.dir_data, min_ans_occ=9, ) model = SimpleVQAModel( answers=vqa_train.answers, train_dataset=vqa_train, val_dataset=vqa_val, dir_data=args.dir_data, ) trainer = pl.Trainer( gpus=1, default_root_dir=args.root_dir, max_epochs=50, callbacks=[EarlyStopping(monitor="Accuracy/val_acc_overall")], ) trainer.fit( model, DataLoader( vqa_train, batch_size=args.batch_size, collate_fn=VQA2.collate_fn, num_workers=args.num_workers, shuffle=True, ), DataLoader( vqa_val, batch_size=args.batch_size, collate_fn=VQA2.collate_fn,
# The model here should be constructed in the script according to the passed config
# (including the model type). Most of the models accept a `sample_rate` parameter for
# encoders, which is important (the default is 16000; override it).
# model = DCUNet("DCUNet-20", fix_length_mode="trim", sample_rate=SAMPLE_RATE)
model = ConvTasNet(n_src=1)

checkpoint = ModelCheckpoint(
    filename='{epoch:02d}-{val_loss:.2f}',
    monitor="val_loss",
    mode="min",
    save_top_k=5,
    verbose=True
)

optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=REDUCE_LR_PATIENCE)
early_stopping = EarlyStopping(monitor='val_loss', patience=EARLY_STOP_PATIENCE)

# We probably also need to subclass `System` in order to log the target metrics
# (PESQ/STOI) on the validation set.
system = System(model, optimizer, sisdr_loss_wrapper, train_loader, val_loader, scheduler)

# The log dir and model name are also part of the config, of course
LOG_DIR = 'logs'
logger = pl_loggers.TensorBoardLogger(LOG_DIR, name='TIMIT-drones-ConvTasNet-random', version=1)

# Choose the proper accelerator for JADE, probably `ddp`
# (also, `auto_select_gpus=True` might be useful)
trainer = Trainer(max_epochs=MAX_EPOCHS,
                  gpus=-1,
                  logger=logger,
                  callbacks=[early_stopping, checkpoint],
                  deterministic=True,
                  gradient_clip_val=5.0)

trainer.fit(system)
torch.save(model.serialize(), 'conv_tasnet_model.pt')
    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.batch_size)


if __name__ == "__main__":
    logger = TensorBoardLogger("lightning_logs", name="image_only")
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=5000,
                                        patience=7,
                                        verbose=False,
                                        mode="min")
    model = LitClassifier()
    trainer = pl.Trainer(gpus=1, logger=logger,
                         early_stop_callback=early_stop_callback)

    lr_finder = trainer.lr_find(model)
    fig = lr_finder.plot(suggest=True, show=True)
    new_lr = lr_finder.suggestion()
    print(new_lr)
    model.hparams.lr = new_lr

    trainer.fit(model)
args = parser.parse_args()
dict_args = vars(args)

if "accelerator" in dict_args:
    if dict_args["accelerator"] == "None":
        dict_args["accelerator"] = None

model = LightningMNISTClassifier(**dict_args)

dm = MNISTDataModule(**dict_args)
dm.setup(stage="fit")

early_stopping = EarlyStopping(
    monitor=dict_args["es_monitor"],
    mode=dict_args["es_mode"],
    verbose=dict_args["es_verbose"],
    patience=dict_args["es_patience"],
)

checkpoint_callback = ModelCheckpoint(dirpath=os.getcwd(),
                                      save_top_k=1,
                                      verbose=True,
                                      monitor="val_loss",
                                      mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer.from_argparse_args(
    args,
    callbacks=[lr_logger, early_stopping, checkpoint_callback],
    checkpoint_callback=True)
trainer.fit(model, dm)
def test_v1_6_0_early_stopping_monitor(tmpdir):
    with pytest.deprecated_call(
        match=r"The `EarlyStopping\(monitor\)` argument will be required starting in v1.6."
        " For backward compatibility, setting this to `early_stop_on`."
    ):
        EarlyStopping()
#     'train_batch_size': 64,  # configurable
#     'eval_batch_size': 64  # configurable
# })
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=args.output_dir,
                                                   prefix="checkpoint",
                                                   monitor="val_loss",
                                                   mode="min",
                                                   save_top_k=5)
early_stop_callback = EarlyStopping(monitor='val_loss',
                                    min_delta=0.00,
                                    patience=3,
                                    verbose=False,
                                    mode='min')

# -------------------------- sanity check conll -------------------------- #
tokenizer = T5Tokenizer.from_pretrained(args.tokenizer_name_or_path)  # t5-base | t5-small
dataset = MyDataset(tokenizer, args.data_dir, 'val', max_len=args.max_seq_length)
print('Length of dataset is {}'.format(len(dataset)))
data = dataset[0]
print(tokenizer.decode(data['source_ids'], skip_special_tokens=True))
def run_movielens_example(epochs: int = 20, gpus: int = 0) -> None:
    """Retrieve and split data, train and evaluate a model, and save it.

    From the terminal, you can run this script with:

    .. code-block:: bash

        python collie/movielens/run.py --epochs 20

    Parameters
    ----------
    epochs: int
        Number of epochs for model training
    gpus: int
        Number of gpus to train on

    """
    t = Timer()

    t.timecheck(' 1.0 - retrieving MovieLens 100K dataset')
    df = read_movielens_df(decrement_ids=True)
    t.timecheck(' 1.0 complete')

    t.timecheck(' 2.0 - splitting data')
    df_imp = convert_to_implicit(df)
    interactions = Interactions(users=df_imp['user_id'],
                                items=df_imp['item_id'],
                                allow_missing_ids=True)
    train, val, test = stratified_split(interactions, val_p=0.1, test_p=0.1)
    train_loader = InteractionsDataLoader(train, batch_size=1024, shuffle=True)
    val_loader = InteractionsDataLoader(val, batch_size=1024, shuffle=False)
    t.timecheck(' 2.0 complete')

    t.timecheck(' 3.0 - training the model')
    model = MatrixFactorizationModel(train=train_loader,
                                     val=val_loader,
                                     dropout_p=0.05,
                                     loss='adaptive',
                                     lr=5e-2,
                                     embedding_dim=10,
                                     optimizer='adam',
                                     weight_decay=1e-7)
    trainer = CollieTrainer(model=model,
                            gpus=gpus,
                            max_epochs=epochs,
                            deterministic=True,
                            logger=False,
                            checkpoint_callback=False,
                            callbacks=[EarlyStopping(monitor='val_loss_epoch', mode='min')],
                            weights_summary='full',
                            terminate_on_nan=True)
    trainer.fit(model)
    model.eval()
    t.timecheck('\n 3.0 complete')

    t.timecheck(' 4.0 - evaluating model')
    auc_score, mrr_score, mapk_score = evaluate_in_batches([auc, mrr, mapk],
                                                           test,
                                                           model,
                                                           k=10)
    print(f'AUC: {auc_score}')
    print(f'MRR: {mrr_score}')
    print(f'MAP@10: {mapk_score}')
    t.timecheck(' 4.0 complete')

    t.timecheck(' 5.0 - saving model')
    absolute_data_path = DATA_PATH / 'fitted_model'
    model.save_model(absolute_data_path)
    t.timecheck(' 5.0 complete')