def train_mnist(config,
                data_dir=None,
                num_epochs=10,
                num_workers=1,
                use_gpu=False,
                callbacks=None):
    """Train a ``LightningMNISTClassifier`` using the Ray plugin.

    MNIST is downloaded once per node (guarded by a file lock) via the
    plugin's ``init_hook`` before any worker starts training.
    """

    def download_data():
        # Only one process per node should perform the download.
        from filelock import FileLock
        with FileLock(os.path.join(data_dir, ".lock")):
            MNISTDataModule(data_dir=data_dir).prepare_data()

    model = LightningMNISTClassifier(config, data_dir)
    ray_plugin = RayPlugin(
        num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        callbacks=callbacks or [],
        progress_bar_refresh_rate=0,
        plugins=[ray_plugin])
    datamodule = MNISTDataModule(
        data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
    trainer.fit(model, datamodule)
def test_distributed_sampler(tmpdir, ray_start_2_cpus):
    """Tests if distributed sampler is properly set."""
    model = BoringModel()
    # Before training, the plain dataloader must not be distributed yet.
    sampler_before = model.train_dataloader().sampler
    assert not isinstance(sampler_before, DistributedSampler)

    class DistributedSamplerCallback(Callback):
        """Asserts sampler configuration at the start of each loop."""

        def _verify(self, trainer, sampler, shuffled):
            # Common checks: type, shuffle flag, replica count, rank.
            assert isinstance(sampler, DistributedSampler)
            if shuffled:
                assert sampler.shuffle
            else:
                assert not sampler.shuffle
            assert sampler.num_replicas == 2
            assert sampler.rank == trainer.global_rank

        def on_train_start(self, trainer, pl_module):
            self._verify(trainer, trainer.train_dataloader.sampler, True)

        def on_validation_start(self, trainer, pl_module):
            self._verify(trainer, trainer.val_dataloaders[0].sampler, False)

        def on_test_start(self, trainer, pl_module):
            self._verify(trainer, trainer.test_dataloaders[0].sampler, False)

    plugin = RayPlugin(num_workers=2)
    trainer = get_trainer(
        tmpdir, plugins=[plugin], callbacks=[DistributedSamplerCallback()])
    trainer.fit(model)
def cli_main():
    """Command-line entry point: parse args, then train and test LitMNIST
    on a Ray cluster."""
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser.add_argument("--ray_accelerator_num_workers", type=int, default=4)
    parser.add_argument(
        "--ray_accelerator_cpus_per_worker", type=int, default=1)
    parser.add_argument("--use_gpu", type=bool, default=False)
    parser = LitMNIST.add_model_specific_args(parser)
    parser = MNISTDataModule.add_argparse_args(parser)
    args = parser.parse_args()

    # Connect to the running cluster and report its size.
    ray.init(address='auto')
    print('''This cluster consists of {} nodes in total {} CPU resources in total '''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))

    # ------------
    # data
    # ------------
    dm = MNISTDataModule(
        data_dir='',
        val_split=5000,
        num_workers=1,
        normalize=False,
        seed=42,
        batch_size=32)

    # ------------
    # model
    # ------------
    model = LitMNIST(args.hidden_dim, args.learning_rate)

    # ------------
    # training
    # ------------
    plugin = RayPlugin(
        num_workers=args.ray_accelerator_num_workers,
        num_cpus_per_worker=args.ray_accelerator_cpus_per_worker,
        use_gpu=args.use_gpu)
    trainer = pl.Trainer(
        gpus=args.gpus,
        precision=args.precision,
        plugins=[plugin],
        max_epochs=args.max_epochs)
    trainer.fit(model, datamodule=dm)

    # ------------
    # testing
    # ------------
    result = trainer.test(model, datamodule=dm)
    pprint(result)
def test_multi_node(tmpdir):
    """Tests if multi-node GPU training works.

    Launches one Ray worker per GPU available in the cluster.
    """
    ray.init("auto")
    # ray.available_resources() reports quantities as floats; RayPlugin's
    # num_workers must be an integer, so convert explicitly.
    num_gpus = int(ray.available_resources()["GPU"])
    model = BoringModel()
    plugin = RayPlugin(num_workers=num_gpus, use_gpu=True)
    trainer = get_trainer(tmpdir, plugins=[plugin], use_gpu=True)
    train_test(trainer, model)
def trainable_args_update_method(self, trainable_args: dict) -> dict:
    """Attach Ray Lightning settings to the trainable arguments.

    Mutates ``trainable_args`` in place (environment hyperparameters plus a
    configured ``RayPlugin``) and returns it.
    """
    from ray_lightning import RayPlugin

    hyperparameters = trainable_args['hyperparameters']
    hyperparameters['env.num_gpus'] = self.gpu_per_job
    hyperparameters['env.num_workers'] = self.cpu_per_job
    # num_nodes is not needed by ray lightning. Setting it to default, which is 1.
    hyperparameters['env.num_nodes'] = 1
    trainable_args['_ray_lightning_plugin'] = RayPlugin(
        num_workers=self.num_workers,
        num_cpus_per_worker=self.cpu_per_worker,
        use_gpu=self.gpu_per_job is not None,
    )
    return trainable_args
def test_unused_parameters(tmpdir, ray_start_2_cpus):
    """Tests if find_unused_parameters is properly passed to model."""
    model = BoringModel()

    class UnusedParameterCallback(Callback):
        def on_train_start(self, trainer, pl_module):
            # The flag must have propagated to the wrapped DDP model.
            assert trainer.model.find_unused_parameters is False

    plugin = RayPlugin(
        num_workers=2, use_gpu=False, find_unused_parameters=False)
    trainer = get_trainer(
        tmpdir, plugins=[plugin], callbacks=[UnusedParameterCallback()])
    trainer.fit(model)
def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
    """Tests whether the appropriate number of training actors are created."""
    model = BoringModel()

    def assert_actor_count():
        # One Ray actor per configured worker should exist mid-training.
        assert len(ray.actors()) == num_workers

    model.on_epoch_end = assert_actor_count
    plugin = RayPlugin(num_workers=num_workers)
    trainer = get_trainer(tmpdir, plugins=[plugin])
    trainer.fit(model)
    # Every training actor must be torn down once fit() returns.
    dead_state = ray.gcs_utils.ActorTableData.DEAD
    assert all(
        info["State"] == dead_state for info in ray.actors().values())
def test_model_to_gpu(tmpdir, ray_start_2_gpus):
    """Tests if model is placed on CUDA device."""
    model = BoringModel()

    class CheckGPUCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # Any parameter being on CUDA implies the module was moved.
            assert next(pl_module.parameters()).is_cuda

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    trainer = get_trainer(
        tmpdir,
        plugins=[plugin],
        use_gpu=True,
        callbacks=[CheckGPUCallback()])
    trainer.fit(model)
def train_mnist(config,
                data_dir=None,
                num_epochs=10,
                num_workers=1,
                use_gpu=False,
                callbacks=None):
    """Train an ``MNISTClassifier`` with Ray-distributed workers."""
    model = MNISTClassifier(config, data_dir)
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=use_gpu)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=int(use_gpu),
        callbacks=callbacks or [],
        plugins=[ray_plugin])
    trainer.fit(model)
def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
    """Tests if trained model has high accuracy on test set."""
    config = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    model = LightningMNISTClassifier(config, tmpdir)
    datamodule = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
    trainer = get_trainer(
        tmpdir,
        limit_train_batches=20,
        max_epochs=1,
        plugins=[RayPlugin(num_workers=num_workers, use_gpu=False)])
    predict_test(trainer, model, datamodule)
def test_early_stop(tmpdir, ray_start_2_cpus):
    """Tests if early stopping callback works correctly."""
    model = BoringModel()
    early_stop = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
    plugin = RayPlugin(num_workers=1, use_gpu=False)
    # max_epochs is deliberately huge; early stopping must end training.
    trainer = get_trainer(
        tmpdir,
        max_epochs=500,
        plugins=[plugin],
        callbacks=[early_stop],
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        progress_bar_refresh_rate=1)
    trainer.fit(model)
    trained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)
    # With patience=2, training should have stopped after 2 val epochs.
    assert trained_model.val_epoch == 2, trained_model.val_epoch
def test_correct_devices(tmpdir, ray_start_2_gpus):
    """Tests if GPU devices are correctly set."""
    model = BoringModel()

    class CheckDevicesCallback(Callback):
        def on_epoch_end(self, trainer, pl_module):
            # Each worker sees exactly one visible GPU, so its root is 0.
            assert trainer.root_gpu == 0
            assert int(os.environ["CUDA_VISIBLE_DEVICES"]) == \
                trainer.local_rank
            assert trainer.root_gpu == pl_module.device.index
            assert torch.cuda.current_device() == trainer.root_gpu

    plugin = RayPlugin(num_workers=2, use_gpu=True)
    # Wrap the plugin in a list for consistency with every other call site
    # in this file (previously passed bare).
    trainer = get_trainer(
        tmpdir,
        plugins=[plugin],
        use_gpu=True,
        callbacks=[CheckDevicesCallback()])
    trainer.fit(model)
def train_mnist(config,
                checkpoint_dir=None,
                data_dir=None,
                num_epochs=10,
                num_workers=1,
                use_gpu=False,
                callbacks=None,
                **trainer_kwargs):
    """Train an ``MNISTClassifier`` with the Ray plugin.

    Extra keyword arguments are forwarded verbatim to ``pl.Trainer``.
    """
    model = MNISTClassifier(config, data_dir)
    ray_plugin = RayPlugin(num_workers=num_workers, use_gpu=use_gpu)
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        callbacks=callbacks or [],
        plugins=[ray_plugin],
        **trainer_kwargs)
    trainer.fit(model)
def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
                        num_workers):
    """Runs the predict test through a Ray client connection."""
    # The fixture must have established a client session.
    assert ray.util.client.ray.is_connected()
    config = {
        "layer_1": 32,
        "layer_2": 32,
        "lr": 1e-2,
        "batch_size": 32,
    }
    model = LightningMNISTClassifier(config, tmpdir)
    datamodule = MNISTDataModule(
        data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
    trainer = get_trainer(
        tmpdir,
        limit_train_batches=20,
        max_epochs=1,
        plugins=[RayPlugin(num_workers=num_workers, use_gpu=False)])
    predict_test(trainer, model, datamodule)
def test_checkpoint_ddp_gpu(tmpdir, ray_start_4_cpus):
    """Tests if Tune checkpointing works with RayAccelerator."""
    # NOTE(review): despite the name, this runs on CPU (use_gpu=False).
    checkpoint_test(tmpdir, RayPlugin(num_workers=2, use_gpu=False))
def test_tune_iteration_ddp(tmpdir, ray_start_4_cpus):
    """Tests if each RayPlugin runs the correct number of iterations."""
    tune_test(tmpdir, RayPlugin(num_workers=2, use_gpu=False))
def main(args: Namespace) -> None:
    """Controller for training or running inference with the autoencoder.

    Builds a ``QuackTokenizedDataModule`` and a ``QuackAutoEncoder``
    (optionally restored from ``--checkpoint_path``), distributes work with
    ``ray_lightning.RayPlugin``, and logs to Comet.

    When ``--encode`` is set, runs the prediction loop with an
    ``AutoencoderWriter`` that streams batch outputs to ``--storage_path``;
    otherwise runs the training loop with LR monitoring, checkpointing and
    early stopping, followed by a test pass.

    Parameters
    ----------
    args: Namespace
        Command line arguments. Notable options: --data_dir, --batch_size,
        --num_workers, --embed_size, --hidden_size, --encode, --filtered,
        --evaluate, --checkpoint_path, --storage_path, --l_rate,
        --l_rate_min, --l_rate_max_epoch, --exp_label, --ray_nodes.

    Returns
    -------
    void
    """
    data = QuackTokenizedDataModule(
        args.data_dir, batch_size=args.batch_size, workers=args.num_workers)
    # Max value of static is from the ipv4 segments.
    max_index = 256 + QuackConstants.VOCAB.value
    model = QuackAutoEncoder(
        num_embeddings=max_index,
        embed_size=args.embed_size,
        hidden_size=args.hidden_size,
        max_decode_length=data.get_width(),
        learning_rate=args.l_rate,
        learning_rate_min=args.l_rate_min,
        lr_max_epochs=args.l_rate_max_epoch)
    if args.checkpoint_path is not None:
        # Restore weights; LR hyperparameters are re-supplied from the CLI.
        model = QuackAutoEncoder.load_from_checkpoint(
            args.checkpoint_path,
            learning_rate=args.l_rate,
            learning_rate_min=args.l_rate_min,
            lr_max_epochs=args.l_rate_max_epoch)
    ray_plugin = RayPlugin(
        num_workers=args.ray_nodes,
        num_cpus_per_worker=1,
        use_gpu=False,
        find_unused_parameters=False)
    date_time = strftime("%d %b %Y %H:%M", gmtime())
    device_logger = DeviceStatsMonitor()
    checkpoint_storage = Path(args.storage_path)
    checkpoint_storage.mkdir(parents=True, exist_ok=True)
    # API configuration for comet: https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
    comet_logger = CometLogger(
        project_name="censored-planet",
        experiment_name=f'{args.exp_label}: {date_time}',
    )
    if args.encode:
        # Derive a censored/uncensored reduction ratio from stored metadata,
        # falling back to 1 when the file or keys are missing.
        source_meta = Path(args.data_dir + '/metadata.pyc')
        try:
            with source_meta.open(mode='rb') as retrieved_dict:
                source_metadata = pickle.load(retrieved_dict)
            reduction_factor = (
                source_metadata['censored'] / source_metadata['uncensored'])
        except (OSError, KeyError):
            reduction_factor = 1
        writer_callback = AutoencoderWriter(
            write_interval='batch',
            storage_path=args.storage_path,
            filtered=args.filtered,
            evaluate=args.evaluate,
            reduction_threshold=reduction_factor)
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            strategy=ray_plugin,
            callbacks=[writer_callback, device_logger])
        model.freeze()
        print('Ready for inference...')
        trainer.predict(model, datamodule=data, return_predictions=False)
        return
    else:
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        checkpoint_callback = ModelCheckpoint(
            monitor="val_loss",
            save_top_k=3,
            save_last=True,
            mode='min',
            every_n_train_steps=2000,
            auto_insert_metric_name=True,
            filename='autoenc_checkpoint_{epoch:02d}-{step}-{val_loss:02.2f}',
            dirpath=checkpoint_storage)
        early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            patience=10,
            stopping_threshold=200,
            check_finite=True,  # Stops training if the monitored metric becomes NaN or infinite.
        )
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            callbacks=[
                early_stopping_callback, checkpoint_callback, device_logger,
                lr_monitor
            ],
            plugins=[ray_plugin],
            weights_save_path=checkpoint_storage)
        print('Ready for training...')
        trainer.fit(model, datamodule=data)
        print('Post fit testing...')
        trainer.test(model, datamodule=data)
def test_load(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if model checkpoint can be loaded."""
    model = BoringModel()
    trainer = get_trainer(
        tmpdir, plugins=[RayPlugin(num_workers=num_workers, use_gpu=False)])
    load_test(trainer, model)
def test_train(tmpdir, ray_start_2_cpus, num_workers):
    """Tests if training modifies model weights."""
    model = BoringModel()
    trainer = get_trainer(
        tmpdir, plugins=[RayPlugin(num_workers=num_workers)])
    train_test(trainer, model)
# Load the red-wine quality dataset and split it into train/test.
dataset = WineQualityDataset("winequality-red.csv")
train, test = dataset.get_splits()

# Data loaders for each split.
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=False)

# Build the model and train it with Ray-distributed workers.
model = WineQualityModel(11)
start = time.time()
plugin = RayPlugin(num_workers=6)
trainer = Trainer(max_steps=1000, plugins=[plugin])
trainer.fit(model, train_dl)
print(f"Build model in {time.time() - start}")
print(model)

# Evaluate on the held-out split, collecting predictions and targets.
predictions, actuals = [], []
for i, (inputs, targets) in enumerate(test_dl):
    # Run the model on this batch.
    yhat = model(inputs)
    # Convert predictions and targets to numpy.
    yhat = yhat.detach().numpy()
    actual = targets.numpy()
    actual = actual.reshape((len(actual), 1))
    # Round to class values.
    yhat = yhat.round()
def test_train_client(tmpdir, start_ray_client_server_2_cpus, num_workers):
    """Runs the training test through a Ray client connection."""
    # The fixture must have established a client session.
    assert ray.util.client.ray.is_connected()
    model = BoringModel()
    trainer = get_trainer(
        tmpdir, plugins=[RayPlugin(num_workers=num_workers)])
    train_test(trainer, model)
def main(args: Namespace) -> None:
    """Controller for training or running inference with the latent classifier.

    Builds a ``QuackLatentDataModule`` and a ``QuackLatentClassifier``
    (optionally restored from ``--checkpoint_path``), distributes work with
    ``ray_lightning.RayPlugin``, and logs to Comet.

    When ``args.predict`` is set, runs the prediction loop with a
    ``CensoredDataWriter`` streaming batch outputs to ``--storage_path``;
    otherwise runs the training loop with LR monitoring, checkpointing and
    early stopping, followed by a test pass.

    Parameters
    ----------
    args: Namespace
        Command line arguments. Notable options: --data_dir, --batch_size,
        --num_workers, --checkpoint_path, --storage_path, --l_rate,
        --l_rate_min, --l_rate_max_epoch, --exp_label, --ray_nodes.

    Returns
    -------
    void
    """
    data = QuackLatentDataModule(
        args.data_dir,
        batch_size=args.batch_size,
        workers=args.num_workers
    )
    model = QuackLatentClassifier(
        initial_size=256,
        learning_rate=args.l_rate,
        learning_rate_min=args.l_rate_min,
        lr_max_epochs=args.l_rate_max_epoch
    )
    if args.checkpoint_path is not None:
        # Restore weights only; other hyperparameters come from defaults.
        model = QuackLatentClassifier.load_from_checkpoint(
            args.checkpoint_path,
            initial_size=256,
        )
    ray_plugin = RayPlugin(
        num_workers=args.ray_nodes,
        num_cpus_per_worker=1,
        use_gpu=False,
        find_unused_parameters=False
    )
    date_time = strftime("%d %b %Y %H:%M", gmtime())
    device_logger = DeviceStatsMonitor()
    checkpoint_storage = Path(args.storage_path)
    checkpoint_storage.mkdir(parents=True, exist_ok=True)
    # API configuration for comet: https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
    comet_logger = CometLogger(
        project_name="censored-planet",
        experiment_name=f'{args.exp_label}: {date_time}',
    )
    if args.predict:
        writer_callback = CensoredDataWriter(
            write_interval='batch',
            storage_path=args.storage_path
        )
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            callbacks=[writer_callback, device_logger],
            strategy=ray_plugin,
        )
        model.freeze()
        print('Ready for inference...')
        trainer.predict(model, datamodule=data, return_predictions=False)
        return
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        mode='min',
        save_top_k=3,
        save_last=True,
        auto_insert_metric_name=True,
        filename='latent_checkpoint-{step}-{val_loss:02.2f}',
        dirpath=checkpoint_storage,
    )
    early_stopping_callback = EarlyStopping(
        monitor="val_loss",
        mode='min',
        patience=10,
        stopping_threshold=0.05,
        check_finite=True,  # Stops training if the monitored metric becomes NaN or infinite.
    )
    trainer = Trainer.from_argparse_args(
        args,
        logger=comet_logger,
        strategy=ray_plugin,
        callbacks=[
            early_stopping_callback, checkpoint_callback, device_logger,
            lr_monitor
        ],
        weights_save_path=checkpoint_storage
    )
    print('Ready for training...')
    trainer.fit(model, datamodule=data)
    print('Post fit testing...')
    trainer.test(model, datamodule=data)