def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        rt.get_dataset_shard("train"),
        features,
        training_set_metadata,
    )

    try:
        val_shard = rt.get_dataset_shard("val")
    except KeyError:
        val_shard = None

    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    try:
        test_shard = rt.get_dataset_shard("test")
    except KeyError:
        test_shard = None

    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    trainer = RemoteTrainer(model=model, **executable_kwargs)
    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

    # TODO(shreya): Figure out GPU memory leak
    # TODO(shreya): Check if placing model off GPU explicitly makes a difference
    # Clear CUDA memory, place model on CPU, return model to user
    # torch.cuda.empty_cache()
    # model.cpu()

    return results, trainer.validation_field, trainer.validation_metric
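
# The optional-shard logic above (fetch a named shard, fall back to None on
# KeyError, then wrap it in a RayDatasetShard) is repeated verbatim for "val"
# and "test". A small helper could factor it out; this is a hypothetical
# sketch, not part of the module, reusing the same `rt`, `RayDatasetShard`,
# `features`, and `training_set_metadata` names used above.
def _get_optional_shard(name, features, training_set_metadata):
    try:
        shard = rt.get_dataset_shard(name)
    except KeyError:
        # The named shard was not provided to this Ray Train run.
        return None
    return RayDatasetShard(shard, features, training_set_metadata)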
def eval_fn(
    predictor_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        eval_shard = RayDatasetShard(
            rt.get_dataset_shard("eval"),
            features,
            training_set_metadata,
        )

        # Materialize the model from the Ray object store and move it to this worker's device
        model = ray.get(model_ref)
        device = get_torch_device()
        model = model.to(device)

        predictor = RemotePredictor(model=model, horovod=hvd, report_tqdm_to_ray=True, **predictor_kwargs)
        return predictor.batch_evaluation(eval_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
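
# Hypothetical caller-side sketch for eval_fn: the model is put into the Ray
# object store once, so only the small ObjectRef (rather than the full model)
# is shipped to each remote worker, which then materializes it with ray.get()
# as shown above. `trained_model` is an illustrative name, not a name from
# this module.
model_ref = ray.put(trained_model)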
def tune_learning_rate_fn(
    dataset: RayDataset,
    config: Dict[str, Any],
    data_loader_kwargs: Dict[str, Any] = None,
    executable_kwargs: Dict[str, Any] = None,
    model: ECD = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
) -> float:
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        pipe = dataset.pipeline(shuffle=False, **data_loader_kwargs)
        train_shard = RayDatasetShard(
            pipe,
            features,
            training_set_metadata,
        )

        device = get_torch_device()
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        return trainer.tune_learning_rate(config, train_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
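
# Hypothetical usage sketch for tune_learning_rate_fn: the returned float is
# written back into the configuration before full training starts. The
# `train_dataset` name and the "trainer"/"learning_rate" config keys are
# illustrative assumptions, not confirmed by this module.
tuned_lr = tune_learning_rate_fn(
    dataset=train_dataset,
    config=config,
    data_loader_kwargs={},
    executable_kwargs={},
    model=model,
    training_set_metadata=training_set_metadata,
    features=features,
)
config["trainer"]["learning_rate"] = tuned_lr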
def legacy_train_fn(
    trainer: RemoteTrainer = None,
    remote_model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    train_shards: List[DatasetPipeline] = None,
    val_shards: List[DatasetPipeline] = None,
    test_shards: List[DatasetPipeline] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        train_shards[hvd.rank()],
        features,
        training_set_metadata,
    )

    val_shard = val_shards[hvd.rank()] if val_shards else None
    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    test_shard = test_shards[hvd.rank()] if test_shards else None
    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)
    return results
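
# Hypothetical caller-side sharding for legacy_train_fn: one DatasetPipeline
# shard per Horovod worker, indexed by hvd.rank() inside the function. This
# sketch assumes Ray's DatasetPipeline.split() API; `train_pipeline` and
# `num_workers` are illustrative names.
train_shards = train_pipeline.split(n=num_workers, equal=True)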
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        train_shard = RayDatasetShard(
            rt.get_dataset_shard("train"),
            features,
            training_set_metadata,
        )

        try:
            val_shard = rt.get_dataset_shard("val")
        except KeyError:
            val_shard = None

        if val_shard is not None:
            val_shard = RayDatasetShard(
                val_shard,
                features,
                training_set_metadata,
            )

        try:
            test_shard = rt.get_dataset_shard("test")
        except KeyError:
            test_shard = None

        if test_shard is not None:
            test_shard = RayDatasetShard(
                test_shard,
                features,
                training_set_metadata,
            )

        # Materialize the model from the Ray object store and move it to this worker's device
        model = ray.get(model_ref)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

        if results is not None:
            # Only return the model state dict back to the head node, avoiding
            # shipping the full model object (and any GPU tensors) over the network.
            trained_model, *args = results
            results = (trained_model.cpu().state_dict(), *args)

        torch.cuda.empty_cache()

        train_results = results, trainer.validation_field, trainer.validation_metric
    finally:
        hvd.shutdown()

    return train_results
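
# Hypothetical head-node sketch: rebuild the trained model from the state dict
# returned by train_fn. Assumes the caller still holds a `model` object with
# the same architecture as the one placed in the object store; `train_results`
# is the first element of train_fn's return value.
results, validation_field, validation_metric = train_results
if results is not None:
    state_dict, *other_results = results
    model.load_state_dict(state_dict)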