Example #1
0
def eval_fn(
    predictor_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        eval_shard = RayDatasetShard(
            rt.get_dataset_shard("eval"),
            features,
            training_set_metadata,
        )

        model = ray.get(model_ref)
        device = get_torch_device()
        model = model.to(device)

        predictor = RemotePredictor(model=model,
                                    horovod=hvd,
                                    report_tqdm_to_ray=True,
                                    **predictor_kwargs)
        return predictor.batch_evaluation(eval_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
Example #2
0
def tune_learning_rate_fn(
    dataset: RayDataset,
    config: Dict[str, Any],
    data_loader_kwargs: Dict[str, Any] = None,
    executable_kwargs: Dict[str, Any] = None,
    model: ECD = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
) -> float:
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        pipe = dataset.pipeline(shuffle=False, **data_loader_kwargs)
        train_shard = RayDatasetShard(
            pipe,
            features,
            training_set_metadata,
        )

        device = get_torch_device()
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        return trainer.tune_learning_rate(config, train_shard, **kwargs)
    finally:
        torch.cuda.empty_cache()
        hvd.shutdown()
Example #3
0
    def __init__(self, model: ECD, gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, **kwargs):
        horovod = initialize_horovod()
        initialize_pytorch(
            gpus=gpus, gpu_memory_limit=gpu_memory_limit, allow_parallel_threads=allow_parallel_threads, horovod=horovod
        )
        super().__init__(model, horovod=horovod, **kwargs)

        # Only return results from rank 0 to reduce network overhead
        self.batch_predict = return_first(self.batch_predict)
        self.batch_evaluation = return_first(self.batch_evaluation)
Example #4
0
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        rt.get_dataset_shard("train"),
        features,
        training_set_metadata,
    )

    try:
        val_shard = rt.get_dataset_shard("val")
    except KeyError:
        val_shard = None

    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    try:
        test_shard = rt.get_dataset_shard("test")
    except KeyError:
        test_shard = None

    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    trainer = RemoteTrainer(model=model, **executable_kwargs)
    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

    # TODO(shreya): Figure out GPU memory leak
    # TODO(shreya): Check if placing model off GPU explicitly makes a difference
    # Clear CUDA memory, place model on CPU, return model to user
    # torch.cuda.empty_cache()
    # model.cpu()

    return results, trainer.validation_field, trainer.validation_metric
Example #5
0
def legacy_train_fn(
    trainer: RemoteTrainer = None,
    remote_model: "LudwigModel" = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    train_shards: List[DatasetPipeline] = None,
    val_shards: List[DatasetPipeline] = None,
    test_shards: List[DatasetPipeline] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    initialize_pytorch(horovod=hvd)

    train_shard = RayDatasetShard(
        train_shards[hvd.rank()],
        features,
        training_set_metadata,
    )

    val_shard = val_shards[hvd.rank()] if val_shards else None
    if val_shard is not None:
        val_shard = RayDatasetShard(
            val_shard,
            features,
            training_set_metadata,
        )

    test_shard = test_shards[hvd.rank()] if test_shards else None
    if test_shard is not None:
        test_shard = RayDatasetShard(
            test_shard,
            features,
            training_set_metadata,
        )

    results = trainer.train(train_shard, val_shard, test_shard, **kwargs)
    return results
Example #6
0
 def initialize(self):
     self._horovod = initialize_horovod()
Example #7
0
 def __init__(self, **kwargs):
     horovod = initialize_horovod()
     super().__init__(horovod=horovod, **kwargs)
Example #8
0
def train_fn(
    executable_kwargs: Dict[str, Any] = None,
    model_ref: ObjectRef = None,  # noqa: F821
    training_set_metadata: Dict[str, Any] = None,
    features: Dict[str, Dict] = None,
    **kwargs,
):
    # Pin GPU before loading the model to prevent memory leaking onto other devices
    hvd = initialize_horovod()
    try:
        initialize_pytorch(horovod=hvd)

        train_shard = RayDatasetShard(
            rt.get_dataset_shard("train"),
            features,
            training_set_metadata,
        )

        try:
            val_shard = rt.get_dataset_shard("val")
        except KeyError:
            val_shard = None

        if val_shard is not None:
            val_shard = RayDatasetShard(
                val_shard,
                features,
                training_set_metadata,
            )

        try:
            test_shard = rt.get_dataset_shard("test")
        except KeyError:
            test_shard = None

        if test_shard is not None:
            test_shard = RayDatasetShard(
                test_shard,
                features,
                training_set_metadata,
            )

        model = ray.get(model_ref)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)

        trainer = RemoteTrainer(model=model, horovod=hvd, **executable_kwargs)
        results = trainer.train(train_shard, val_shard, test_shard, **kwargs)

        if results is not None:
            # only return the model state dict back to the head node.
            trained_model, *args = results
            results = (trained_model.cpu().state_dict(), *args)

        torch.cuda.empty_cache()

        train_results = results, trainer.validation_field, trainer.validation_metric

    finally:
        hvd.shutdown()
    return train_results