Example 1
    def fit(self, ds: ParallelPandasDataset, **kwargs) -> NoReturn:
        def model_creator(config):
            # https://github.com/ray-project/ray/issues/5914
            import tensorflow.keras as keras

            model: keras.Model = keras.models.model_from_json(
                self._serialized_model)
            optimizer = keras.optimizers.get(self._serialized_optimizer)
            loss = keras.losses.get(self._serialized_loss)
            metrics = [keras.metrics.get(m) for m in self._serialized_metrics]
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            return model

        data_set = TFDataset(ds, self._feature_columns, self._feature_types,
                             self._feature_shapes, self._label_column,
                             self._label_type, self._label_shape,
                             self._shuffle)

        def data_creator(config):
            return data_set.setup(config), None

        self._trainer = TFTrainer(model_creator, data_creator, self._config,
                                  self._num_workers)
        for i in range(self._num_epochs):
            stats = self._trainer.train()
            print(f"Epoch-{i}: {stats}")
Example 2
    def fit(self,
            train_ds: MLDataset,
            evaluate_ds: Optional[MLDataset] = None) -> NoReturn:
        super().fit(train_ds, evaluate_ds)

        def model_creator(config):
            # https://github.com/ray-project/ray/issues/5914
            import tensorflow.keras as keras  # pylint: disable=C0415, W0404

            model: keras.Model = keras.models.model_from_json(
                self._serialized_model)
            optimizer = keras.optimizers.get(self._serialized_optimizer)
            loss = keras.losses.get(self._serialized_loss)
            metrics = [keras.metrics.get(m) for m in self._serialized_metrics]
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            return model

        train_ds = train_ds.batch(self._batch_size)
        train_tf_ds = self._create_tf_ds(train_ds)

        if evaluate_ds is not None:
            evaluate_ds = evaluate_ds.batch(self._batch_size)
            evaluate_tf_ds = self._create_tf_ds(evaluate_ds)
        else:
            evaluate_tf_ds = None

        def data_creator(config):
            if "TF_CONFIG" in os.environ:
                tf_config = json.loads(os.environ["TF_CONFIG"])
                world_rank = tf_config["task"]["index"]
            else:
                world_rank = -1
            batch_size = config["batch_size"]
            get_shard_config = config.get("get_shard", {})
            if "shuffle" in config:
                get_shard_config["shuffle"] = config["shuffle"]
            train_data = train_tf_ds.get_shard(
                world_rank, **get_shard_config).repeat().batch(batch_size)
            options = tf.data.Options()
            options.experimental_distribute.auto_shard_policy = \
                tf.data.experimental.AutoShardPolicy.OFF
            train_data = train_data.with_options(options)
            evaluate_data = None
            if evaluate_tf_ds is not None:
                evaluate_data = evaluate_tf_ds.get_shard(
                    world_rank, **get_shard_config).batch(batch_size)
                evaluate_data = evaluate_data.with_options(options)
            return train_data, evaluate_data

        self._trainer = TFTrainer(model_creator=model_creator,
                                  data_creator=data_creator,
                                  num_replicas=self._num_workers,
                                  **self._extra_config)
        for i in range(self._num_epochs):
            stats = self._trainer.train()
            print(f"Epoch-{i}: {stats}")

        if evaluate_tf_ds is not None:
            print(self._trainer.validate())
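The data_creator above derives each worker's world rank from the TF_CONFIG environment variable that TensorFlow's distributed runtime sets on every worker. A minimal sketch of the value it parses (the host addresses are made up; the "cluster"/"task" layout is TensorFlow's standard convention):

import json
import os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:12345", "host2:12345"]},
    "task": {"type": "worker", "index": 0},  # "index" is read as world_rank
})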
Example 3
def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create an MLDataset whose columns are RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
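The test above relies on model_creator and make_data_creator helpers defined elsewhere in the test module. A plausible sketch under stated assumptions (the layer sizes and the exact sharding details are guesses, not the actual fixtures); the rank lookup mirrors the TF_CONFIG handling shown in the fit() examples above:

import json
import os
import tensorflow as tf

def model_creator(config):
    # A single linear unit is enough to fit the identity mapping x -> x
    # that the [x, x] records above describe.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
    model.compile(optimizer="sgd", loss="mse")
    return model

def make_data_creator(tf_ds):
    def data_creator(config):
        if "TF_CONFIG" in os.environ:
            world_rank = json.loads(os.environ["TF_CONFIG"])["task"]["index"]
        else:
            world_rank = -1
        train = tf_ds.get_shard(world_rank).repeat().batch(config["batch_size"])
        return train, None
    return data_creator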
Example 4
    def fit(self, ds: ParallelPandasDataset, **kwargs) -> NoReturn:
        if "fit_config" not in self._config:
            self._config["fit_config"] = {}
        if "steps_per_epoch" not in self._config["fit_config"]:

            def count_fn(it) -> int:
                count = 0
                for pdf in it:
                    count += pdf.shape[0]
                return count

            minimum_records = min(ds.apply(count_fn))
            self._config["fit_config"][
                "steps_per_epoch"] = minimum_records // self._batch_size

        def model_creator(config):
            # https://github.com/ray-project/ray/issues/5914
            import tensorflow.keras as keras

            model: keras.Model = keras.models.model_from_json(
                self._serialized_model)
            optimizer = keras.optimizers.get(self._serialized_optimizer)
            loss = keras.losses.get(self._serialized_loss)
            metrics = [keras.metrics.get(m) for m in self._serialized_metrics]
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            return model

        data_set = TFDataset(ds, self._feature_columns, self._feature_types,
                             self._feature_shapes, self._label_column,
                             self._label_type, self._label_shape,
                             self._shuffle)

        def data_creator(config):
            return data_set.setup(config), None

        self._trainer = TFTrainer(model_creator, data_creator, self._config,
                                  self._num_workers)
        for i in range(self._num_epochs):
            stats = self._trainer.train()
            print(f"Epoch-{i}: {stats}")
Example 5
def test_train(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer = TFTrainer(model_creator=simple_model,
                        data_creator=simple_dataset,
                        num_replicas=num_replicas,
                        config=SIMPLE_CONFIG)

    train_stats1 = trainer.train()
    train_stats1.update(trainer.validate())

    train_stats2 = trainer.train()
    train_stats2.update(trainer.validate())
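simple_model, simple_dataset and SIMPLE_CONFIG come from the surrounding test utilities and are not shown here. A representative shape for SIMPLE_CONFIG, assumed rather than verbatim, mirroring the config keys used in the other examples:

SIMPLE_CONFIG = {
    "batch_size": 128,
    "fit_config": {"steps_per_epoch": 3},
}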
Example 6
class TFEstimator(EstimatorInterface, SparkEstimatorInterface):
    def __init__(self,
                 num_workers: int = 1,
                 model: keras.Model = None,
                 optimizer: Union[keras.optimizers.Optimizer, str] = None,
                 loss: Union[keras.losses.Loss, str] = None,
                 metrics: Union[List[keras.metrics.Metric], List[str]] = None,
                 feature_columns: Union[str, List[str]] = None,
                 feature_types: Optional[Union[DType, List[DType]]] = None,
                 feature_shapes: Optional[Union[TensorShape,
                                                List[TensorShape]]] = None,
                 label_column: str = None,
                 label_type: Optional[tf.DType] = None,
                 label_shape: Optional[tf.TensorShape] = None,
                 batch_size: int = 128,
                 num_epochs: int = 1,
                 shuffle: bool = True,
                 **extra_config):
        """A scikit-learn like API to distributed training Tensorflow Keras model.

        In the backend it leverage the ray.sgd.TorchTrainer.
        :param num_workers: the number of workers for distributed model training
        :param model: the model, it should be instance of tensorflow.keras.Model. We do not support
                      multiple output models.
        :param optimizer: the optimizer, it should be keras.optimizers.Optimizer instance or str.
                          We do not support multiple optimizers currently.
        :param loss: the loss, it should be keras.losses.Loss instance or str. We do not support
                     multiple losses.
        :param metrics: the metrics list. It could be None, a list of keras.metrics.Metric instance
                        or a list of str.
        :param feature_columns: the feature columns name.
               The inputs of the model will be match the feature columns.
               .. code-block:: python
                   feature_columns = ["x", "y", "z"]
                   # the input to the model will be (x_batch_tensor, y_batch_tensor, z_batch_tensor)
        :param feature_types: the type for each feature input. It must match the length of the
                              feature_columns if provided. It will be tf.float32 by default.
        :param feature_shapes: the shape for each feature input. It must match the length of the
                               feature_columns
        :param label_column: the label column name.
        :param label_type: the label type, it will be tf.float32 by default.
        :param label_shape: the label shape.
        :param batch_size: the batch size
        :param num_epochs: the number of epochs
        :param shuffle: whether input dataset should be shuffle, True by default.
        :param extra_config: extra config will fit into TFTrainer. You can also set
               the get_shard config with
               {"get_shard": {batch_ms=0, num_async=5, shuffle_buffer_size=2, seed=0}}.
               You can refer to the MLDataset.get_repeatable_shard for the parameters.
        """
        self._num_workers: int = num_workers

        # model
        assert model is not None, "model must not be None"
        if isinstance(model, keras.Model):
            self._serialized_model = model.to_json()
        else:
            raise Exception(
                "Unsupported parameter, we only support tensorflow.keras.Model"
            )

        # optimizer
        # TODO: we should support multiple optimizers for multiple outputs model
        assert optimizer is not None, "optimizer must not be None"
        if isinstance(optimizer, str):
            # it is a str representing the optimizer
            _optimizer = optimizer
        elif isinstance(optimizer, keras.optimizers.Optimizer):
            _optimizer = keras.optimizers.serialize(optimizer)
        else:
            raise Exception(
                "Unsupported parameter, we only support keras.optimizers.Optimizer subclass "
                "instance or a str to represent the optimizer")
        self._serialized_optimizer = _optimizer

        # loss
        # TODO: we should support multiple losses for multiple outputs model
        assert loss is not None, "loss must not be None"
        if isinstance(loss, str):
            _loss = loss
        elif isinstance(loss, keras.losses.Loss):
            _loss = keras.losses.serialize(loss)
        else:
            raise Exception(
                "Unsupported parameter, we only support keras.losses.Loss subclass "
                "instance or a str to represents the loss)")
        self._serialized_loss = _loss

        # metrics
        if metrics is None:
            _metrics = None
        else:
            assert isinstance(metrics, list), "metrics must be a list"
            if isinstance(metrics[0], str):
                _metrics = metrics
            elif isinstance(metrics[0], keras.metrics.Metric):
                _metrics = [keras.metrics.serialize(m) for m in metrics]
            else:
                raise Exception(
                    "Unsupported parameter, we only support list of "
                    "keras.metrics.Metrics instances or list of str to")
        self._serialized_metrics = _metrics

        self._feature_columns = feature_columns
        self._feature_types = feature_types
        self._feature_shapes = feature_shapes
        self._label_column = label_column
        self._label_type = label_type
        self._label_shape = label_shape
        self._batch_size = batch_size
        self._extra_config = extra_config

        config = {"batch_size": self._batch_size, "shuffle": shuffle}
        if self._extra_config:
            if "config" in self._extra_config:
                self._extra_config["config"].update(config)
            else:
                self._extra_config["config"] = config
        else:
            self._extra_config = {"config": config}

        self._num_epochs: int = num_epochs

        self._trainer: Optional[TFTrainer] = None

    def _create_tf_ds(self, ds: MLDataset) -> TFMLDataset:
        return ds.to_tf(self._feature_columns, self._feature_shapes,
                        self._feature_types, self._label_column,
                        self._label_shape, self._label_type)

    def fit(self,
            train_ds: MLDataset,
            evaluate_ds: Optional[MLDataset] = None) -> NoReturn:
        super(TFEstimator, self).fit(train_ds, evaluate_ds)

        def model_creator(config):
            # https://github.com/ray-project/ray/issues/5914
            import tensorflow.keras as keras

            model: keras.Model = keras.models.model_from_json(
                self._serialized_model)
            optimizer = keras.optimizers.get(self._serialized_optimizer)
            loss = keras.losses.get(self._serialized_loss)
            metrics = [keras.metrics.get(m) for m in self._serialized_metrics]
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            return model

        train_ds = train_ds.batch(self._batch_size)
        train_tf_ds = self._create_tf_ds(train_ds)

        if evaluate_ds is not None:
            evaluate_ds = evaluate_ds.batch(self._batch_size)
            evaluate_tf_ds = self._create_tf_ds(evaluate_ds)
        else:
            evaluate_tf_ds = None

        def data_creator(config):
            if "TF_CONFIG" in os.environ:
                tf_config = json.loads(os.environ["TF_CONFIG"])
                world_rank = tf_config["task"]["index"]
            else:
                world_rank = -1
            batch_size = config["batch_size"]
            get_shard_config = config.get("get_shard", {})
            if "shuffle" in config:
                get_shard_config["shuffle"] = config["shuffle"]
            train_data = train_tf_ds.get_shard(
                world_rank, **get_shard_config).repeat().batch(batch_size)
            evaluate_data = None
            if evaluate_tf_ds is not None:
                evaluate_data = evaluate_tf_ds.get_shard(
                    world_rank, **get_shard_config).batch(batch_size)
            return train_data, evaluate_data

        self._trainer = TFTrainer(model_creator=model_creator,
                                  data_creator=data_creator,
                                  num_replicas=self._num_workers,
                                  **self._extra_config)
        for i in range(self._num_epochs):
            stats = self._trainer.train()
            print(f"Epoch-{i}: {stats}")

        if evaluate_tf_ds is not None:
            print(self._trainer.validate())

    def fit_on_spark(self,
                     train_df: DF,
                     evaluate_df: OPTIONAL_DF = None,
                     fs_directory: Optional[str] = None,
                     compression: Optional[str] = None) -> NoReturn:
        super(TFEstimator, self).fit_on_spark(train_df, evaluate_df)
        train_df = self._check_and_convert(train_df)
        if evaluate_df is not None:
            evaluate_df = self._check_and_convert(evaluate_df)
        train_ds = create_ml_dataset_from_spark(train_df, self._num_workers,
                                                self._batch_size, fs_directory,
                                                compression)
        evaluate_ds = None
        if evaluate_df is not None:
            evaluate_ds = create_ml_dataset_from_spark(evaluate_df,
                                                       self._num_workers,
                                                       self._batch_size,
                                                       fs_directory,
                                                       compression)
        return self.fit(train_ds, evaluate_ds)

    def get_model(self) -> Any:
        assert self._trainer, "Trainer has not been created"
        return self._trainer.get_model()

    def save(self, file_path) -> NoReturn:
        assert self._trainer, "Trainer has not been created"
        self._trainer.save(file_path)

    def restore(self, file_path) -> NoReturn:
        assert self._trainer, "Trainer has not been created"
        self._trainer.restore(file_path)

    def shutdown(self) -> NoReturn:
        if self._trainer is not None:
            self._trainer.shutdown()
            del self._trainer
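A hedged end-to-end usage sketch for the estimator above. The Keras model, the MLDataset variable train_ds, and the concrete values are assumptions; the keyword arguments mirror the constructor signature and the docstring's get_shard note.

from tensorflow import keras

model = keras.Sequential([keras.layers.Dense(1, input_shape=(1,))])

estimator = TFEstimator(
    num_workers=2,
    model=model,
    optimizer="adam",
    loss="mse",
    metrics=["mae"],
    feature_columns=["x"],
    label_column="y",
    batch_size=64,
    num_epochs=2,
    # extra_config: forwarded to TFTrainer; the "get_shard" entry is read
    # inside data_creator and passed to MLDataset.get_repeatable_shard.
    config={"get_shard": {"shuffle_buffer_size": 2, "seed": 0}},
)
estimator.fit(train_ds)          # train_ds: an MLDataset built elsewhere
estimator.save("/tmp/tf_model")  # path is illustrative
estimator.shutdown()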
Example 7
class TFEstimator(EstimatorInterface, SparkEstimatorInterface):
    def __init__(self,
                 num_workers: int = 1,
                 model: keras.Model = None,
                 optimizer: Union[keras.optimizers.Optimizer, str] = None,
                 loss: Union[keras.losses.Loss, str] = None,
                 metrics: Union[List[keras.metrics.Metric], List[str]] = None,
                 feature_columns: Union[str, List[str]] = None,
                 feature_types: Optional[Union[DType, List[DType]]] = None,
                 feature_shapes: Optional[Union[TensorShape,
                                                List[TensorShape]]] = None,
                 label_column: str = None,
                 label_type: Optional[tf.DType] = None,
                 label_shape: Optional[tf.TensorShape] = None,
                 batch_size: int = 128,
                 num_epochs: int = 1,
                 shuffle: bool = True,
                 **extra_config):
        """
        A scikit-learn like API to distributed training Tensorflow Keras model. In the backend it
        leverage the ray.sgd.TorchTrainer.
        :param num_workers: the number of workers for distributed model training
        :param model: the model, it should be instance of tensorflow.keras.Model. We do not support
                      multiple output models.
        :param optimizer: the optimizer, it should be keras.optimizers.Optimizer instance or str.
                          We do not support multiple optimizers currently.
        :param loss: the loss, it should be keras.losses.Loss instance or str. We do not support
                     multiple losses.
        :param metrics: the metrics list. It could be None, a list of keras.metrics.Metric instance
                        or a list of str.
        :param feature_columns: the feature columns name.
        :param feature_types: the type for each feature input. It must match the length of the
                              feature_columns if provided. It will be tf.float32 by default.
        :param feature_shapes: the shape for each feature input. It must match the length of the
                               feature_columns
        :param label_column: the label column name.
        :param label_type: the label type, it will be tf.float32 by default.
        :param label_shape: the label shape.
        :param batch_size: the batch size
        :param num_epochs: the number of epochs
        :param shuffle: whether input dataset should be shuffle, True by default.
        :param extra_config: extra config will fit into TFTrainer.
        """
        self._num_workers: int = num_workers

        # model
        assert model is not None, "model must not be None"
        if isinstance(model, keras.Model):
            self._serialized_model = model.to_json()
        else:
            raise Exception(
                "Unsupported parameter, we only support tensorflow.keras.Model"
            )

        # optimizer
        # TODO: we should support multiple optimizers for multiple outputs model
        assert optimizer is not None, "optimizer must not be None"
        if isinstance(optimizer, str):
            # it is a str representing the optimizer
            _optimizer = optimizer
        elif isinstance(optimizer, keras.optimizers.Optimizer):
            _optimizer = keras.optimizers.serialize(optimizer)
        else:
            raise Exception(
                "Unsupported parameter, we only support keras.optimizers.Optimizer subclass "
                "instance or a str to represent the optimizer")
        self._serialized_optimizer = _optimizer

        # loss
        # TODO: we should support multiple losses for multiple outputs model
        assert loss is not None, "loss must not be None"
        if isinstance(loss, str):
            _loss = loss
        elif isinstance(loss, keras.losses.Loss):
            _loss = keras.losses.serialize(loss)
        else:
            raise Exception(
                "Unsupported parameter, we only support keras.losses.Loss subclass "
                "instance or a str to represents the loss)")
        self._serialized_loss = _loss

        # metrics
        if metrics is None:
            _metrics = None
        else:
            assert isinstance(metrics, list), "metrics must be a list"
            if isinstance(metrics[0], str):
                _metrics = metrics
            elif isinstance(metrics[0], keras.metrics.Metric):
                _metrics = [keras.metrics.serialize(m) for m in metrics]
            else:
                raise Exception(
                    "Unsupported parameter, we only support list of "
                    "keras.metrics.Metrics instances or list of str to")
        self._serialized_metrics = _metrics

        self._feature_columns = feature_columns
        self._feature_types = feature_types
        self._feature_shapes = feature_shapes
        self._label_column = label_column
        self._label_type = label_type
        self._label_shape = label_shape
        self._batch_size = batch_size

        _config = {"batch_size": batch_size}
        if extra_config is not None:
            _config.update(extra_config)
        self._config = _config
        self._num_epochs: int = num_epochs
        self._shuffle: bool = shuffle

        self._trainer: Optional[TFTrainer] = None

    def fit(self, ds: ParallelPandasDataset, **kwargs) -> NoReturn:
        if "fit_config" not in self._config:
            self._config["fit_config"] = {}
        if "steps_per_epoch" not in self._config["fit_config"]:

            def count_fn(it) -> int:
                count = 0
                for pdf in it:
                    count += pdf.shape[0]
                return count

            minimum_records = min(ds.apply(count_fn))
            self._config["fit_config"][
                "steps_per_epoch"] = minimum_records // self._batch_size

        def model_creator(config):
            # https://github.com/ray-project/ray/issues/5914
            import tensorflow.keras as keras

            model: keras.Model = keras.models.model_from_json(
                self._serialized_model)
            optimizer = keras.optimizers.get(self._serialized_optimizer)
            loss = keras.losses.get(self._serialized_loss)
            metrics = [keras.metrics.get(m) for m in self._serialized_metrics]
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            return model

        data_set = TFDataset(ds, self._feature_columns, self._feature_types,
                             self._feature_shapes, self._label_column,
                             self._label_type, self._label_shape,
                             self._shuffle)

        def data_creator(config):
            return data_set.setup(config), None

        self._trainer = TFTrainer(model_creator, data_creator, self._config,
                                  self._num_workers)
        for i in range(self._num_epochs):
            stats = self._trainer.train()
            print(f"Epoch-{i}: {stats}")

    def fit_on_spark(self, df, **kwargs) -> NoReturn:
        super(TFEstimator, self).fit_on_spark(df, **kwargs)
        ds = save_to_ray(df, self._num_workers)
        self.fit(ds)

    def evaluate(self, df: ParallelPandasDataset, **kwargs) -> NoReturn:
        if self._trainer is None:
            raise Exception("Must call fit first")
        dataset = TFDataset(df, self._feature_columns, self._feature_types,
                            self._feature_shapes, self._label_column,
                            self._label_type, self._label_shape, self._shuffle)
        config = self._config
        tf_dataset: tf.data.Dataset = dataset.setup(config)
        model: keras.Model = self._trainer.get_model()
        result = model.evaluate(tf_dataset)
        print(result)

    def evaluate_on_spark(self, df, **kwargs) -> NoReturn:
        super(TFEstimator, self).evaluate_on_spark(df)
        if self._trainer is None:
            raise Exception("Must call fit first")
        pdf = df.toPandas()
        dataset = PandasTFDataset(pdf, self._feature_columns,
                                  self._feature_types, self._feature_shapes,
                                  self._label_column, self._label_type,
                                  self._label_shape, self._shuffle)
        config = self._config
        tf_dataset: tf.data.Dataset = dataset.setup(config)
        model: keras.Model = self._trainer.get_model()
        result = model.evaluate(tf_dataset)
        print(result)

    def get_model(self) -> Any:
        assert self._trainer, "Trainer has not been created"
        return self._trainer.get_model()

    def save(self, file_path) -> NoReturn:
        assert self._trainer, "Trainer has not been created"
        self._trainer.save(file_path)

    def restore(self, file_path) -> NoReturn:
        assert self._trainer, "Trainer has not been created"
        self._trainer.restore(file_path)

    def shutdown(self) -> NoReturn:
        if self._trainer is not None:
            self._trainer.shutdown()
            del self._trainer
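This variant also exposes Spark-facing entry points. A hedged sketch of how fit_on_spark and evaluate_on_spark would be driven; the SparkSession named spark, the DataFrame contents, and the already-constructed estimator are assumptions:

# Column names match the feature_columns/label_column configured on the
# estimator; the rows are illustrative.
df = spark.createDataFrame([(0.1, 0.2), (0.3, 0.6)], ["x", "y"])

estimator.fit_on_spark(df)       # converts df via save_to_ray, then calls fit()
estimator.evaluate_on_spark(df)  # collects with toPandas() and evaluates on the driver
estimator.shutdown()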
Example 8
def test_save_and_restore(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer1 = TFTrainer(
        model_creator=simple_model,
        data_creator=simple_dataset,
        num_replicas=num_replicas,
        config=SIMPLE_CONFIG,
    )
    trainer1.train()

    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "checkpoint")
    trainer1.save(filename)

    model1 = trainer1.get_model()
    trainer1.shutdown()

    trainer2 = TFTrainer(
        model_creator=simple_model,
        data_creator=simple_dataset,
        num_replicas=num_replicas,
        config=SIMPLE_CONFIG,
    )
    trainer2.restore(filename)

    model2 = trainer2.get_model()
    trainer2.shutdown()

    shutil.rmtree(tmpdir)

    model1.get_config()
    model2.get_config()

    model1_weights = model1.get_weights()
    model2_weights = model2.get_weights()
    assert _compare(model1_weights, model2_weights)

    model1.optimizer.get_weights()
    model2.optimizer.get_weights()
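The _compare helper used above is defined elsewhere in the test module. A plausible sketch, an assumption rather than the actual implementation, that checks two lists of weight arrays element-wise:

import numpy as np

def _compare(weights_a, weights_b, rtol=1e-5, atol=1e-8):
    # Two checkpoints match if every corresponding weight array is close.
    if len(weights_a) != len(weights_b):
        return False
    return all(
        np.allclose(a, b, rtol=rtol, atol=atol)
        for a, b in zip(weights_a, weights_b)
    )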