Example 1
    def check_dataset(self, create_ds):

        seq = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(20, )),
            tf.keras.layers.Dense(10, activation="softmax")
        ])

        seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
        model = KerasModel(seq)

        model.fit(create_ds("train"))
        model.predict(create_ds("predict")).collect()
        model.evaluate(create_ds("evaluate"))
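
The create_ds fixture is supplied by the test harness and is not shown here. A plausible sketch, assuming it builds a BigDL TFDataset from an RDD the same way Example 4 does (the import path, the SparkContext sc, sample counts, and batch values are all assumptions):

import numpy as np
import tensorflow as tf
from bigdl.orca.tfpark import TFDataset

# Hypothetical create_ds fixture (not part of the original test).
# fit requires the dataset to set batch_size, while predict/evaluate
# require batch_per_thread, as the error messages in Example 4 show.
def create_ds(mode, sc):
    x = np.random.rand(100, 20)
    y = np.random.randint(0, 10, (100,))
    rdd = sc.parallelize(x).zip(sc.parallelize(y))
    if mode == "train":
        return TFDataset.from_rdd(rdd,
                                  features=(tf.float32, [20]),
                                  labels=(tf.int32, []),
                                  batch_size=32)
    return TFDataset.from_rdd(rdd,
                              features=(tf.float32, [20]),
                              labels=(tf.int32, []),
                              batch_per_thread=4)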
Example 2
    def test_predict_with_ndarray_distributed(self):

        keras_model = self.create_model()
        model = KerasModel(keras_model)

        x, y = self.create_training_data()

        results_pre = model.evaluate(x, y)

        pred_y = np.argmax(model.predict(x, distributed=True), axis=1)
        acc = np.average((pred_y == y))
        print(results_pre)
        assert np.square(acc - results_pre["acc"]) < 0.000001
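
The create_model and create_training_data fixtures are also external to this snippet. A minimal sketch, shaped to match the 10-feature, binary-label data that Example 4 feeds the same model (layer sizes and sample counts are assumptions):

    # Hypothetical fixtures (not in the original source), consistent with
    # the (20, 10) features and binary labels used in Example 4.
    def create_model(self):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(2, activation="softmax",
                                  input_shape=(10,))
        ])
        model.compile(optimizer="rmsprop",
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])
        return model

    def create_training_data(self):
        x = np.random.rand(20, 10)
        y = np.random.randint(0, 2, (20,))
        return x, y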
Example 3
    def test_predict_with_dataset(self):

        keras_model = self.create_model()
        model = KerasModel(keras_model)

        x, y = self.create_training_data()
        results_pre = model.evaluate(x, y)

        pred_y = np.argmax(np.array(model.predict(
            self.create_predict_dataset()).collect()), axis=1)

        acc = np.average((pred_y == y))

        assert np.square(acc - results_pre["acc"]) < 0.000001
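
create_predict_dataset is likewise not shown. A plausible sketch, assuming a features-only TFDataset built as in Example 4, with batch_per_thread set because the dataset is consumed by predict (the batch value is an assumption):

    # Hypothetical create_predict_dataset (not part of the original test).
    # Prediction datasets carry features only and must set batch_per_thread.
    def create_predict_dataset(self):
        x, _ = self.create_training_data()
        rdd_x = self.sc.parallelize(x)
        return TFDataset.from_rdd(rdd_x,
                                  features=(tf.float32, [10]),
                                  batch_per_thread=4)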
Example 4
    def test_dataset_without_batch(self):
        x = np.random.rand(20, 10)
        y = np.random.randint(0, 2, (20))

        rdd_x = self.sc.parallelize(x)
        rdd_y = self.sc.parallelize(y)

        rdd = rdd_x.zip(rdd_y)

        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [10]),
                                     labels=(tf.int32, []),
                                     names=["features", "labels"],
                                     val_rdd=rdd)

        keras_model = self.create_model()
        model = KerasModel(keras_model)
        self.intercept(
            lambda: model.fit(dataset), "The batch_size of TFDataset must be" +
            " specified when used in KerasModel fit.")

        dataset = TFDataset.from_rdd(
            rdd,
            features=(tf.float32, [10]),
            labels=(tf.int32, []),
            names=["features", "labels"],
        )
        self.intercept(
            lambda: model.evaluate(dataset),
            "The batch_per_thread of TFDataset must be " +
            "specified when used in KerasModel evaluate.")

        dataset = TFDataset.from_rdd(
            rdd_x,
            features=(tf.float32, [10]),
            names=["features"],
        )
        self.intercept(
            lambda: model.predict(dataset),
            "The batch_per_thread of TFDataset must be" +
            " specified when used in KerasModel predict.")
Example 5
class KerasEstimator(Estimator):
    def __init__(self, keras_model, metrics, model_dir, optimizer):
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        from bigdl.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer,
                                                     Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            label_cols=None,
            validation_data=None,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, or tf.data.Dataset.
               If data is XShards, each partition can be a Pandas DataFrame or a dictionary of
               {'x': feature, 'y': label}, where feature (label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is a (feature tensor tuple, label tensor
               tuple) pair.
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame or XShards of
               Pandas DataFrame.
        :param validation_data: validation data. Validation data type should be the same
               as train data.
        :param session_config: tensorflow session configuration for training.
               Should be an object of tf.ConfigProto.
        :param checkpoint_trigger: when to trigger checkpoint during training.
               Should be a bigdl.orca.learn.trigger, like EveryEpoch(), SeveralIteration(
               num_iterations), etc.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and, if so, apply sharding on files; otherwise shard on records. Default is True.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in training"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in training"
                data, validation_data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols, validation_data, "fit")

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            if validation_data is not None:
                validation_data = validation_data.map(
                    _standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data,
                             batch_size=batch_size,
                             batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=False,
                             shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(
            self.model.model,
            dataset,
            model_dir=self.model.model_dir,
            session_config=session_config,
            metrics=self.metrics,
            optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(
                clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(
                self.clip_min, self.clip_max)

        if self.load_checkpoint:
            self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                              self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(
                self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs),
                                   checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted.
               It can be XShards, Spark DataFrame, or tf.data.Dataset.
               If data is XShards, each partition can be a Pandas DataFrame or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
               If data is tf.data.Dataset, each element is a feature tensor tuple.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame or
               XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and, if so, apply sharding on files; otherwise shard on records. Default is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
                 and the schema for each result is: {'prediction': predicted numpy array or
                 list of predicted numpy arrays}.
                 If input data is Spark DataFrame, the predict result is a DataFrame which includes
                 original columns plus 'prediction' column. The 'prediction' column can be
                 FloatType, VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in prediction"
                data = process_xshards_of_pandas_dataframe(data, feature_cols)

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 auto_shard_files=False):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, or tf.data.Dataset.
               If data is XShards, each partition can be a Pandas DataFrame or a dictionary of
               {'x': feature, 'y': label}, where feature (label) is a numpy array or a tuple of
               numpy arrays.
               If data is tf.data.Dataset, each element is a (feature tensor tuple, label tensor
               tuple) pair.
        :param batch_size: batch size per thread.
        :param feature_cols: feature column names if evaluation data is Spark DataFrame or
               XShards of Pandas DataFrame.
        :param label_cols: label column names if evaluation data is Spark DataFrame or XShards
               of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and, if so, apply sharding on files; otherwise shard on records. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in evaluation"
                assert label_cols is not None, \
                    "label columns is None; it should not be None in evaluation"
                data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols)

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    @enable_multi_fs_save
    def save_keras_model(self, path, overwrite=True):
        """
        Save tensorflow keras model in this estimator.

        :param path: keras model save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the trained Keras model

        :return: The trained Keras model
        """
        return self.model.model

    @enable_multi_fs_save
    def save(self, model_path, overwrite=True):
        """
        Save model to model_path

        :param model_path: path to save the trained model.
        :param overwrite: Whether to silently overwrite any existing file at the target location.

        :return:
        """
        self.save_keras_model(model_path, overwrite=overwrite)

    @enable_multi_fs_load
    def load(self, model_path):
        """
        Load existing keras model

        :param model_path: Path to the existing keras model.
        :return:
        """
        self.model = KerasModel.load_model(model_path)

    def clear_gradient_clipping(self):
        """
        Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
        In order to take effect, it needs to be called before fit.

        :return:
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Set constant gradient clipping during the training process.
        In order to take effect, it needs to be called before fit.

        :param min: The minimum value to clip by.
        :param max: The maximum value to clip by.
        :return:
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Clip gradient to a maximum L2-Norm during the training process.
        In order to take effect, it needs to be called before fit.

        :param clip_norm: Gradient L2-Norm threshold.
        :return:
        """
        self.clip_norm = clip_norm

    @enable_multi_fs_save
    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save tensorflow keras model weights in this estimator.

        :param filepath: keras model weights save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :param save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
               '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
               `None` defaults to 'tf'.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    @enable_multi_fs_load
    def load_keras_weights(self, filepath, by_name=False):
        """
        Load tensorflow keras model weights into this estimator.

        :param filepath: path to the keras model weights file.
        :param by_name: Boolean, whether to load weights by name or by topological
               order. Only topological loading is supported for weight files in
               TensorFlow format.
        """
        self.model.load_weights(filepath, by_name)
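
A minimal end-to-end sketch of how this estimator might be driven, assuming BigDL Orca's XShards.partition helper for in-memory numpy data (the model shape, paths, and hyperparameters are all assumptions, not part of the source):

import numpy as np
import tensorflow as tf
from bigdl.orca.data import XShards

# Hypothetical usage sketch of KerasEstimator.
keras_model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="softmax", input_shape=(20,))
])
keras_model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])

est = KerasEstimator(keras_model, metrics=None,
                     model_dir="/tmp/keras_estimator", optimizer=None)
est.set_l2_norm_gradient_clipping(clip_norm=1.0)  # must be called before fit

# XShards of {'x': feature, 'y': label}, as described in the fit docstring.
shards = XShards.partition({"x": np.random.rand(100, 20).astype(np.float32),
                            "y": np.random.randint(0, 10, (100,))})
est.fit(shards, epochs=2, batch_size=32)
metrics = est.evaluate(shards, batch_size=32)
est.save_keras_model("/tmp/keras_estimator/model.h5")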