Beispiel #1
0
class KerasEstimator(Estimator):
    def __init__(self, keras_model, metrics, model_dir, optimizer):
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer,
                                                     Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            label_cols=None,
            validation_data=None,
            hard_code_batch_size=False,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True):
        """
        Train this keras model with train data.
        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
         label numpy arrays}
        If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
        as train data.
        :param hard_code_batch_size: whether hard code batch size for training. Default is False.
        :param session_config: tensorflow session configuration for training.
        Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
        Should be a zoo.orca.learn.trigger, like EveryEpoch(), SeveralIteration(num_iterations),etc.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            validation_data = validation_data.map(
                _standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data,
                             batch_size=batch_size,
                             batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=False,
                             shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(
            self.model.model,
            dataset,
            model_dir=self.model.model_dir,
            session_config=session_config,
            metrics=self.metrics,
            optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(
                clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(
                self.clip_min, self.clip_max)

        if self.load_checkpoint:
            self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                              self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(
                self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs),
                                   checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=False,
    ):
        """
        Predict input data
        :param data: data to be predicted.
        It can be XShards, Spark DataFrame, or tf.data.Dataset.
        If data is XShard, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes
         original columns plus 'prediction' column. The 'prediction' column can be FloatType,
         VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for" \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 hard_code_batch_size=False,
                 auto_shard_files=False):
        """
        Evaluate model.
        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
         label numpy arrays}
        If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature_cols: feature column names if train data is Spark DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        return self.model.model

    def save(self, model_path, overwrite=True):
        self.save_keras_model(model_path, overwrite=True)

    def clear_gradient_clipping(self):
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        self.model.load_weights(filepath, by_name)
Beispiel #2
0
class KerasEstimator(Estimator):
    def __init__(self, keras_model, metrics, model_dir, optimizer):
        self.model = KerasModel(keras_model, model_dir)
        self.load_checkpoint = False
        self.metrics = metrics
        self.tf_optimizer = None
        self.optimizer = optimizer
        from zoo.orca.learn.optimizers import Optimizer
        if self.optimizer is not None and isinstance(self.optimizer,
                                                     Optimizer):
            self.optimizer = self.optimizer.get_optimizer()
        self.log_dir = None
        self.app_name = None
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            feature_cols=None,
            label_cols=None,
            validation_data=None,
            session_config=None,
            checkpoint_trigger=None,
            auto_shard_files=True):
        """
        Train this keras model with train data.

        :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature,
        'y': label}, where feature(label) is a numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple]
        :param epochs: number of epochs to train.
        :param batch_size: total batch size for each iteration.
        :param feature_cols: feature column names if train data is Spark DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame.
        :param validation_data: validation data. Validation data type should be the same
        as train data.
        :param session_config: tensorflow session configuration for training.
        Should be object of tf.ConfigProto
        :param checkpoint_trigger: when to trigger checkpoint during training.
        Should be a zoo.orca.learn.trigger, like EveryEpoch(), SeveralIteration(num_iterations),etc.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"

        if isinstance(data, tf.data.Dataset):
            assert isinstance(data.element_spec, tuple), \
                "If data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"
            if validation_data is not None:
                assert isinstance(validation_data, tf.data.Dataset), \
                    "train data and validation data should be both tf.data.Dataset"
                assert isinstance(validation_data.element_spec, tuple), \
                    "If validation_data is tf.data.Dataset, each element should be " \
                    "(feature tensors, label tensor), where each feature/label tensor can be " \
                    "either a single tensor or a tuple of tensors"

        if checkpoint_trigger is not None:
            checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

        if is_tf_data_dataset(data):
            data = data.map(_standardize_keras_target_data)
            validation_data = validation_data.map(
                _standardize_keras_target_data)

        memory_type = OrcaContext.train_data_store
        dataset = to_dataset(data,
                             batch_size=batch_size,
                             batch_per_thread=-1,
                             validation_data=validation_data,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=False,
                             shuffle=True,
                             auto_shard_files=auto_shard_files,
                             memory_type=memory_type)

        self.tf_optimizer = TFOptimizer.from_keras(
            self.model.model,
            dataset,
            model_dir=self.model.model_dir,
            session_config=session_config,
            metrics=self.metrics,
            optimizer=self.optimizer)

        if self.clip_norm:
            self.tf_optimizer.set_gradient_clipping_by_l2_norm(
                clip_norm=self.clip_norm)
        if self.clip_min and self.clip_max:
            self.tf_optimizer.set_constant_gradient_clipping(
                self.clip_min, self.clip_max)

        if self.load_checkpoint:
            self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                              self.checkpoint_version)

        if self.log_dir and self.app_name:
            self.tf_optimizer.estimator.set_tensorboard(
                self.log_dir, self.app_name)

        self.tf_optimizer.optimize(MaxEpoch(epochs),
                                   checkpoint_trigger=checkpoint_trigger)

        return self

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted.
        It can be XShards, Spark DataFrame, or tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature}, where feature is a
        numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes
         original columns plus 'prediction' column. The 'prediction' column can be FloatType,
         VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for" \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 label_cols=None,
                 auto_shard_files=False):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature, 'y': label}, where
        feature(label) is a numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple]
        :param batch_size: batch size per thread.
        :param feature_cols: feature_cols: feature column names if train data is Spark DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             label_cols=label_cols,
                             hard_code_batch_size=False,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files)

        return self.model.evaluate(dataset, batch_per_thread=batch_size)

    def save_keras_model(self, path, overwrite=True):
        """
        Save tensorflow keras model in this estimator.

        :param path: keras model save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        """
        self.model.save_model(path, overwrite=overwrite)

    def get_model(self):
        """
        Get the trained Keras model

        :return: The trained Keras model
        """
        return self.model.model

    def save(self, model_path, overwrite=True):
        """
        Save model to model_path

        :param model_path: path to save the trained model.
        :param overwrite: Whether to silently overwrite any existing file at the target location.

        :return:
        """
        self.save_keras_model(model_path, overwrite=overwrite)

    def clear_gradient_clipping(self):
        """
        Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
        In order to take effect, it needs to be called before fit.

        :return:
        """
        self.clip_norm = None
        self.clip_min = None
        self.clip_max = None

    def set_constant_gradient_clipping(self, min, max):
        """
        Set constant gradient clipping during the training process.
        In order to take effect, it needs to be called before fit.

        :param min: The minimum value to clip by.
        :param max: The maximum value to clip by.
        :return:
        """
        assert min > 0, "clip value should be larger than 0"
        assert min < max, "clip max should be larger than clip min"
        self.clip_min = min
        self.clip_max = max

    def set_l2_norm_gradient_clipping(self, clip_norm):
        """
        Clip gradient to a maximum L2-Norm during the training process.
        In order to take effect, it needs to be called before fit.

        :param clip_norm: Gradient L2-Norm threshold.
        :return:
        """
        self.clip_norm = clip_norm

    def save_keras_weights(self, filepath, overwrite=True, save_format=None):
        """
        Save tensorflow keras model weights in this estimator.

        :param filepath: keras model weights save path.
        :param overwrite: Whether to silently overwrite any existing file at the target location.
        :param save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or
            '.keras' will default to HDF5 if `save_format` is `None`. Otherwise
            `None` defaults to 'tf'.
        """
        self.model.save_weights(filepath, overwrite, save_format)

    def load_keras_weights(self, filepath, by_name=False):
        """
        Save tensorflow keras model in this estimator.

        :param filepath: keras model weights save path.
        :param by_name: Boolean, whether to load weights by name or by topological
            order. Only topological loading is supported for weight files in
            TensorFlow format.
        """
        self.model.load_weights(filepath, by_name)