Example #1
    def predict(self, data, batch_size=4, feature_cols=None):
        """
        Predict input data.

        :param data: data to be predicted. It can be an XShards or a Spark DataFrame.
               If it is an XShards, each partition is a dictionary of
               {'x': feature}, where feature is a numpy array or a list of numpy arrays.
        :param batch_size: batch size used for inference.
        :param feature_cols: Feature column name(s) of data. Only used when data
               is a Spark DataFrame. Default: None.
        :return: predicted result.
                 If data is an XShards, the result is an XShards, and each partition is a
                 dictionary of {'prediction': result}, where result is a numpy array or a
                 list of numpy arrays.
                 If data is a Spark DataFrame, the result is a DataFrame which includes the
                 original columns plus a 'prediction' column.
        """
        from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
        if isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            data_rdd = data.rdd.flatMap(xshard_to_sample)

        elif isinstance(data, DataFrame):
            schema = data.schema
            data_rdd = data.rdd.map(
                lambda row: row_to_sample(row, schema, feature_cols, None))
        else:
            raise ValueError(
                "Data should be an XShards or a Spark DataFrame, but got " +
                data.__class__.__name__)
        predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)

        if isinstance(data, SparkXShards):
            result = convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
        return result
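For reference, a call to this method might look like the following. This is a hedged sketch, not from the source: est stands for an already-fitted Orca estimator, and the XShards construction assumes the zoo.orca data API is available.

    import numpy as np
    from zoo.orca.data import XShards  # assumed import path

    # XShards path: each partition holds {'x': a feature numpy array}
    shards = XShards.partition({"x": np.random.rand(100, 10).astype(np.float32)})
    pred_shards = est.predict(shards, batch_size=32)  # est is a hypothetical fitted estimator
    # each partition of pred_shards is now {'prediction': a numpy array}

    # Spark DataFrame path: feature_cols must name the input column(s)
    # pred_df = est.predict(df, batch_size=32, feature_cols=["features"])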
Example #2
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=False,
    ):
        """
        Predict input data
        :param data: data to be predicted. It can be XShards, Spark DataFrame.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes original
         columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
         or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "outputs is None; it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature_cols is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd
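TFNet.from_session expects flat lists of input and output tensors, while self.inputs and self.outputs may be arbitrarily nested structures; that is what the nest.flatten calls above are for. The following stand-alone sketch only mirrors the flattening behavior for plain Python containers (an illustration, not the library's nest implementation):

    def flatten(structure):
        # depth-first flatten of dicts (by sorted key), lists and tuples
        if isinstance(structure, dict):
            return [leaf for key in sorted(structure) for leaf in flatten(structure[key])]
        if isinstance(structure, (list, tuple)):
            return [leaf for item in structure for leaf in flatten(item)]
        return [structure]

    print(flatten({"a": [1, 2], "b": (3, [4, 5])}))  # -> [1, 2, 3, 4, 5]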
Example #3
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted.
        It can be XShards, Spark DataFrame, or tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature}, where feature is a
        numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes
         original columns plus 'prediction' column. The 'prediction' column can be FloatType,
         VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature_cols is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd
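A hedged usage sketch of the DataFrame path (all names are illustrative: spark is an active SparkSession and est a fitted estimator exposing this method):

    df = spark.createDataFrame(
        [(i, [float(i)] * 10) for i in range(8)],
        ["id", "features"])
    # feature_cols is required for the DataFrame path
    pred_df = est.predict(df, batch_size=4, feature_cols=["features"])
    pred_df.select("id", "prediction").show()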
Example #4
    def test_convert_predict_rdd_to_xshard(self):
        rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)
        pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
        result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
        result = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])

        assert np.array_equal(result, expected_result)
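The test above relies on a chunks helper that is not shown; a minimal sketch of a compatible implementation (an assumption, not necessarily the library's actual helper) is:

    from itertools import islice

    def chunks(iterator, n):
        # yield successive lists of up to n items drawn from an iterator
        iterator = iter(iterator)
        while True:
            batch = list(islice(iterator, n))
            if not batch:
                return
            yield batch

With this definition, mapPartitions(lambda iter: chunks(iter, 5)) groups each partition's rows into lists of up to five arrays, which np.stack then turns into one (k, 50) array per shard.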
Example #5
    def predict(self, data, batch_size=4, feature_cols="features", sample_preprocessing=None):
        if isinstance(data, DataFrame):
            if isinstance(feature_cols, list):
                data, _, feature_cols = \
                    BigDLEstimator._combine_cols(data, feature_cols, col_name="features")
            self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols)
            if sample_preprocessing is not None:
                self.nn_model.setSamplePreprocessing(sample_preprocessing)
            return self.nn_model.transform(data)
        elif isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
            sample_rdd = data.rdd.flatMap(xshard_to_sample)
            result_rdd = self.model.predict(sample_rdd)
            return convert_predict_rdd_to_xshard(data, result_rdd)
        else:
            raise ValueError("Data should be XShards or Spark DataFrame, but got " +
                             data.__class__.__name__)
Example #6
    def predict(self,
                data,
                batch_size=4,
                feature_cols="features",
                sample_preprocessing=None):
        """
        Predict input data

        :param data: predict input data. It can be XShards or Spark DataFrame.
        If data is XShards, each partition is a dictionary of  {'x': feature}, where feature
        is a numpy array or a list of numpy arrays.
        :param batch_size: Batch size used for inference. Default: 4.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
        DataFrame. Default: "features".
        :param sample_preprocessing: Used when data is a Spark DataFrame. If the user want change
        the default feature_preprocessing specified in Estimator.from_bigdl, the user can pass the
        new sample_preprocessing methods.
        :return: predicted result.
        If input data is Spark DataFrame, the predict result is a DataFrame which includes original
         columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
         or Array of VectorUDT depending on model outputs shape.
        If input data is an XShards, the predict result is a XShards, each partition of the XShards
        is a dictionary of {'prediction': result}, where result is a numpy array or a list of numpy
        arrays.
        """
        if isinstance(data, DataFrame):
            if isinstance(feature_cols, list):
                data, _, feature_cols = \
                    BigDLEstimator._combine_cols(data, feature_cols, col_name="features")
            self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols)
            if sample_preprocessing is not None:
                self.nn_model.setSamplePreprocessing(sample_preprocessing)
            return self.nn_model.transform(data)
        elif isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
            sample_rdd = data.rdd.flatMap(xshard_to_sample)
            result_rdd = self.model.predict(sample_rdd)
            return convert_predict_rdd_to_xshard(data, result_rdd)
        else:
            raise ValueError(
                "Data should be XShards or Spark DataFrame, but got " +
                data.__class__.__name__)
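As a quick illustration of the fallback branch (hypothetical setup; bigdl_est stands for an estimator created via Estimator.from_bigdl):

    import numpy as np

    try:
        bigdl_est.predict(np.zeros((4, 2)))  # neither an XShards nor a DataFrame
    except ValueError as e:
        print(e)  # Data should be XShards or Spark DataFrame, but got ndarray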
Example #7
    def predict(self, data, batch_size=4, feature_cols=None):
        from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
        if isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            data_rdd = data.rdd.flatMap(xshard_to_sample)

        elif isinstance(data, DataFrame):
            schema = data.schema
            data_rdd = data.rdd.map(
                lambda row: row_to_sample(row, schema, feature_cols, None))
        else:
            raise ValueError(
                "Data should be an XShards or a Spark DataFrame, but got " +
                data.__class__.__name__)
        predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)

        if isinstance(data, SparkXShards):
            result = convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
        return result
Example #8
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards, Spark DataFrame.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame
        or XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is a XShards, each
                 partition of the XShards is a dictionary of {'prediction': result}, where the
                 result is a numpy array or a list of numpy arrays.
                 If input data is Spark DataFrame, the predict result is a DataFrame which includes
                 original columns plus 'prediction' column. The 'prediction' column can be
                 FloatType, VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "outputs is None; it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature_cols is None; it should not be None in prediction"
        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature_cols is None; it should not be None in prediction"
                data = process_xshards_of_pandas_dataframe(data, feature_cols)

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd
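A hedged sketch of the pandas-backed XShards path introduced in this variant (assumes zoo.orca is installed and an OrcaContext has been initialized; the read_csv import path is taken from the zoo.orca data API and the column names are made up):

    import zoo.orca.data.pandas as orca_pd

    # each partition of shards is a pandas DataFrame
    shards = orca_pd.read_csv("data/*.csv")
    # feature_cols is mandatory here so the pandas columns can be turned into features
    preds = est.predict(shards, batch_size=16, feature_cols=["f1", "f2"])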