Python convert_predict_rdd_to_dataframe Examples, zoo.orca.learn.utils.convert_predict_rdd_to_dataframe Python Examples

Example #1

0

Show file

File: test_estimator_keras_for_spark.py Project: zzti-bsj/analytics-zoo

    def test_convert_predict_list_of_array(self):
        tf.reset_default_graph()

        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        rdd = sc.parallelize([(1, 2, 3), (4, 5, 6), (7, 8, 9)])
        df = rdd.toDF(["feature", "label", "c"])
        predict_rdd = df.rdd.map(lambda row: [np.array([1, 2]), np.array(0)])
        resultDF = convert_predict_rdd_to_dataframe(df, predict_rdd)
        resultDF.printSchema()
        print(resultDF.collect()[0])
        predict_rdd = df.rdd.map(lambda row: np.array(1))
        resultDF = convert_predict_rdd_to_dataframe(df, predict_rdd)
        resultDF.printSchema()
        print(resultDF.collect()[0])

Example #2

0

Show file

    def predict(self, data, batch_size=4, feature_cols=None):
        """
        Predict input data.

        :param data: data to be predicted. It can be an XShards or a Spark Dataframe.
               If it is an XShards, each partition is a dictionary of
               {'x': feature}, where feature is a numpy array or a list of numpy arrays.
        :param batch_size: batch size used for inference.
        :param feature_cols: Feature column name(s) of data. Only used when data
               is a Spark DataFrame. Default: None.
        :return: predicted result. The predict result is a XShards, each partition of the XShards
                 is a dictionary of {'prediction': result}, where result is a numpy array or a list
                 of numpy arrays.
        """
        from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
        if isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            data_rdd = data.rdd.flatMap(xshard_to_sample)

        elif isinstance(data, DataFrame):
            schema = data.schema
            data_rdd = data.rdd.map(
                lambda row: row_to_sample(row, schema, feature_cols, None))
        else:
            raise ValueError(
                "Data should be XShards, each element needs to be {'x': a feature "
                "numpy array}.")
        predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)

        if isinstance(data, SparkXShards):
            result = convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
        return result

Example #3

0

Show file

File: estimator.py Project: leonardozcm/analytics-zoo

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=False,
    ):
        """
        Predict input data
        :param data: data to be predicted. It can be XShards, Spark DataFrame.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes original
         columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
         or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for" \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

Example #4

0

Show file

 def test_convert_predict_rdd_to_dataframe(self):
     rdd = self.sc.range(0, 100)
     df = rdd.map(lambda x: ([float(x)] * 50,
                             [int(np.random.randint(0, 2, size=()))])).toDF(
                                 ["feature", "label"])
     pred_rdd = rdd.map(lambda x: np.array([float(x)] * 50))
     result_df = convert_predict_rdd_to_dataframe(df, pred_rdd)
     expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
     assert result_df.selectExpr(expr).first()["error"] == 0

Example #5

0

Show file

File: estimator.py Project: gjlee0802/analytics-zoo

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted.
        It can be XShards, Spark DataFrame, or tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature}, where feature is a
        numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes
         original columns plus 'prediction' column. The 'prediction' column can be FloatType,
         VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for" \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

Example #6

0

Show file

File: estimator.py Project: leonardozcm/analytics-zoo

    def predict(self, data, batch_size=4, feature_cols=None):
        from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
        if isinstance(data, SparkXShards):
            from zoo.orca.data.utils import xshard_to_sample
            data_rdd = data.rdd.flatMap(xshard_to_sample)

        elif isinstance(data, DataFrame):
            schema = data.schema
            data_rdd = data.rdd.map(
                lambda row: row_to_sample(row, schema, feature_cols, None))
        else:
            raise ValueError(
                "Data should be XShards, each element needs to be {'x': a feature "
                "numpy array}.")
        predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)

        if isinstance(data, SparkXShards):
            result = convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
        return result

Example #7

0

Show file

File: estimator.py Project: fuckyouNoob/analytics-zoo

    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards, Spark DataFrame.
               If data is XShards, each partition can be Pandas Dataframe or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame
        or XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
                 If input data is XShards or tf.data.Dataset, the predict result is a XShards, each
                 partition of the XShards is a dictionary of {'prediction': result}, where the
                 result is a numpy array or a list of numpy arrays.
                 If input data is Spark DataFrame, the predict result is a DataFrame which includes
                 original columns plus 'prediction' column. The 'prediction' column can be
                 FloatType, VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                assert feature_cols is not None, \
                    "feature columns is None; it should not be None in prediction"
                data = process_xshards_of_pandas_dataframe(data, feature_cols)

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for" \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd

Example #8

0

Show file

File: estimator.py Project: yangw1234/analytics-zoo

    def predict(self, data, feature_cols=None):
        """
        Predict input data

        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
               arrays are supported. If data is XShards, each partition is a dictionary of  {'x':
               feature}, where feature(label) is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is XShards, the predict result is a XShards, each partition
                 of the XShards is a dictionary of {'prediction': result}, where the result is a
                 numpy array or a list of numpy arrays.
                 If the input data is numpy arrays or list of numpy arrays, the predict result is
                 a numpy array or a list of numpy arrays.
        """
        from pyspark.sql import DataFrame

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be an dict"
            assert "x" in dict_data, "key x should in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the second dim) should be less than the model " \
                    "batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "second dim) should be less than the " \
                                                        "model batch size, otherwise some inputs " \
                                                        "will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return feature_data

        sc = init_nncontext()

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def delete_useless_result(data):
                shard, y = data
                data_length = len(shard["x"])
                return y[:data_length]

            result_rdd = xshards.rdd.zip(result_rdd).map(delete_useless_result)
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def update_shard(data):
                shard, y = data
                data_length = len(shard["x"])
                shard["prediction"] = y[:data_length]
                return shard

            return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / self.batch_size)
                arrays = np.array_split(data, split_num)
                data_length_list = list(map(lambda arr: len(arr), arrays))
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / self.batch_size)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)
                        data_length_list = list(
                            map(lambda arr: len(arr), x_part))

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            for i in range(0, len(result_arr_list)):
                result_arr_list[i] = result_arr_list[i][:data_length_list[i]]
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                "ays are supported as input data, but get " +
                data.__class__.__name__)

Example #9

0

Show file

    def predict(self, data, feature_cols=None, batch_size=4):
        """
        Predict input data

        :param batch_size: Int. Set batch Size, default is 4.
        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
               arrays are supported. If data is XShards, each partition is a dictionary of  {'x':
               feature}, where feature(label) is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is XShards, the predict result is a XShards, each partition
                 of the XShards is a dictionary of {'prediction': result}, where the result is a
                 numpy array or a list of numpy arrays.
                 If the input data is numpy arrays or list of numpy arrays, the predict result is
                 a numpy array or a list of numpy arrays.
        """
        sc = init_nncontext()
        model_bytes_broadcast = sc.broadcast(self.model_bytes)
        weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

        def partition_inference(partition):
            model_bytes = model_bytes_broadcast.value
            weight_bytes = weight_bytes_broadcast.value
            partition = list(partition)
            data_num = len(partition)
            ie = IECore()
            config = {'CPU_THREADS_NUM': str(self.core_num)}
            ie.set_config(config, 'CPU')
            net = ie.read_network(model=model_bytes,
                                  weights=weight_bytes,
                                  init_from_buffer=True)
            net.batch_size = batch_size
            local_model = ie.load_network(network=net,
                                          device_name="CPU",
                                          num_requests=data_num)
            inputs = list(iter(local_model.requests[0].input_blobs))
            outputs = list(iter(local_model.requests[0].output_blobs))
            assert len(
                outputs) != 0, "The number of model outputs should not be 0."

            def add_elem(d):
                d_len = len(d)
                if d_len < batch_size:
                    rep_time = [1] * (d_len - 1)
                    rep_time.append(batch_size - d_len + 1)
                    return np.repeat(d, rep_time, axis=0), d_len
                else:
                    return d, d_len

            results = []
            for idx, batch_data in enumerate(partition):
                infer_request = local_model.requests[idx]
                input_dict = dict()
                elem_num = 0
                if isinstance(batch_data, list):
                    for i, input in enumerate(inputs):
                        input_dict[input], elem_num = add_elem(batch_data[i])
                else:
                    input_dict[inputs[0]], elem_num = add_elem(batch_data)
                infer_request.infer(input_dict)
                if len(outputs) == 1:
                    results.append(infer_request.output_blobs[
                        outputs[0]].buffer[:elem_num])
                else:
                    results.append(
                        list(
                            map(
                                lambda output: infer_request.output_blobs[
                                    output].buffer[:elem_num], outputs)))

            return results

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be an dict"
            assert "x" in dict_data, "key x should in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the second dim) should be less than the model " \
                    "batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "second dim) should be less than the " \
                                                        "model batch size, otherwise some inputs " \
                                                        "will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return feature_data

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))

            def update_result_shard(data):
                shard, y = data
                shard["prediction"] = y
                return shard

            return SparkXShards(
                data.rdd.zip(result_rdd).map(update_result_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / batch_size)
                arrays = np.array_split(data, split_num)
                num_slices = min(split_num, self.node_num)
                data_rdd = sc.parallelize(arrays, numSlices=num_slices)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / batch_size)
                num_slices = min(split_num, self.node_num)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

            print("Partition number: ", data_rdd.getNumPartitions())
            result_rdd = data_rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            result_arr_list = result_rdd.collect()
            result_arr = None
            if isinstance(result_arr_list[0], list):
                result_arr = [
                    np.concatenate([r[i] for r in result_arr_list], axis=0)
                    for i in range(len(result_arr_list[0]))
                ]
            elif isinstance(result_arr_list[0], np.ndarray):
                result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                "ays are supported as input data, but get " +
                data.__class__.__name__)