Beispiel #1
0
    def predict(self, data, batch_size=32):
        """
        Run prediction on ``data`` using this object's session, inputs and outputs.

        :param data: data to be predicted; handed unchanged to ``_to_dataset``.
        :param batch_size: forwarded to ``_to_dataset`` as ``batch_per_thread``.
        :return: the result of ``TFNet.predict`` on the converted dataset.
        """
        assert self.outputs is not None, (
            "output is None, it should not be None in prediction"
        )

        # batch_size is fixed to -1 here; only the per-thread batch size is set.
        predict_dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

        # TFNet.from_session is given the flattened inputs and outputs.
        input_tensors = nest.flatten(self.inputs)
        output_tensors = nest.flatten(self.outputs)
        net = TFNet.from_session(
            sess=self.sess,
            inputs=input_tensors,
            outputs=output_tensors,
        )
        return net.predict(predict_dataset)
Beispiel #2
0
 def test_tfdataset_with_string_rdd(self):
     """A TFNet built around tf.string_to_number converts a string RDD to numbers."""
     # Build the dataset from an RDD holding two numeric strings.
     rdd_of_strings = self.sc.parallelize(["123", "456"], 1)
     dataset = TFDataset.from_string_rdd(rdd_of_strings, batch_per_thread=1)

     # Graph: string placeholder -> string_to_number.
     strings_in = tf.placeholder(dtype=tf.string, shape=(None, ))
     numbers_out = tf.string_to_number(strings_in)
     with tf.Session() as session:
         net = TFNet.from_session(session,
                                  inputs=[strings_in],
                                  outputs=[numbers_out])

     # Prediction runs after the session context has been exited.
     predictions = net.predict(dataset).collect()
     assert predictions[0] == 123
     assert predictions[1] == 456
Beispiel #3
0
    def predict(self,
                data,
                batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False):
        """
        Run prediction on the given data.

        :param data: the data to predict on; XShards, Spark DataFrame or tf.data.Dataset.
         For XShards, each element must be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
         For tf.data.Dataset, each element is a tuple of input tensors.
        :param batch_size: batch size per thread.
        :param feature_cols: list of feature column names; must be given when data is a
         Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for prediction.
         Defaults to False.
        :return: the prediction result.
         For XShards or tf.data.Dataset input, an XShards whose elements follow the schema
         {'prediction': predicted numpy array or list of predicted numpy arrays}.
         For Spark DataFrame input, a DataFrame with the original columns plus a
         'prediction' column, which can be FloatType, VectorUDT or Array of VectorUDT
         depending on the model outputs shape.
         For any other input, the raw result of ``TFNet.predict``.
        """
        assert self.outputs is not None, (
            "output is None, it should not be None in prediction"
        )

        # The DataFrame check is needed twice (validation and result conversion).
        data_is_dataframe = isinstance(data, DataFrame)
        if data_is_dataframe:
            assert feature_cols is not None, (
                "feature columns is None; it should not be None in prediction"
            )

        # Prediction-only dataset: no labels, no validation data, no shuffling.
        dataset_options = dict(
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
        )
        predict_dataset = to_dataset(data, **dataset_options)

        input_tensors = nest.flatten(self.inputs)
        output_tensors = nest.flatten(self.outputs)
        net = TFNet.from_session(sess=self.sess,
                                 inputs=input_tensors,
                                 outputs=output_tensors)
        predictions = net.predict(predict_dataset)

        # Convert the raw result according to the type of the input.
        if data_is_dataframe:
            return convert_predict_to_dataframe(data, predictions)
        if isinstance(data, SparkXShards) or isinstance(data, tf.data.Dataset):
            return convert_predict_to_xshard(predictions)
        return predictions
Beispiel #4
0
    def predict(self, data, batch_size=32):
        """
        Run prediction on ``data`` using this object's session, inputs and outputs.

        :param data: a SparkXShards or an orca.data.tf.Dataset.
        :param batch_size: used as ``batch_per_thread`` when building the dataset.
        :return: the result of ``TFNet.predict`` on the converted dataset.
        :raises ValueError: if data is neither a SparkXShards nor a Dataset.
        """
        assert self.outputs is not None, (
            "output is None, it should not be None in prediction"
        )

        # Pick the dataset wrapper that matches the input type.
        if isinstance(data, SparkXShards):
            predict_dataset = _xshards_to_tf_dataset(
                data, batch_per_thread=batch_size)
        elif isinstance(data, Dataset):
            predict_dataset = TFDataDataset2(
                data, batch_size=-1, batch_per_thread=batch_size)
        else:
            raise ValueError("data must be a SparkXShards or an orca.data.tf.Dataset")

        input_tensors = nest.flatten(self.inputs)
        output_tensors = nest.flatten(self.outputs)
        net = TFNet.from_session(
            sess=self.sess,
            inputs=input_tensors,
            outputs=output_tensors,
        )
        return net.predict(predict_dataset)
Beispiel #5
0
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        auto_shard_files=False,
    ):
        """
        Predict input data

        :param data: data to be predicted. It can be XShards or Spark DataFrame.
               If data is XShards, each partition can be a Pandas DataFrame or a dictionary of
               {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
               tf.data.Dataset is not accepted by this method.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names; required if input data is a
               Spark DataFrame or XShards of Pandas DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based
               and apply sharding on files, otherwise sharding on records. Default is False.
        :return: predicted result.
                 If input data is XShards, the predict result is a XShards, each
                 partition of the XShards is a dictionary of {'prediction': result}, where the
                 result is a numpy array or a list of numpy arrays.
                 If input data is Spark DataFrame, the predict result is a DataFrame which includes
                 original columns plus 'prediction' column. The 'prediction' column can be
                 FloatType, VectorUDT or Array of VectorUDT depending on model outputs shape.
                 For any other accepted input, the raw result of ``TFNet.predict`` is returned.
        :raises AssertionError: if ``self.outputs`` is None, if ``feature_cols`` is missing
                 for DataFrame-based input, or if data is a tf.data.Dataset.
        """

        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
        if isinstance(data, SparkXShards) \
                and data._get_class_name() == 'pandas.core.frame.DataFrame':
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
            # From here on ``data`` refers to the processed XShards.
            data = process_xshards_of_pandas_dataframe(data, feature_cols)

        # The two literals are concatenated, so the first one must end with a
        # space; otherwise the message reads "...used forestimator prediction".
        assert not is_tf_data_dataset(data), \
            "tf.data.Dataset currently cannot be used for " \
            "estimator prediction"

        # Prediction-only dataset: no labels, no validation data, no shuffling.
        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)

        # Convert the raw result according to the type of the input.
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd
if __name__ == '__main__':

    sparkConf = init_spark_conf().setAppName("testNNClassifer").setMaster('local[1]')
    sc = init_nncontext(sparkConf)
    spark = SparkSession \
        .builder \
        .getOrCreate()

    with tf.Graph().as_default():
        input1 = tf.placeholder(dtype=tf.float32, shape=(None, 2))
        hidden = tf.layers.dense(input1, 4)
        output = tf.sigmoid(tf.layers.dense(hidden, 1))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            net = TFNet.from_session(sess, [input1], [output], generate_backward=True)

    df = spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0),
         (Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0)],
        ["features", "label"])

    print("before training:")
    NNModel(net).transform(df).show()

    classifier = NNClassifier(net, MSECriterion()) \
        .setBatchSize(4) \
        .setOptimMethod(Adam()) \
        .setLearningRate(0.1) \