def test_composite_udf(self):
        """Composite Keras Image UDF registration.

        Runs InceptionV3 over the same image paths in three different ways and
        checks that all of them yield matching predictions:

        1. chaining the loader UDF and the model UDF by hand in SQL,
        2. registering a pipelined UDF that combines both stages,
        3. materializing an image table first and applying the model UDF to it.
        """
        # The returned DataFrame is not used directly, so it is not bound.
        # NOTE(review): presumably this call also registers the
        # `_test_image_paths_df` view queried below -- confirm in the helper.
        get_image_paths_df(self.sql)

        def keras_load_img(fpath):
            # Imports are local so the function is self-contained when shipped
            # as a UDF.
            from keras.preprocessing.image import load_img, img_to_array
            import numpy as np
            img = load_img(fpath, target_size=(299, 299))
            return img_to_array(img).astype(np.uint8)

        def pil_load_spimg(fpath):
            from PIL import Image
            import numpy as np
            img_arr = np.array(Image.open(fpath), dtype=np.uint8)
            # PIL is RGB, image schema is BGR => need to flip the channels
            return imageArrayToStruct(_reverseChannels(img_arr))

        def keras_load_spimg(fpath):
            # Keras loads image in RGB order, ImageSchema expects BGR => need to flip
            return imageArrayToStruct(_reverseChannels(keras_load_img(fpath)))

        # Load image with Keras and store it in our image schema
        JVMAPI.registerUDF('keras_load_spimg', keras_load_spimg,
                           ImageSchema.imageSchema['image'].dataType)
        JVMAPI.registerUDF('pil_load_spimg', pil_load_spimg,
                           ImageSchema.imageSchema['image'].dataType)

        # Register an InceptionV3 model
        registerKerasImageUDF("iv3_img_pred", InceptionV3(weights="imagenet"),
                              keras_load_img)

        run_sql = self.session.sql

        # Choice 1: manually chain the functions in SQL
        df1 = run_sql(
            "select iv3_img_pred(keras_load_spimg(fpath)) as preds from _test_image_paths_df"
        )
        preds1 = np.array(df1.select("preds").rdd.collect())

        # Choice 2: build a pipelined UDF and directly use it in SQL
        JVMAPI.registerPipeline("load_img_then_iv3_pred",
                                ["keras_load_spimg", "iv3_img_pred"])
        df2 = run_sql(
            "select load_img_then_iv3_pred(fpath) as preds from _test_image_paths_df"
        )
        preds2 = np.array(df2.select("preds").rdd.collect())

        # Choice 3: create the image tensor input table first and apply the Keras model
        df_images = run_sql(
            "select pil_load_spimg(fpath) as image from _test_image_paths_df")
        df_images.createOrReplaceTempView("_test_images_df")
        df3 = run_sql(
            "select iv3_img_pred(image) as preds from _test_images_df")
        preds3 = np.array(df3.select("preds").rdd.collect())

        # assertEqual reports both lengths on failure, unlike assertTrue(a == b).
        self.assertEqual(len(preds1), len(preds2))
        np.testing.assert_allclose(preds1, preds2)
        np.testing.assert_allclose(preds2, preds3)
    def test_composite_udf(self):
        """Composite Keras Image UDF registration.

        Runs InceptionV3 over the same image paths in three different ways and
        checks that all of them yield matching predictions:

        1. chaining the loader UDF and the model UDF by hand in SQL,
        2. registering a pipelined UDF that combines both stages,
        3. materializing an image table first and applying the model UDF to it.
        """
        # The returned DataFrame is not used directly, so it is not bound.
        # NOTE(review): presumably this call also registers the
        # `_test_image_paths_df` view queried below -- confirm in the helper.
        get_image_paths_df(self.sql)

        def keras_load_img(fpath):
            # Imports are local so the function is self-contained when shipped
            # as a UDF.
            from keras.preprocessing.image import load_img, img_to_array
            import numpy as np
            img = load_img(fpath, target_size=(299, 299))
            return img_to_array(img).astype(np.uint8)

        def pil_load_spimg(fpath):
            from PIL import Image
            import numpy as np
            img_arr = np.array(Image.open(fpath), dtype=np.uint8)
            # NOTE(review): the array is stored without any channel
            # reordering -- confirm this matches what the image schema expects.
            return imageArrayToStruct(img_arr)

        def keras_load_spimg(fpath):
            # Same pass-through as above: the Keras-loaded array is stored as is.
            return imageArrayToStruct(keras_load_img(fpath))

        # Load image with Keras and store it in our image schema
        JVMAPI.registerUDF('keras_load_spimg', keras_load_spimg, imageSchema)
        JVMAPI.registerUDF('pil_load_spimg', pil_load_spimg, imageSchema)

        # Register an InceptionV3 model
        registerKerasImageUDF("iv3_img_pred",
                              InceptionV3(weights="imagenet"),
                              keras_load_img)

        run_sql = self.session.sql

        # Choice 1: manually chain the functions in SQL
        df1 = run_sql("select iv3_img_pred(keras_load_spimg(fpath)) as preds from _test_image_paths_df")
        preds1 = np.array(df1.select("preds").rdd.collect())

        # Choice 2: build a pipelined UDF and directly use it in SQL
        JVMAPI.registerPipeline("load_img_then_iv3_pred", ["keras_load_spimg", "iv3_img_pred"])
        df2 = run_sql("select load_img_then_iv3_pred(fpath) as preds from _test_image_paths_df")
        preds2 = np.array(df2.select("preds").rdd.collect())

        # Choice 3: create the image tensor input table first and apply the Keras model
        df_images = run_sql("select pil_load_spimg(fpath) as image from _test_image_paths_df")
        df_images.createOrReplaceTempView("_test_images_df")
        df3 = run_sql("select iv3_img_pred(image) as preds from _test_images_df")
        preds3 = np.array(df3.select("preds").rdd.collect())

        # assertEqual reports both lengths on failure, unlike assertTrue(a == b).
        self.assertEqual(len(preds1), len(preds2))
        np.testing.assert_allclose(preds1, preds2)
        np.testing.assert_allclose(preds2, preds3)
def registerKerasImageUDF(udf_name,
                          keras_model_or_file_path,
                          preprocessor=None):
    """
    Create a Keras image model as a Spark SQL UDF.
    The UDF takes a column (formatted in :py:const:`sparkdl.image.imageIO.imageSchema`)
    and produces the output of the given Keras model (e.g.
    for `Inception V3 <https://keras.io/applications/#inceptionv3>`_
    it produces a real valued score vector over the ImageNet object categories).
    For other models, the output could have different meanings.
    Please consult the actual model's specification.

    .. deprecated::
        This function emits a :py:class:`DeprecationWarning` on every call and
        will be removed in the next release of sparkdl. Please use Pandas UDF
        for distributed model inference.

    The user can provide an existing model in Keras as follows.

    .. code-block:: python

        from keras.applications import InceptionV3
        registerKerasImageUDF("udf_name", InceptionV3(weights="imagenet"))

    To use a customized Keras model, we can save it and pass the file path as parameter.

    .. code-block:: python

        # Assume we have a compiled and trained Keras model
        model.save('path/to/my/model.h5')

        registerKerasImageUDF("my_custom_keras_model_udf", "path/to/my/model.h5")

    If further preprocessing steps are required to prepare the images,
    the user has the option to provide a preprocessing function :py:obj:`preprocessor`.
    The :py:obj:`preprocessor` converts a file path into an image array.
    This function is usually introduced in Keras workflow, as in the following example.

    .. warning:: There is a performance penalty to using a :py:obj:`preprocessor` as it
                 first converts the image into a file buffer and then reloads it.
                 This provides compatibility with the usual way Keras model inputs are
                 preprocessed.
                 Please consider directly using Keras/TensorFlow layers for this purpose.

    .. code-block:: python

        def keras_load_img(fpath):
            from keras.preprocessing.image import load_img, img_to_array
            import numpy as np
            img = load_img(fpath, target_size=(299, 299))
            return img_to_array(img).astype(np.uint8)

        registerKerasImageUDF("my_inception_udf", InceptionV3(weights="imagenet"), keras_load_img)


    If the `preprocessor` is not provided, we assume the function will be applied to
    a (struct) column encoded in [sparkdl.image.imageIO.imageSchema].
    The output will be a single (struct) column containing the resulting tensor data.

    When a `preprocessor` is provided, two helper UDFs named
    ``<udf_name>__preprocess`` and ``<udf_name>__model_predict`` are registered
    and ``udf_name`` itself is registered as the pipeline chaining them.

    :param udf_name: str, name of the UserDefinedFunction. If the name exists, it will be
                     overwritten.
    :param keras_model_or_file_path: str or KerasModel,
                                     either a path to the HDF5 Keras model file
                                     or an actual loaded Keras model
    :param preprocessor: function, optional, a function that
                         converts image file path to image tensor/ndarray
                         in the correct shape to be served as input to the Keras model
    :return: :py:class:`GraphFunction`, the graph function for the Keras image model
    """
    # stacklevel=2 attributes the warning to the caller of this function, so the
    # user sees their own call site instead of a line inside sparkdl.
    warnings.warn(
        "registerKerasImageUDF() will be removed in the next release of sparkdl. "
        "Please use Pandas UDF for distributed model inference.",
        DeprecationWarning,
        stacklevel=2)
    ordered_udf_names = []
    keras_udf_name = udf_name
    if preprocessor is not None:
        # Spill the image structure to file and reload it
        # with the user provided preprocessing function
        preproc_udf_name = '{}__preprocess'.format(udf_name)
        ordered_udf_names.append(preproc_udf_name)
        JVMAPI.registerUDF(preproc_udf_name,
                           _serialize_and_reload_with(preprocessor),
                           ImageSchema.imageSchema['image'].dataType)
        # The model stage gets its own name; `udf_name` is reserved for the
        # pipeline that chains preprocessing and prediction.
        keras_udf_name = '{}__model_predict'.format(udf_name)

    # Graph stages, in order: image struct conversion, the Keras model, flattening.
    stages = [('spimg', buildSpImageConverter('RGB', "uint8")),
              ('model', GraphFunction.fromKeras(keras_model_or_file_path)),
              ('final', buildFlattener())]
    gfn = GraphFunction.fromList(stages)

    with IsolatedSession() as issn:
        _, fetches = issn.importGraphFunction(gfn, prefix='')
        makeGraphUDF(issn.graph, keras_udf_name, fetches)
        ordered_udf_names.append(keras_udf_name)

    # More than one stage means a preprocessor was registered above.
    if len(ordered_udf_names) > 1:
        msg = "registering pipelined UDF {udf} with stages {udfs}"
        msg = msg.format(udf=udf_name, udfs=ordered_udf_names)
        logger.info(msg)
        JVMAPI.registerPipeline(udf_name, ordered_udf_names)

    return gfn
def registerKerasImageUDF(udf_name, keras_model_or_file_path, preprocessor=None):
    """
    Register a Keras image model as a Spark SQL UDF.

    The registered UDF consumes a column formatted according to
    :py:const:`sparkdl.image.imageIO.imageSchema` and yields the output of the
    supplied Keras model. For
    `Inception V3 <https://keras.io/applications/#inceptionv3>`_, for example,
    that output is a real valued score vector over the ImageNet object
    categories; for other models, consult the model's own specification.

    An already loaded Keras model can be passed directly.

    .. code-block:: python

        from keras.applications import InceptionV3
        registerKerasImageUDF("udf_name", InceptionV3(weights="imagenet"))

    A customized model can be saved first and then referenced by file path.

    .. code-block:: python

        # Assume we have a compiled and trained Keras model
        model.save('path/to/my/model.h5')

        registerKerasImageUDF("my_custom_keras_model_udf", "path/to/my/model.h5")

    When the images need extra preparation, a :py:obj:`preprocessor` function
    that turns a file path into an image array may be supplied, as is common in
    Keras workflows.

    .. warning:: Using a :py:obj:`preprocessor` carries a performance penalty: the
                 image is first converted into a file buffer and then reloaded.
                 This keeps compatibility with the usual way Keras model inputs are
                 preprocessed. Consider using Keras/TensorFlow layers directly instead.

    .. code-block:: python

        def keras_load_img(fpath):
            from keras.preprocessing.image import load_img, img_to_array
            import numpy as np
            from pyspark.sql import Row
            img = load_img(fpath, target_size=(299, 299))
            return img_to_array(img).astype(np.uint8)

        registerKerasImageUDF("my_inception_udf", InceptionV3(weights="imagenet"), keras_load_img)

    Without a `preprocessor`, the UDF is assumed to be applied to a (struct)
    column encoded in [sparkdl.image.imageIO.imageSchema], and its output is a
    single (struct) column containing the resulting tensor data.

    With a `preprocessor`, two helper UDFs named ``<udf_name>__preprocess`` and
    ``<udf_name>__model_predict`` are registered, and ``udf_name`` is registered
    as the pipeline chaining them.

    :param udf_name: str, name of the UserDefinedFunction. If the name exists, it will be
                     overwritten.
    :param keras_model_or_file_path: str or KerasModel,
                                     either a path to the HDF5 Keras model file
                                     or an actual loaded Keras model
    :param preprocessor: function, optional, a function that
                         converts image file path to image tensor/ndarray
                         in the correct shape to be served as input to the Keras model
    :return: :py:class:`GraphFunction`, the graph function for the Keras image model
    """
    pipeline_stage_names = []
    if preprocessor is None:
        # No preprocessing stage: the model UDF carries the requested name.
        model_udf_name = udf_name
    else:
        # Register a loader stage that spills the image structure to file and
        # reloads it through the user supplied preprocessing function.
        loader_udf_name = '{}__preprocess'.format(udf_name)
        reload_fn = _serialize_and_reload_with(preprocessor)
        image_struct_type = ImageSchema.imageSchema['image'].dataType
        JVMAPI.registerUDF(loader_udf_name, reload_fn, image_struct_type)
        pipeline_stage_names.append(loader_udf_name)
        # `udf_name` is kept for the pipeline, so the model stage is renamed.
        model_udf_name = '{}__model_predict'.format(udf_name)

    # Build the graph pieces in order: image struct converter, model, flattener.
    image_converter = buildSpImageConverter('RGB', "uint8")
    keras_graph = GraphFunction.fromKeras(keras_model_or_file_path)
    flattener = buildFlattener()
    gfn = GraphFunction.fromList([
        ('spimg', image_converter),
        ('model', keras_graph),
        ('final', flattener),
    ])

    with IsolatedSession() as session:
        _, output_tensors = session.importGraphFunction(gfn, prefix='')
        makeGraphUDF(session.graph, model_udf_name, output_tensors)
        pipeline_stage_names.append(model_udf_name)

    # A single stage needs no pipeline; the model UDF already has the final name.
    if len(pipeline_stage_names) <= 1:
        return gfn

    logger.info(
        "registering pipelined UDF {udf} with stages {udfs}".format(
            udf=udf_name, udfs=pipeline_stage_names))
    JVMAPI.registerPipeline(udf_name, pipeline_stage_names)
    return gfn