Example #1
    def test_model_cache(self):
        archive_path = SparkModelCache.add_local_model(self.spark,
                                                       self._model_path)
        assert archive_path != self._model_path

        # Ensure we can use the model locally.
        local_model = SparkModelCache.get_or_load(archive_path)
        assert local_model.__name__ == "ConstPyfunc"

        # Request the model on all executors, and see how many times we got cache hits.
        def get_model(_):
            model = SparkModelCache.get_or_load(archive_path)
            # NB: Can not use instanceof test as remote does not know about ConstPyfunc class
            assert model.__name__ == "ConstPyfunc"
            return SparkModelCache._cache_hits

        # This will run 30 distinct tasks, and we expect most to reuse an already-loaded model.
        # Note that we can't necessarily expect an even split, or even that there were only
        # exactly 2 python processes launched, due to Spark and its mysterious ways, but we do
        # expect significant reuse.
        results = self.spark.sparkContext.parallelize(range(
            0, 100), 30).map(get_model).collect()

        # TODO(tomas): Looks like spark does not reuse python workers with python==3.x
        assert sys.version[0] == '3' or max(results) > 10
        # Running again should see no newly-loaded models.
        results2 = self.spark.sparkContext.parallelize(range(
            0, 100), 30).map(get_model).collect()
        assert sys.version[0] == '3' or min(results2) > 0
Example #2
def predict(*args):
    model = SparkModelCache.get_or_load(archive_path)
    schema = {str(i): arg for i, arg in enumerate(args)}
    # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
    columns = [str(i) for i, _ in enumerate(args)]
    pdf = pandas.DataFrame(schema, columns=columns)
    result = model.predict(pdf)
    return pandas.Series(result)
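For context, a predict closure like the one above is normally wrapped in a scalar pandas UDF on the driver and applied to DataFrame columns. The lines below are only an illustrative sketch: df and the feature column names are hypothetical, and the declared return type must match what the wrapped model actually produces.

from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType

# Hypothetical wiring: wrap the closure as a scalar pandas UDF and score two
# feature columns of an existing Spark DataFrame named df.
predict_udf = pandas_udf(predict, returnType=DoubleType())
scored = df.withColumn("prediction", predict_udf("feature_0", "feature_1"))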
Example #3
    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        schema = {str(i): arg for i, arg in enumerate(args)}
        pdf = None
        for x in args:
            if type(x) == pandas.DataFrame:
                if len(args) != 1:
                    raise Exception(
                        "If passing a StructType column, there should be only one "
                        "input column, but got %d" % len(args))
                pdf = x
        if pdf is None:
            # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
            columns = [str(i) for i, _ in enumerate(args)]
            pdf = pandas.DataFrame(schema, columns=columns)
        result = model.predict(pdf)
        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        # Unwrap ArrayType so the dtype coercion below sees the requested element type.
        elem_type = result_type.elementType if isinstance(
            result_type, ArrayType) else result_type

        if type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)

        elif type(elem_type) == LongType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float64)

        if len(result.columns) == 0:
            raise MlflowException(
                message="The model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting udf with StringType or "
                "ArrayType(StringType).".format(str(elem_type)),
                error_code=INVALID_PARAMETER_VALUE)

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series([row[1].values for row in result.iterrows()])
        else:
            return result[result.columns[0]]
Example #4
def test_model_cache(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )

    archive_path = SparkModelCache.add_local_model(spark, model_path)
    assert archive_path != model_path

    # Define the model class name as a string so that each Spark executor can reference it
    # without attempting to resolve ConstantPyfuncWrapper, which is only available on the driver.
    constant_model_name = ConstantPyfuncWrapper.__name__

    def check_get_or_load_return_value(model_from_cache,
                                       model_path_from_cache):
        assert model_path_from_cache != model_path
        assert os.path.isdir(model_path_from_cache)
        model2 = mlflow.pyfunc.load_model(model_path_from_cache)
        for model in [model_from_cache, model2]:
            assert isinstance(model, PyFuncModel)
            # NB: Can not use instanceof test as remote does not know about ConstantPyfuncWrapper
            # class.
            assert type(model._model_impl).__name__ == constant_model_name

    # Ensure we can use the model locally.
    local_model, local_model_path = SparkModelCache.get_or_load(archive_path)

    check_get_or_load_return_value(local_model, local_model_path)

    # Request the model on all executors, and see how many times we got cache hits.
    def get_model(_):
        executor_model, executor_model_path = SparkModelCache.get_or_load(
            archive_path)
        check_get_or_load_return_value(executor_model, executor_model_path)
        return SparkModelCache._cache_hits

    # This will run 30 distinct tasks, and we expect most to reuse an already-loaded model.
    # Note that we can't necessarily expect an even split, or even that there were only
    # exactly 2 python processes launched, due to Spark and its mysterious ways, but we do
    # expect significant reuse.
    results = spark.sparkContext.parallelize(range(100),
                                             30).map(get_model).collect()
    assert max(results) > 10
    # Running again should see no newly-loaded models.
    results2 = spark.sparkContext.parallelize(range(100),
                                              30).map(get_model).collect()
    assert min(results2) > 0
Example #5
    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        schema = {features[i]: arg for i, arg in enumerate(args)}
        pdf = None
        for x in args:
            if type(x) == pandas.DataFrame:
                if len(args) != 1:
                    raise Exception(
                        "If passing a StructType column, there should be only one "
                        "input column, but got %d" % len(args))
                pdf = x
        if pdf is None:
            pdf = pandas.DataFrame(schema)
        result = model.predict(pdf)
        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        # Unwrap ArrayType so the dtype coercion below sees the requested element type.
        elem_type = result_type.elementType if isinstance(
            result_type, ArrayType) else result_type

        if type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)

        elif type(elem_type) == LongType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float64)

        if len(result.columns) == 0:
            raise ValueError(
                "The model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting udf with StringType or "
                "ArrayType(StringType).".format(str(elem_type)))

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series([row[1].values for row in result.iterrows()])
        else:
            return result[result.columns[0]]
Example #6
def test_model_cache(spark, model_path):
    mlflow.pyfunc.save_model(
        dst_path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )

    archive_path = SparkModelCache.add_local_model(spark, model_path)
    assert archive_path != model_path

    # Ensure we can use the model locally.
    local_model = SparkModelCache.get_or_load(archive_path)
    assert isinstance(local_model, ConstantPyfuncWrapper)

    # Define the model class name as a string so that each Spark executor can reference it
    # without attempting to resolve ConstantPyfuncWrapper, which is only available on the driver.
    constant_model_name = ConstantPyfuncWrapper.__name__

    # Request the model on all executors, and see how many times we got cache hits.
    def get_model(_):
        model = SparkModelCache.get_or_load(archive_path)
        # NB: Can not use instanceof test as remote does not know about ConstantPyfuncWrapper class.
        assert type(model).__name__ == constant_model_name
        return SparkModelCache._cache_hits

    # This will run 30 distinct tasks, and we expect most to reuse an already-loaded model.
    # Note that we can't necessarily expect an even split, or even that there were only
    # exactly 2 python processes launched, due to Spark and its mysterious ways, but we do
    # expect significant reuse.
    results = spark.sparkContext.parallelize(range(0, 100),
                                             30).map(get_model).collect()

    # TODO(tomas): Looks like spark does not reuse python workers with python==3.x
    assert sys.version[0] == '3' or max(results) > 10
    # Running again should see no newly-loaded models.
    results2 = spark.sparkContext.parallelize(range(0, 100),
                                              30).map(get_model).collect()
    assert sys.version[0] == '3' or min(results2) > 0
Example #7
    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        input_schema = model.metadata.get_input_schema()
        pdf = None

        for x in args:
            if type(x) == pandas.DataFrame:
                if len(args) != 1:
                    raise Exception(
                        "If passing a StructType column, there should be only one "
                        "input column, but got %d" % len(args))
                pdf = x
        if pdf is None:
            args = list(args)
            if input_schema is None:
                names = [str(i) for i in range(len(args))]
            else:
                names = input_schema.column_names()
                if len(args) > len(names):
                    args = args[:len(names)]
                if len(args) < len(names):
                    message = (
                        "Model input is missing columns. Expected {0} input columns {1},"
                        " but the model received only {2} unnamed input columns"
                        " (Since the columns were passed unnamed they are expected to be in"
                        " the order specified by the schema).".format(
                            len(names), names, len(args)))
                    raise MlflowException(message)
            pdf = pandas.DataFrame(
                data={names[i]: x
                      for i, x in enumerate(args)}, columns=names)

        result = model.predict(pdf)

        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        elem_type = result_type.elementType if isinstance(
            result_type, ArrayType) else result_type

        if type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)
        elif type(elem_type) == LongType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float64)

        if len(result.columns) == 0:
            raise MlflowException(
                message="The model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting udf with StringType or "
                "ArrayType(StringType).".format(str(elem_type)),
                error_code=INVALID_PARAMETER_VALUE,
            )

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series(result.to_numpy().tolist())
        else:
            return result[result.columns[0]]
Example #8
def predict(*args):
    model = SparkModelCache.get_or_load(archive_path)
    schema = {str(i): arg for i, arg in enumerate(args)}
    pdf = pandas.DataFrame(schema)
    result = model.predict(pdf)
    return pandas.Series(result)
Example #9
def get_model(_):
    model = SparkModelCache.get_or_load(archive_path)
    assert isinstance(model, PyFuncModel)
    # NB: Can not use instanceof test as remote does not know about ConstantPyfuncWrapper class.
    assert type(model._model_impl).__name__ == constant_model_name
    return SparkModelCache._cache_hits
Example #10
def get_model(_):
    executor_model, executor_model_path = SparkModelCache.get_or_load(archive_path)
    check_get_or_load_return_value(executor_model, executor_model_path)
    return SparkModelCache._cache_hits
Example #11
def get_model(_):
    model = SparkModelCache.get_or_load(archive_path)
    # NB: Can not use instanceof test as remote does not know about ConstPyfunc class
    assert model.__name__ == "ConstPyfunc"
    return SparkModelCache._cache_hits
Example #12
def get_model(_):
    model = SparkModelCache.get_or_load(archive_path)
    assert isinstance(model, KNeighborsClassifier)
    return SparkModelCache._cache_hits
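On the driver side, the get_model helpers above are exercised the same way as in the test_model_cache examples: run them across many tasks and inspect the per-task cache-hit counters. A minimal sketch, assuming an active spark session and an archive_path produced by SparkModelCache.add_local_model:

# Run get_model in 30 tasks over 100 dummy elements; each task reports
# SparkModelCache._cache_hits, so large values indicate model reuse.
hits = spark.sparkContext.parallelize(range(100), 30).map(get_model).collect()
assert max(hits) > 10  # most tasks should have reused an already-loaded model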
Example #13
def predict(*args):
    import pandas
    from mlflow.pyfunc.spark_model_cache import SparkModelCache
    from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
    # elem_type = IntegerType
    elem_type = return_type

    if isinstance(elem_type, ArrayType):
        elem_type = elem_type.elementType

    supported_types = [
        IntegerType, LongType, FloatType, DoubleType, StringType
    ]

    if not any([isinstance(elem_type, x) for x in supported_types]):
        raise MlflowException(
            message="Invalid result_type '{}'. Result type can only be one of or an array of one "
            "of the following types: {}".format(str(elem_type),
                                                str(supported_types)),
            error_code=INVALID_PARAMETER_VALUE)
    model = SparkModelCache.get_or_load(archive_path)
    # model = load_pyfunc(archive_path)
    schema = {str(i): arg for i, arg in enumerate(args)}
    # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
    columns = [str(i) for i, _ in enumerate(args)]
    pdf = pandas.DataFrame(schema, columns=columns)
    # model.predict(pdf)
    result = model.predict(pdf)
    if not isinstance(result, pandas.DataFrame):
        result = pandas.DataFrame(data=result)

    if type(elem_type) == IntegerType:
        result = result.select_dtypes(
            [np.byte, np.ubyte, np.short, np.ushort,
             np.int32]).astype(np.int32)

    elif type(elem_type) == LongType:
        result = result.select_dtypes(
            [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

    elif type(elem_type) == FloatType:
        result = result.select_dtypes(include=(np.number, )).astype(np.float32)

    elif type(elem_type) == DoubleType:
        result = result.select_dtypes(include=(np.number, )).astype(np.float64)

    if len(result.columns) == 0:
        raise MlflowException(
            message="The model did not produce any values compatible with the requested "
            "type '{}'. Consider requesting udf with StringType or "
            "ArrayType(StringType).".format(str(elem_type)),
            error_code=INVALID_PARAMETER_VALUE)

    if type(elem_type) == StringType:
        result = result.applymap(str)

    if type(return_type) == ArrayType:
        return pandas.Series([row[1].values for row in result.iterrows()])
    else:
        return result[result.columns[0]]
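The predict closures in these examples appear to be the worker-side half of MLflow's Spark UDF support; at the call site, that machinery is usually reached through mlflow.pyfunc.spark_udf rather than by using SparkModelCache directly. A rough, hypothetical usage sketch (the model URI and column names are placeholders):

import mlflow.pyfunc

# result_type selects which elem_type branch above performs the dtype coercion.
udf = mlflow.pyfunc.spark_udf(spark, "runs:/<run_id>/model", result_type="double")
scored = df.withColumn("prediction", udf("feature_0", "feature_1"))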