def predict(*args):
    """Score the inputs with the loaded model and coerce the output to ``result_type``.

    The model is obtained via ``SparkModelCache.get_or_load(archive_path)``;
    ``archive_path`` and ``result_type`` are read from the enclosing scope.

    Args:
        *args: Either a single ``pandas.DataFrame`` (used as the model input
            as-is), or any number of column values that are assembled into a
            DataFrame whose columns are named ``"0"``, ``"1"``, ... by position.

    Returns:
        pandas.Series: when ``result_type`` is an ``ArrayType``, one entry per
        result row holding that row's values; otherwise the first column of
        the (type-filtered) result.

    Raises:
        Exception: if a DataFrame is passed together with other arguments.
        MlflowException: if no result column is compatible with the requested
            element type.
    """
    model = SparkModelCache.get_or_load(archive_path)

    pdf = None
    for x in args:
        if isinstance(x, pandas.DataFrame):
            if len(args) != 1:
                raise Exception(
                    "If passing a StructType column, there should be only one "
                    "input column, but got %d" % len(args))
            pdf = x
    if pdf is None:
        # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
        columns = [str(i) for i in range(len(args))]
        pdf = pandas.DataFrame(dict(zip(columns, args)), columns=columns)

    result = model.predict(pdf)
    if not isinstance(result, pandas.DataFrame):
        result = pandas.DataFrame(data=result)

    # Element type the produced values must be compatible with: the array's
    # element type for ArrayType results, otherwise the result type itself.
    returns_array = isinstance(result_type, ArrayType)
    elem_type = result_type.elementType if returns_array else result_type

    # The dtype filtering must run for every result, including one that was
    # just wrapped into a DataFrame above, so this is its own `if` chain.
    if isinstance(elem_type, IntegerType):
        result = result.select_dtypes(
            include=[np.byte, np.ubyte, np.short, np.ushort, np.int32]
        ).astype(np.int32)
    elif isinstance(elem_type, LongType):
        # `np.int` / `np.long` were aliases of the builtin `int` and were
        # removed in NumPy 1.24, so the builtin is passed directly.
        result = result.select_dtypes(
            include=[np.byte, np.ubyte, np.short, np.ushort, int])
    elif isinstance(elem_type, FloatType):
        result = result.select_dtypes(include=(np.number, )).astype(np.float32)
    elif isinstance(elem_type, DoubleType):
        result = result.select_dtypes(include=(np.number, )).astype(np.float64)

    if len(result.columns) == 0:
        raise MlflowException(
            message="The model did not produce any values compatible with the requested "
                    "type '{}'. Consider requesting udf with StringType or "
                    "ArrayType(StringType).".format(str(elem_type)),
            error_code=INVALID_PARAMETER_VALUE)

    if isinstance(elem_type, StringType):
        result = result.applymap(str)

    if returns_array:
        return pandas.Series([row[1].values for row in result.iterrows()])
    return result[result.columns[0]]
def predict(*args):
    """Score the inputs with the loaded model and coerce the output to ``result_type``.

    The model is obtained via ``SparkModelCache.get_or_load(archive_path)``;
    ``archive_path`` and ``result_type`` are read from the enclosing scope.

    Args:
        *args: Either a single ``pandas.DataFrame`` (used as the model input
            as-is), or any number of unnamed column values. Unnamed columns
            are matched by position to the names returned by
            ``model.metadata.get_input_schema().column_names()``; surplus
            columns beyond the schema are dropped. When the model has no
            input schema (``None``) the columns are named ``"0"``, ``"1"``, ...

    Returns:
        pandas.Series: when ``result_type`` is an ``ArrayType``, one list per
        result row; otherwise the first column of the (type-filtered) result.

    Raises:
        Exception: if a DataFrame is passed together with other arguments.
        MlflowException: if fewer columns are passed than the input schema
            names, or if no result column is compatible with the requested
            element type.
    """
    model = SparkModelCache.get_or_load(archive_path)
    input_schema = model.metadata.get_input_schema()

    pdf = None
    for x in args:
        if isinstance(x, pandas.DataFrame):
            if len(args) != 1:
                raise Exception(
                    "If passing a StructType column, there should be only one "
                    "input column, but got %d" % len(args))
            pdf = x
    if pdf is None:
        args = list(args)
        if input_schema is None:
            names = [str(i) for i in range(len(args))]
        else:
            names = input_schema.column_names()
            if len(args) > len(names):
                # Extra trailing columns are ignored rather than rejected.
                args = args[:len(names)]
            if len(args) < len(names):
                message = (
                    "Model input is missing columns. Expected {0} input columns {1},"
                    " but the model received only {2} unnamed input columns"
                    " (Since the columns were passed unnamed they are expected to be in"
                    " the order specified by the schema).".format(
                        len(names), names, len(args)))
                raise MlflowException(message)
        # Passing `columns=names` pins the column order explicitly.
        pdf = pandas.DataFrame(data=dict(zip(names, args)), columns=names)

    result = model.predict(pdf)
    if not isinstance(result, pandas.DataFrame):
        result = pandas.DataFrame(data=result)

    # Decide once whether an array is requested so that the element-type
    # extraction and the shape of the return value cannot disagree.
    returns_array = isinstance(result_type, ArrayType)
    elem_type = result_type.elementType if returns_array else result_type

    if isinstance(elem_type, IntegerType):
        result = result.select_dtypes(
            include=[np.byte, np.ubyte, np.short, np.ushort, np.int32]
        ).astype(np.int32)
    elif isinstance(elem_type, LongType):
        # `np.int` / `np.long` were aliases of the builtin `int` and were
        # removed in NumPy 1.24, so the builtin is passed directly.
        result = result.select_dtypes(
            include=[np.byte, np.ubyte, np.short, np.ushort, int])
    elif isinstance(elem_type, FloatType):
        result = result.select_dtypes(include=(np.number, )).astype(np.float32)
    elif isinstance(elem_type, DoubleType):
        result = result.select_dtypes(include=(np.number, )).astype(np.float64)

    if len(result.columns) == 0:
        raise MlflowException(
            message="The model did not produce any values compatible with the requested "
                    "type '{}'. Consider requesting udf with StringType or "
                    "ArrayType(StringType).".format(str(elem_type)),
            error_code=INVALID_PARAMETER_VALUE,
        )

    if isinstance(elem_type, StringType):
        result = result.applymap(str)

    if returns_array:
        return pandas.Series(result.to_numpy().tolist())
    return result[result.columns[0]]