def codegen_from_yaml(
    spark: SparkSession,
    uri: str,
    name: Optional[str] = None,
    options: Optional[Dict[str, str]] = None,
) -> str:
    """Generate code from a YAML file.

    Parameters
    ----------
    spark : SparkSession
        A live Spark session.
    uri : str
        The model spec URI.
    name : str, optional
        The name of the model.
    options : dict, optional
        Optional parameters passed to the model.

    Returns
    -------
    str
        The name of the registered Spark UDF.
    """
    # FileModelSpec opens the URI itself (see its __init__ below),
    # so the URI is passed through rather than an open file object.
    spec = FileModelSpec(uri, options=options)
    udf = udf_from_spec(spec)
    return register_udf(spark, udf, name)
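# A minimal usage sketch. The spec URI, the "images" table and its "image"
# column are assumptions for illustration, not part of this section:
func_name = codegen_from_yaml(spark, "s3://bucket/specs/resnet50.yml")
spark.sql(f"SELECT {func_name}(image) AS prediction FROM images").show()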
def __init__(
    self,
    spec_uri: Union[str, Path],
    options: Optional[Dict[str, Any]] = None,
    validate: bool = True,
):
    with open_uri(spec_uri) as fobj:
        spec = yaml.load(fobj, Loader=yaml.FullLoader)
    self.base_dir = os.path.dirname(spec_uri)
    spec.setdefault("options", {})
    if options:
        spec["options"].update(options)
    super().__init__(spec, validate=validate)
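# A sketch of the YAML layout this constructor parses. The exact keys are
# assumptions inferred from the accessors used in this section (version,
# flavor, uri, schema, options, pre/post processing); the real spec format
# may differ:
#
#   version: 1.0
#   name: resnet50
#   model:
#     uri: s3://bucket/models/resnet50.pt
#     flavor: pytorch
#   schema: array<float>
#   options:
#     batch_size: 8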
def codegen_from_yaml(
    spark: SparkSession,
    uri: str,
    name: Optional[str] = None,
    options: Optional[Dict[str, str]] = None,
) -> str:
    """Generate code from a YAML file.

    Parameters
    ----------
    spark : SparkSession
        A live Spark session.
    uri : str
        The model spec URI.
    name : str, optional
        The name of the model.
    options : dict, optional
        Optional parameters passed to the model.

    Returns
    -------
    str
        The name of the registered Spark UDF.
    """
    with open_uri(uri) as fobj:
        spec = ModelSpec(fobj, options=options)

    if spec.version != 1.0:
        raise SpecError(
            f"Only spec version 1.0 is supported, got {spec.version}"
        )

    if spec.flavor == "pytorch":
        from rikai.spark.sql.codegen.pytorch import generate_udf

        udf = generate_udf(
            spec.uri,
            spec.schema,
            spec.options,
            pre_processing=spec.pre_processing,
            post_processing=spec.post_processing,
        )
    else:
        raise SpecError(f"Unsupported model flavor: {spec.flavor}")

    func_name = f"{name}_{secrets.token_hex(4)}"
    spark.udf.register(func_name, udf)
    logger.info(f"Created model inference pandas_udf with name {func_name}")
    return func_name
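# The random suffix from secrets.token_hex(4) keeps repeated registrations of
# the same model name from colliding. A quick illustration:
import secrets

print(f"resnet_{secrets.token_hex(4)}")  # e.g. resnet_9f1c02ab
print(f"resnet_{secrets.token_hex(4)}")  # a different 8-hex-char suffix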
def torch_inference_udf(
    iter: Iterator[pd.DataFrame],
) -> Iterator[pd.DataFrame]:
    # model_uri, use_gpu, batch_size, num_workers, pre_processing and
    # post_processing are captured from the enclosing scope.
    with open_uri(model_uri) as fobj:
        model = torch.load(fobj)
    device = torch.device("cuda" if use_gpu else "cpu")
    model.to(device)
    model.eval()

    with torch.no_grad():
        for series in iter:
            dataset = PandasDataset(series, transform=pre_processing)
            results = []
            for batch in DataLoader(
                dataset,
                batch_size=batch_size,
                num_workers=num_workers,
            ):
                predictions = model(batch)
                if post_processing:
                    predictions = post_processing(predictions)
                results.extend(predictions)
            yield pd.DataFrame(results)
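# PandasDataset is not defined in this section; a minimal sketch of what it
# plausibly looks like, assuming a map-style torch Dataset over a pandas
# Series that applies the optional transform per element (the name is a
# stand-in, not the real class):
from typing import Callable, Optional

import pandas as pd
from torch.utils.data import Dataset


class PandasDatasetSketch(Dataset):
    def __init__(self, series: pd.Series, transform: Optional[Callable] = None):
        self.series = series
        self.transform = transform

    def __len__(self) -> int:
        return len(self.series)

    def __getitem__(self, idx: int):
        item = self.series.iloc[idx]
        return self.transform(item) if self.transform else item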
def open(self, mode="rb") -> BinaryIO: """Open the asset and returned as random-accessible file object.""" return open_uri(self.uri, mode=mode)
def open(self, mode="rb") -> BinaryIO: """Open the asset and returned as random-accessible file object.""" if self.is_embedded: return BytesIO(self.data) return open_uri(self.uri, mode=mode)
def test_open_https_uri():
    """Test support of https URI"""
    with open_uri(WIKIPEDIA) as fobj:
        assert len(fobj.read()) > 0
def load_model_from_uri(uri: str):
    with open_uri(uri) as fobj:
        return torch.load(fobj)
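# open_uri is used throughout this section but never defined here. A minimal
# sketch of such a helper built on fsspec (an assumption; the real
# implementation may resolve schemes differently):
from typing import BinaryIO

import fsspec


def open_uri_sketch(uri: str, mode: str = "rb") -> BinaryIO:
    """Open a local path or a remote URI (s3://, https://, ...) uniformly."""
    return fsspec.open(uri, mode).open()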