external_schema_type="arrow" if pa_schema else None, external_schema_bytes=typing.cast(pa.lib.Schema, pa_schema).to_string().encode() if pa_schema else None, ) def get_literal_type( self, t: typing.Union[Type[StructuredDataset], typing.Any]) -> LiteralType: """ Provide a concrete implementation so that writers of custom dataframe handlers since there's nothing that special about the literal type. Any dataframe type will always be associated with the structured dataset type. The other aspects of it - columns, external schema type, etc. can be read from associated metadata. :param t: The python dataframe type, which is mostly ignored. """ return LiteralType(structured_dataset_type=self._get_dataset_type(t)) def guess_python_type(self, literal_type: LiteralType) -> Type[T]: # todo: technically we should return the dataframe type specified in the constructor, but to do that, # we'd have to store that, which we don't do today. See possibly #1363 if literal_type.structured_dataset_type is not None: return StructuredDataset raise ValueError( f"StructuredDatasetTransformerEngine cannot reverse {literal_type}" ) flyte_dataset_transformer = StructuredDatasetTransformerEngine() TypeEngine.register(flyte_dataset_transformer)
blob=Blob(metadata=meta, uri=remote_path or source_path))) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: typing.Type[FlyteFile]) -> FlyteFile: uri = lv.scalar.blob.uri # This is a local file path, like /usr/local/my_file, don't mess with it. Certainly, downloading it doesn't # make any sense. if not ctx.file_access.is_remote(uri): return expected_python_type(uri) # For the remote case, return an FlyteFile object that can download local_path = ctx.file_access.get_random_local_path(uri) def _downloader(): return ctx.file_access.get_data(uri, local_path, is_multipart=False) expected_format = FlyteFilePathTransformer.get_format( expected_python_type) ff = FlyteFile[expected_format](local_path, _downloader) ff._remote_source = uri return ff TypeEngine.register(FlyteFilePathTransformer())
def get_literal_type(self, t: Type[PipelineModel]) -> LiteralType: return LiteralType(blob=self._TYPE_INFO) def to_literal( self, ctx: FlyteContext, python_val: PipelineModel, python_type: Type[PipelineModel], expected: LiteralType, ) -> Literal: local_path = ctx.file_access.get_random_local_path() pathlib.Path(local_path).parent.mkdir(parents=True, exist_ok=True) python_val.save(local_path) remote_dir = ctx.file_access.get_random_remote_directory() ctx.file_access.upload_directory(local_path, remote_dir) return Literal(scalar=Scalar(blob=Blob( uri=remote_dir, metadata=BlobMetadata(type=self._TYPE_INFO)))) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[PipelineModel]) -> PipelineModel: local_dir = ctx.file_access.get_random_local_directory() ctx.file_access.download_directory(lv.scalar.blob.uri, local_dir) return PipelineModel.load(local_dir) TypeEngine.register(PySparkPipelineModelTransformer())
fmt=SchemaFormat.PARQUET) w.write(python_val) remote_path = ctx.file_access.get_random_remote_directory() ctx.file_access.put_data(local_dir, remote_path, is_multipart=True) return Literal(scalar=Scalar( schema=Schema(remote_path, self._get_schema_type()))) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[pandas.DataFrame]) -> pandas.DataFrame: if not (lv and lv.scalar and lv.scalar.schema): return pandas.DataFrame() local_dir = ctx.file_access.get_random_local_directory() ctx.file_access.get_data(lv.scalar.schema.uri, local_dir, is_multipart=True) r = PandasSchemaReader(local_dir=local_dir, cols=None, fmt=SchemaFormat.PARQUET) return r.all() def to_html(self, ctx: FlyteContext, python_val: pandas.DataFrame, expected_python_type: Type[T]): return python_val.describe().to_html() SchemaEngine.register_handler( SchemaHandler("pandas-dataframe-schema", pandas.DataFrame, PandasSchemaReader, PandasSchemaWriter)) TypeEngine.register(PandasDataFrameTransformer())
format=self.ONNX_FORMAT, dimensionality=BlobType.BlobDimensionality.SINGLE)), ))) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[ONNXFile], ) -> ONNXFile: if not (lv.scalar.blob.uri and lv.scalar.blob.metadata.format == self.ONNX_FORMAT): raise TypeTransformerFailedError( f"ONNX format isn't of the expected type {expected_python_type}" ) return ONNXFile(path=lv.scalar.blob.uri) def guess_python_type(self, literal_type: LiteralType) -> Type[PyTorch2ONNX]: if (literal_type.blob is not None and literal_type.blob.dimensionality == BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == self.ONNX_FORMAT): return PyTorch2ONNX raise TypeTransformerFailedError( f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(PyTorch2ONNXTransformer())
metadata=BlobMetadata(type=BlobType( format=self.ONNX_FORMAT, dimensionality=BlobType.BlobDimensionality.SINGLE)), ))) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[ONNXFile], ) -> ONNXFile: if not lv.scalar.blob.uri: raise TypeTransformerFailedError( f"ONNX format isn't of the expected type {expected_python_type}" ) return ONNXFile(path=lv.scalar.blob.uri) def guess_python_type(self, literal_type: LiteralType) -> Type[ScikitLearn2ONNX]: if (literal_type.blob is not None and literal_type.blob.dimensionality == BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == self.ONNX_FORMAT): return ScikitLearn2ONNX raise TypeTransformerFailedError( f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(ScikitLearn2ONNXTransformer())
local_dir = ctx.file_access.get_random_local_directory() os.makedirs(local_dir, exist_ok=True) local_path = ctx.file_access.get_random_local_path() uri = os.path.join(local_dir, local_path) with open(uri, "w+b") as outfile: cloudpickle.dump(python_val, outfile) remote_path = ctx.file_access.get_random_remote_path(uri) ctx.file_access.put_data(uri, remote_path, is_multipart=False) return Literal(scalar=Scalar(blob=Blob(metadata=meta, uri=remote_path))) def guess_python_type(self, literal_type: LiteralType) -> typing.Type[FlytePickle[typing.Any]]: if ( literal_type.blob is not None and literal_type.blob.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == FlytePickleTransformer.PYTHON_PICKLE_FORMAT ): return FlytePickle raise ValueError(f"Transformer {self} cannot reverse {literal_type}") def get_literal_type(self, t: Type[T]) -> LiteralType: return LiteralType( blob=_core_types.BlobType( format=self.PYTHON_PICKLE_FORMAT, dimensionality=_core_types.BlobType.BlobDimensionality.SINGLE ) ) TypeEngine.register(FlytePickleTransformer())
# make any sense. if not ctx.file_access.is_remote(uri): return expected_python_type(uri) # For the remote case, return an FlyteDirectory object that can download local_folder = ctx.file_access.get_random_local_directory() def _downloader(): return ctx.file_access.get_data(uri, local_folder, is_multipart=True) expected_format = self.get_format(expected_python_type) fd = FlyteDirectory.__class_getitem__(expected_format)(local_folder, _downloader) fd._remote_source = uri return fd def guess_python_type( self, literal_type: LiteralType ) -> typing.Type[FlyteDirectory[typing.Any]]: if (literal_type.blob is not None and literal_type.blob.dimensionality == _core_types.BlobType.BlobDimensionality.MULTIPART): return FlyteDirectory.__class_getitem__(literal_type.blob.format) raise ValueError(f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(FlyteDirToMultipartBlobTransformer())
metadata=BlobMetadata( type=BlobType(format=self.ONNX_FORMAT, dimensionality=BlobType.BlobDimensionality.SINGLE) ), ) ) ) def to_python_value( self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[ONNXFile], ) -> ONNXFile: if not lv.scalar.blob.uri: raise TypeTransformerFailedError(f"ONNX format isn't of the expected type {expected_python_type}") return ONNXFile(path=lv.scalar.blob.uri) def guess_python_type(self, literal_type: LiteralType) -> Type[TensorFlow2ONNX]: if ( literal_type.blob is not None and literal_type.blob.dimensionality == BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == self.ONNX_FORMAT ): return TensorFlow2ONNX raise TypeTransformerFailedError(f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(TensorFlow2ONNXTransformer())
local_path=ctx.file_access.get_random_local_directory(), remote_path=lv.scalar.schema.uri, downloader=downloader, supported_mode=SchemaOpenMode.READ, ) def guess_python_type(self, literal_type: LiteralType) -> Type[T]: if not literal_type.schema: raise ValueError(f"Cannot reverse {literal_type}") columns = {} for literal_column in literal_type.schema.columns: if literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.INTEGER: columns[literal_column.name] = int elif literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.FLOAT: columns[literal_column.name] = float elif literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.STRING: columns[literal_column.name] = str elif literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.DATETIME: columns[literal_column.name] = _datetime.datetime elif literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.DURATION: columns[literal_column.name] = _datetime.timedelta elif literal_column.type == SchemaType.SchemaColumn.SchemaColumnType.BOOLEAN: columns[literal_column.name] = bool else: raise ValueError( f"Unknown schema column type {literal_column}") return FlyteSchema[columns] TypeEngine.register(FlyteSchemaTransformer())
# This is a local file path, like /usr/local/my_file, don't mess with it. Certainly, downloading it doesn't # make any sense. if not ctx.file_access.is_remote(uri): return expected_python_type(uri) # For the remote case, return an FlyteFile object that can download local_path = ctx.file_access.get_random_local_path(uri) def _downloader(): return ctx.file_access.get_data(uri, local_path, is_multipart=False) expected_format = FlyteFilePathTransformer.get_format(expected_python_type) ff = FlyteFile.__class_getitem__(expected_format)(local_path, _downloader) ff._remote_source = uri return ff def guess_python_type(self, literal_type: LiteralType) -> typing.Type[FlyteFile[typing.Any]]: if ( literal_type.blob is not None and literal_type.blob.dimensionality == BlobType.BlobDimensionality.SINGLE and literal_type.blob.format != FlytePickleTransformer.PYTHON_PICKLE_FORMAT ): return FlyteFile.__class_getitem__(literal_type.blob.format) raise ValueError(f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(FlyteFilePathTransformer(), additional_types=[os.PathLike])
remote_path = ctx.file_access.get_random_remote_path(local_path) ctx.file_access.put_data(local_path, remote_path, is_multipart=False) return Literal(scalar=Scalar( blob=Blob(metadata=meta, uri=remote_path))) def to_python_value(self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[np.ndarray]) -> np.ndarray: try: uri = lv.scalar.blob.uri except AttributeError: TypeTransformerFailedError( f"Cannot convert from {lv} to {expected_python_type}") local_path = ctx.file_access.get_random_local_path() ctx.file_access.get_data(uri, local_path, is_multipart=False) # load numpy array from a file return np.load(file=local_path) def guess_python_type( self, literal_type: LiteralType) -> typing.Type[np.ndarray]: if (literal_type.blob is not None and literal_type.blob.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == self.NUMPY_ARRAY_FORMAT): return np.ndarray raise ValueError(f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(NumpyArrayTransformer())
) -> PyTorchCheckpoint: try: uri = lv.scalar.blob.uri except AttributeError: TypeTransformerFailedError(f"Cannot convert from {lv} to {expected_python_type}") local_path = ctx.file_access.get_random_local_path() ctx.file_access.get_data(uri, local_path, is_multipart=False) # cpu <-> gpu conversion if torch.cuda.is_available(): map_location = "cuda:0" else: map_location = torch.device("cpu") # load checkpoint from a file return typing.cast(PyTorchCheckpoint, torch.load(local_path, map_location=map_location)) def guess_python_type(self, literal_type: LiteralType) -> Type[PyTorchCheckpoint]: if ( literal_type.blob is not None and literal_type.blob.dimensionality == _core_types.BlobType.BlobDimensionality.SINGLE and literal_type.blob.format == self.PYTORCH_CHECKPOINT_FORMAT ): return PyTorchCheckpoint raise ValueError(f"Transformer {self} cannot reverse {literal_type}") TypeEngine.register(PyTorchCheckpointTransformer())