def record_outputs(**kwargs) -> str:
    """
    Use this method to record outputs from a notebook.
    It will convert all outputs to a format Flyte understands. For files and directories,
    please use FlyteFile or FlyteDirectory, or wrap your paths in these types.
    """
    # ``**kwargs`` is never None; an empty call simply returns an empty string.
    if not kwargs:
        return ""
    m = {}
    ctx = FlyteContext.current_context()
    for k, v in kwargs.items():
        expected = TypeEngine.to_literal_type(type(v))
        lit = TypeEngine.to_literal(ctx, python_type=type(v), python_val=v, expected=expected)
        m[k] = lit
    return LiteralMap(literals=m).to_flyte_idl()
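# %%
# A minimal sketch of how ``record_outputs`` is meant to be used: the final cell of the
# notebook (the one papermill tags as ``outputs``) calls it with the values to hand back
# to Flyte. The ``square`` variable is hypothetical, standing in for a value computed in
# an earlier notebook cell.
from flytekitplugins.papermill import record_outputs

square = 4.0  # stand-in for a value computed earlier in the notebook
record_outputs(square=square)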
    def execute(self, **kwargs) -> Any:
        """
        TODO: Figure out how to share FlyteContext ExecutionParameters with the notebook kernel
        (as the notebook kernel is executed in a separate Python process).

        For Spark, notebooks today need to use ``new_session`` or ``getOrCreate`` to obtain a
        handle to the singleton session.
        """
        logger.info(f"Hijacking the call for task-type {self.task_type}, to call notebook.")
        # Execute the notebook via Papermill.
        pm.execute_notebook(self._notebook_path, self.output_notebook_path, parameters=kwargs)  # type: ignore
        outputs = self.extract_outputs(self.output_notebook_path)
        self.render_nb_html(self.output_notebook_path, self.rendered_output_path)
        m = {}
        if outputs:
            m = outputs.literals
        output_list = []
        for k, type_v in self.python_interface.outputs.items():
            if k == self._IMPLICIT_OP_NOTEBOOK:
                output_list.append(self.output_notebook_path)
            elif k == self._IMPLICIT_RENDERED_NOTEBOOK:
                output_list.append(self.rendered_output_path)
            elif k in m:
                v = TypeEngine.to_python_value(
                    ctx=FlyteContext.current_context(), lv=m[k], expected_python_type=type_v
                )
                output_list.append(v)
            else:
                # ``type_v`` (not the loop-local ``v``, which may be unbound here) names the
                # expected output type.
                raise RuntimeError(f"Expected output {k} of type {type_v} not found in the notebook outputs")
        return tuple(output_list)
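# %%
# For context, a hedged sketch of declaring a ``NotebookTask`` whose ``execute`` is the
# method above. The notebook filename and the input/output names are illustrative, not
# part of the source.
import pathlib

from flytekit import kwtypes
from flytekitplugins.papermill import NotebookTask

nb_task = NotebookTask(
    name="simple-nb",
    notebook_path=str(pathlib.Path(__file__).parent / "nb_simple.ipynb"),  # hypothetical notebook
    inputs=kwtypes(v=float),
    outputs=kwtypes(square=float),
)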
        w.write(python_val)
        return Literal(scalar=Scalar(schema=Schema(remote_path, self._get_schema_type())))

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[pyspark.sql.DataFrame]
    ) -> T:
        if not (lv and lv.scalar and lv.scalar.schema):
            return pyspark.sql.DataFrame()
        r = SparkDataFrameSchemaReader(from_path=lv.scalar.schema.uri, cols=None, fmt=SchemaFormat.PARQUET)
        return r.all()


# %%
# Register a handler for the Spark DataFrame <-> Flyte Schema type transition.
# This allows open(pyspark.sql.DataFrame) to be an acceptable type.
SchemaEngine.register_handler(
    SchemaHandler(
        "pyspark.sql.DataFrame-Schema",
        pyspark.sql.DataFrame,
        SparkDataFrameSchemaReader,
        SparkDataFrameSchemaWriter,
        handles_remote_io=True,
    )
)

# %%
# This makes pyspark.sql.DataFrame a supported input/output type with flytekit.
TypeEngine.register(SparkDataFrameTransformer())
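# %%
# With the handler and transformer registered, a Spark task can return a
# ``pyspark.sql.DataFrame`` directly. A minimal sketch, assuming flytekitplugins-spark is
# installed; the spark_conf values and sample data are illustrative.
import flytekit
from flytekit import task
from flytekitplugins.spark import Spark


@task(task_config=Spark(spark_conf={"spark.driver.memory": "1000M"}))
def make_df(a: int) -> pyspark.sql.DataFrame:
    session = flytekit.current_context().spark_session
    return session.createDataFrame([("Alice", a)], ["name", "age"])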
        remote_path = ctx.file_access.get_random_remote_directory()
        ctx.file_access.put_data(local_dir, remote_path, is_multipart=True)
        return Literal(scalar=Scalar(schema=Schema(remote_path, self._get_schema_type())))

    def to_python_value(
        self,
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: Type[modin.pandas.DataFrame],
    ) -> T:
        if not (lv and lv.scalar and lv.scalar.schema):
            return modin.pandas.DataFrame()
        local_dir = ctx.file_access.get_random_local_directory()
        ctx.file_access.download_directory(lv.scalar.schema.uri, local_dir)
        r = ModinPandasSchemaReader(from_path=local_dir, cols=None, fmt=SchemaFormat.PARQUET)
        return r.all()


SchemaEngine.register_handler(
    SchemaHandler(
        "modin.pandas.DataFrame-Schema",
        modin.pandas.DataFrame,
        ModinPandasSchemaReader,
        ModinPandasSchemaWriter,
    )
)
TypeEngine.register(ModinPandasDataFrameTransformer())
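# %%
# Once registered, ``modin.pandas.DataFrame`` behaves like any other flytekit type. A
# minimal usage sketch; the column data is illustrative.
from flytekit import task


@task
def gen_df() -> modin.pandas.DataFrame:
    return modin.pandas.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})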
    def get_literal_type(self, t: Type[_params.ParameterRangeOneOf]) -> LiteralType:
        return primitives.Generic.to_flyte_literal_type()

    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: _params.ParameterRangeOneOf,
        python_type: Type[_params.ParameterRangeOneOf],
        expected: LiteralType,
    ) -> Literal:
        d = MessageToDict(python_val.to_flyte_idl())
        return DictTransformer.dict_to_generic_literal(d)

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[_params.ParameterRangeOneOf]
    ) -> _params.ParameterRangeOneOf:
        if lv and lv.scalar and lv.scalar.generic is not None:
            d = json.loads(json_format.MessageToJson(lv.scalar.generic))
            o = _pb2_params.ParameterRangeOneOf()
            o = json_format.ParseDict(d, o)
            return _params.ParameterRangeOneOf.from_flyte_idl(o)
        return None


# %%
# Register the transformers with the type engine.
TypeEngine.register(HPOTuningJobConfigTransformer())
TypeEngine.register(ParameterRangesTransformer())
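# %%
# A quick sanity-check sketch: after registration, the type engine resolves the transformer
# for ``ParameterRangeOneOf`` and can produce its literal type. This only exercises the
# lookup; it assumes the imports used by the transformer above are in scope.
transformer = TypeEngine.get_transformer(_params.ParameterRangeOneOf)
assert transformer.get_literal_type(_params.ParameterRangeOneOf) is not None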
"batch_request": final_batch_request, "expectation_suite_name": ge_conf.expectation_suite_name, }], ) final_result = convert_to_json_serializable( checkpoint_result.list_validation_results())[0] result_string = "" if final_result["success"] is False: for every_result in final_result["results"]: if every_result["success"] is False: result_string += ( every_result["expectation_config"]["kwargs"]["column"] + " -> " + every_result["expectation_config"]["expectation_type"] + "\n") # raise a Great Expectations' exception raise ValidationError( "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" + result_string) logger.info("Validation succeeded!") return typing.cast(GreatExpectationsType, return_dataset) TypeEngine.register(GreatExpectationsTypeTransformer())
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[MyDataset]
    ) -> MyDataset:
        """
        In this method, we want to be able to re-hydrate the custom object from a Flyte Literal value.
        """
        # Step 1: download the remote data locally.
        local_dir = ctx.file_access.get_random_local_directory()
        ctx.file_access.download_directory(lv.scalar.blob.uri, local_dir)
        # Step 2: create the ``MyDataset`` object.
        return MyDataset(base_dir=local_dir)


# %%
# Before we can use MyDataset in our tasks, we need to let flytekit know that ``MyDataset`` should be considered a valid type.
# This is done using the :py:class:`~flytekit:flytekit.extend.TypeEngine`'s ``register`` method.
TypeEngine.register(MyDatasetTransformer())


# %%
# The new type should be ready to use! Let us write an example generator and consumer for this new datatype.
@task
def generate() -> MyDataset:
    d = MyDataset()
    for i in range(3):
        fp = d.new_file(f"x{i}")
        with open(fp, "w") as f:
            f.write(f"Contents of file{i}")
    return d
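# %%
# And the matching consumer for the generator above: a minimal sketch, assuming ``os`` is
# imported and that ``MyDataset`` exposes the ``base_dir`` attribute used by the transformer.
import os


@task
def consume(d: MyDataset) -> str:
    s = ""
    for i in range(3):
        with open(os.path.join(d.base_dir, f"x{i}"), "r") as f:
            s += f.read() + " "
    return s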
    def get_literal_type(self, t: Type[DatasetProfileView]) -> LiteralType:
        return LiteralType(blob=self._TYPE_INFO)

    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: DatasetProfileView,
        python_type: Type[DatasetProfileView],
        expected: LiteralType,
    ) -> Literal:
        remote_path = ctx.file_access.get_random_remote_directory()
        # ``get_random_local_path`` yields a file path, so name it accordingly.
        local_path = ctx.file_access.get_random_local_path()
        python_val.write(local_path)
        ctx.file_access.upload(local_path, remote_path)
        return Literal(scalar=Scalar(blob=Blob(uri=remote_path, metadata=BlobMetadata(type=self._TYPE_INFO))))

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[DatasetProfileView]
    ) -> T:
        local_path = ctx.file_access.get_random_local_path()
        ctx.file_access.download(lv.scalar.blob.uri, local_path)
        return DatasetProfileView.read(local_path)

    def to_html(
        self, ctx: FlyteContext, python_val: DatasetProfileView, expected_python_type: Type[DatasetProfileView]
    ) -> str:
        pandas_profile = str(python_val.to_pandas().to_html())
        header = "<h1>Profile View</h1>\n"
        return header + pandas_profile


TypeEngine.register(WhylogsDatasetProfileTransformer())
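# %%
# A hedged usage sketch: a task that profiles a pandas DataFrame and returns the
# ``DatasetProfileView`` this transformer serializes. Assumes whylogs v1 and pandas are
# installed; the sample data is illustrative.
import pandas as pd
import whylogs as why
from flytekit import task


@task
def make_profile() -> DatasetProfileView:
    df = pd.DataFrame({"temperature": [20.5, 21.0, 19.8]})
    return why.log(pandas=df).view()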
            remote_path = ctx.file_access.get_random_remote_directory()
            ctx.file_access.put_data(local_dir, remote_path, is_multipart=True)
            return Literal(scalar=Scalar(schema=Schema(remote_path, self._get_schema_type(python_type))))
        else:
            raise AssertionError(
                f"Only a pandas DataFrame object can be returned from a task, returned object type {type(python_val)}"
            )

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[pandera.typing.DataFrame]
    ) -> pandera.typing.DataFrame:
        if not (lv and lv.scalar and lv.scalar.schema):
            raise AssertionError("Can only convert a literal schema to a pandera schema")

        def downloader(x, y):
            ctx.file_access.download_directory(x, y)

        df = FlyteSchema(
            local_path=ctx.file_access.get_random_local_directory(),
            remote_path=lv.scalar.schema.uri,
            downloader=downloader,
            supported_mode=SchemaOpenMode.READ,
        )
        return self._pandera_schema(expected_python_type)(df.open().all())


TypeEngine.register(PanderaTransformer())
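# %%
# A minimal usage sketch: declare a pandera ``SchemaModel`` and annotate task outputs with
# ``pandera.typing.DataFrame[...]`` so this transformer validates the data on the way out.
# The schema and sample data are illustrative.
import pandas as pd
from flytekit import task


class OutSchema(pandera.SchemaModel):
    col1: pandera.typing.Series[int]


@task
def gen_validated() -> pandera.typing.DataFrame[OutSchema]:
    return pd.DataFrame({"col1": [1, 2, 3]})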
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: typing.Type[DoltTable],
    ) -> DoltTable:
        if not (lv and lv.scalar and lv.scalar.generic and "config" in lv.scalar.generic):
            raise ValueError("DoltTable requires DoltConfig to load a python value")
        conf_dict = MessageToDict(lv.scalar.generic["config"])
        conf = DoltConfig(**conf_dict)
        db = dolt.Dolt(conf.db_path)
        with tempfile.NamedTemporaryFile() as f:
            dolt_int.load(
                db=db,
                tablename=conf.tablename,
                sql=conf.sql,
                filename=f.name,
                branch_conf=conf.branch_conf,
                meta_conf=conf.meta_conf,
                remote_conf=conf.remote_conf,
                load_args=conf.io_args,
            )
            df = pandas.read_csv(f)
        # Return a DoltTable, not the Literal: this method's contract is to produce the
        # declared python type.
        return DoltTable(config=conf, data=df)


TypeEngine.register(DoltTableNameTransformer())
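# %%
# A hedged sketch of the consuming side: once registered, a ``DoltTable`` task input
# arrives with its ``data`` attribute populated as a pandas DataFrame.
from flytekit import task


@task
def read_table(t: DoltTable) -> pandas.DataFrame:
    return t.data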