def test_enum_type():
    """Round-trip ``Color.RED`` through the TypeEngine and check the rejection paths."""
    enum_literal_type = TypeEngine.to_literal_type(Color)
    assert enum_literal_type is not None
    assert enum_literal_type.enum_type is not None
    assert enum_literal_type.enum_type.values
    assert enum_literal_type.enum_type.values == [member.value for member in Color]

    ctx = FlyteContextManager.current_context()
    red_literal = TypeEngine.to_literal(ctx, Color.RED, Color, TypeEngine.to_literal_type(Color))
    assert red_literal
    assert red_literal.scalar
    assert red_literal.scalar.primitive.string_value == "red"

    # The same literal is read back once as the enum and once as a plain string.
    as_enum = TypeEngine.to_python_value(ctx, red_literal, Color)
    assert as_enum
    assert as_enum == Color.RED

    as_string = TypeEngine.to_python_value(ctx, red_literal, str)
    assert as_string
    assert as_string == "red"

    # Strings that the test expects to be rejected when converted to Color.
    for rejected_text in (str(Color.RED), "bad"):
        with pytest.raises(ValueError):
            TypeEngine.to_python_value(
                ctx, Literal(scalar=Scalar(primitive=Primitive(string_value=rejected_text))), Color
            )

    with pytest.raises(AssertionError):
        TypeEngine.to_literal_type(UnsupportedEnumValues)
def test_type_engine():
    """Check the literal type of ``int`` and of a nested Dict/List/Dict container."""
    int_literal_type = TypeEngine.to_literal_type(int)
    assert int_literal_type.simple == model_types.SimpleType.INTEGER

    nested_python_type = typing.Dict[str, typing.List[typing.Dict[str, timedelta]]]
    nested_literal_type = TypeEngine.to_literal_type(nested_python_type)
    # Walk map value -> collection element -> map value to reach the timedelta leaf.
    leaf_type = nested_literal_type.map_value_type.collection_type.map_value_type
    assert leaf_type.simple == model_types.SimpleType.DURATION
def test_engine():
    """FlyteDirectory maps to a MULTIPART blob whose format follows the subscript."""

    def check_directory_blob(directory_type, expected_format):
        literal_type = TypeEngine.to_literal_type(directory_type)
        assert literal_type.blob is not None
        assert literal_type.blob.dimensionality == BlobType.BlobDimensionality.MULTIPART
        assert literal_type.blob.format == expected_format

    # Unsubscripted directory: empty format string.
    check_directory_blob(FlyteDirectory, "")
    # Subscripted directory: the subscript becomes the format.
    check_directory_blob(FlyteDirectory["csv"], "csv")
def test_bad_conversion():
    """guess_python_type must raise ValueError for a schema column with a bogus type."""
    schema_python_type = FlyteSchema[kwtypes(my_custom=bool)]
    schema_literal_type = TypeEngine.to_literal_type(schema_python_type)

    # Overwrite the only column's type with a value that is not a real column type.
    only_column = schema_literal_type.schema.columns[0]
    only_column._type = 15

    with pytest.raises(ValueError):
        TypeEngine.guess_python_type(schema_literal_type)
def test_to_python_value_without_incoming_columns():
    """Convert a column-less structured dataset literal back using various target types."""
    ctx = FlyteContextManager.current_context()
    columnless_type = TypeEngine.to_literal_type(pd.DataFrame)
    frame = generate_pandas()
    engine = StructuredDatasetTransformerEngine()

    def make_literal():
        # Build a literal whose type carries no column information.
        return engine.to_literal(ctx, frame, python_type=pd.DataFrame, expected=columnless_type)

    literal = make_literal()
    literal_columns = literal.scalar.structured_dataset.metadata.structured_dataset_type.columns
    assert len(literal_columns) == 0

    # Ask for a type that names a single column and make sure the resulting
    # dataframe contains just that column.
    age_only_sd_type = Annotated[StructuredDataset, kwtypes(age=int)]
    dataset = engine.to_python_value(ctx, literal, age_only_sd_type)
    assert dataset.metadata.structured_dataset_type.columns[0].name == "age"
    age_frame = dataset.open(pd.DataFrame).all()
    assert age_frame.shape[1] == 1

    # With no columns requested, both columns should come back.
    # todo: see the todos in the open_as, and iter_as functions in StructuredDatasetTransformerEngine
    # The literal has to be rebuilt because the conversion above filled in its metadata.
    literal = make_literal()
    dataset = engine.to_python_value(ctx, literal, StructuredDataset)
    assert dataset.metadata.structured_dataset_type.columns == []
    full_frame = dataset.open(pd.DataFrame).all()
    assert full_frame.shape[1] == 2

    # The subset request should also work when expressed as an annotated pd.DataFrame.
    literal = make_literal()
    age_only_pd_type = Annotated[pd.DataFrame, kwtypes(age=int)]
    age_frame = engine.to_python_value(ctx, literal, age_only_pd_type)
    assert age_frame.shape[1] == 1
def test_to_python_value_with_incoming_columns():
    """Convert a two-column structured dataset literal back using various target types."""
    # The literal is produced from a type that declares two columns.
    two_column_type = Annotated[pd.DataFrame, kwtypes(name=str, age=int)]
    ctx = FlyteContextManager.current_context()
    two_column_literal_type = TypeEngine.to_literal_type(two_column_type)
    frame = generate_pandas()
    engine = StructuredDatasetTransformerEngine()
    literal = engine.to_literal(ctx, frame, python_type=two_column_type, expected=two_column_literal_type)
    literal_columns = literal.scalar.structured_dataset.metadata.structured_dataset_type.columns
    assert len(literal_columns) == 2

    # Ask for a type that names a single column and make sure the resulting
    # dataframe contains just that column.
    age_only_sd_type = Annotated[StructuredDataset, kwtypes(age=int)]
    dataset = engine.to_python_value(ctx, literal, age_only_sd_type)
    assert dataset.metadata.structured_dataset_type.columns[0].name == "age"
    age_frame = dataset.open(pd.DataFrame).all()
    assert age_frame.shape[1] == 1

    # With no columns requested, both columns and their information should come back.
    dataset = engine.to_python_value(ctx, literal, StructuredDataset)
    assert len(dataset.metadata.structured_dataset_type.columns) == 2

    # The subset request should also work when expressed as an annotated pd.DataFrame.
    age_only_pd_type = Annotated[pd.DataFrame, kwtypes(age=int)]
    age_frame = engine.to_python_value(ctx, literal, age_only_pd_type)
    assert age_frame.shape[1] == 1
def test_fill_in_literal_type():
    """The engine fills in the literal's format, and encoder lookup falls back to ""."""

    class TempEncoder(StructuredDatasetEncoder):
        """Encoder for MyDF that returns a literal without any metadata."""

        def __init__(self, fmt: str):
            super().__init__(MyDF, "tmpfs://", supported_format=fmt)

        def encode(
            self,
            ctx: FlyteContext,
            structured_dataset: StructuredDataset,
            structured_dataset_type: StructuredDatasetType,
        ) -> literals.StructuredDataset:
            return literals.StructuredDataset(uri="")

    StructuredDatasetTransformerEngine.register(TempEncoder("myavro"), default_for_type=True)
    my_df_literal_type = TypeEngine.to_literal_type(MyDF)
    assert my_df_literal_type.structured_dataset_type.format == "myavro"

    ctx = FlyteContextManager.current_context()
    engine = StructuredDatasetTransformerEngine()
    dataset = StructuredDataset(dataframe=42)
    literal = engine.to_literal(ctx, dataset, MyDF, my_df_literal_type)
    # The format must be present on the literal even though encode() above never sets it.
    literal_sd_type = literal.scalar.structured_dataset.metadata.structured_dataset_type
    assert literal_sd_type.format == "myavro"

    # Looking up an encoder for an unknown format falls back to the "" encoder.
    empty_format_temp_encoder = TempEncoder("")
    StructuredDatasetTransformerEngine.register(empty_format_temp_encoder, default_for_type=False)

    found_encoder = StructuredDatasetTransformerEngine.get_encoder(MyDF, "tmpfs", "rando")
    assert found_encoder is empty_format_temp_encoder
def add_workflow_output(
    self, output_name: str, p: Union[Promise, List[Promise], Dict[str, Promise]], python_type: Optional[Type] = None
):
    """
    Add an output with the given name from the given node output.

    :param output_name: Name of the new workflow output; must not already be an output.
    :param p: A Promise, or a list/dict of Promises, to bind to the output.
    :param python_type: Python type of the output. When omitted it is looked up on the
        interface of the entity that produced ``p``, which only works for a single Promise.
    :raises FlyteValidationException: If ``output_name`` already exists, or if ``p`` is a
        list or dict and ``python_type`` was not given.
    :raises Exception: If the current context already has a compilation state.
    """
    if output_name in self._python_interface.outputs:
        raise FlyteValidationException(f"Output {output_name} already exists in workflow {self.name}")

    if python_type is None:
        # isinstance (rather than an exact type comparison) also catches list/dict
        # subclasses, which have no ``ref``/``var`` to infer a type from.
        if isinstance(p, (list, dict)):
            raise FlyteValidationException(
                f"If specifying a list or dict of Promises, you must specify the python_type type for {output_name}"
                f" starting with the container type (e.g. List[int]"
            )
        python_type = p.ref.node.flyte_entity.python_interface.outputs[p.var]
        logger.debug(f"Inferring python type for wf output {output_name} from Promise provided {python_type}")

    flyte_type = TypeEngine.to_literal_type(python_type=python_type)

    ctx = FlyteContext.current_context()
    if ctx.compilation_state is not None:
        raise Exception("Can't already be compiling")
    # Create the binding under this workflow's own compilation state.
    with FlyteContextManager.with_context(ctx.with_compilation_state(self.compilation_state)) as ctx:
        b = binding_from_python_std(
            ctx, output_name, expected_literal_type=flyte_type, t_value=p, t_value_type=python_type
        )
        self._output_bindings.append(b)
        # Keep the native and typed interfaces in sync with the new output.
        self._python_interface = self._python_interface.with_outputs(extra_outputs={output_name: python_type})
        self._interface = transform_interface_to_typed_interface(self._python_interface)
def test_jsondc_schemaize():
    """The type guessed back from Foo's literal type is a plain ``dict``."""
    guessed_type = TypeEngine.guess_python_type(TypeEngine.to_literal_type(Foo))

    # With postponed annotations enabled, dataclass_json does not work, so the
    # result is a schemaless generic.
    # This effectively tests the broken behavior. Remove this test if
    # https://github.com/lovasoa/marshmallow_dataclass/issues/13 is ever fixed.
    assert guessed_type is dict
def test_types_sd():
    """Literal types for StructuredDataset: bare, with columns, and with a format."""
    bare_type = TypeEngine.to_literal_type(StructuredDataset)
    assert bare_type.structured_dataset_type is not None

    # Columns only.
    columns_only = TypeEngine.to_literal_type(Annotated[StructuredDataset, my_cols])
    assert len(columns_only.structured_dataset_type.columns) == 4

    # Columns plus a format.
    columns_and_csv = TypeEngine.to_literal_type(Annotated[StructuredDataset, my_cols, "csv"])
    assert len(columns_and_csv.structured_dataset_type.columns) == 4
    assert columns_and_csv.structured_dataset_type.format == "csv"

    # Empty column mapping plus a format.
    csv_without_columns = TypeEngine.to_literal_type(Annotated[StructuredDataset, {}, "csv"])
    assert len(csv_without_columns.structured_dataset_type.columns) == 0
    assert csv_without_columns.structured_dataset_type.format == "csv"
def test_pb_guess_python_type():
    """A protobuf message type survives guess_python_type and a literal round trip."""
    proto_cls = catalog_pb2.CatalogArtifactTag
    tag = proto_cls(artifact_id="artifact_1", name="artifact_name")
    native_values = {"a": tag}

    guessed_type = TypeEngine.guess_python_type(TypeEngine.to_literal_type(proto_cls))
    assert guessed_type == proto_cls

    # Convert using the guessed type on both the way out and the way back.
    ctx = FlyteContextManager.current_context()
    literal_map = TypeEngine.dict_to_literal_map(ctx, native_values, {"a": guessed_type})
    restored = TypeEngine.to_python_value(ctx, literal_map.literals["a"], guessed_type)
    assert restored == tag
def test_format_correct():
    """An "avro" annotation reaches the literal type, the literal, and the task output."""

    class TempEncoder(StructuredDatasetEncoder):
        """Encoder registered for pd.DataFrame / S3 / "avro" that echoes the requested type."""

        def __init__(self):
            super().__init__(pd.DataFrame, S3, "avro")

        def encode(
            self,
            ctx: FlyteContext,
            structured_dataset: StructuredDataset,
            structured_dataset_type: StructuredDatasetType,
        ) -> literals.StructuredDataset:
            metadata = StructuredDatasetMetadata(structured_dataset_type)
            return literals.StructuredDataset(uri="/tmp/avro", metadata=metadata)

    ctx = FlyteContextManager.current_context()
    df = pd.DataFrame({"name": ["Tom", "Joseph"], "age": [20, 22]})

    annotated_sd_type = Annotated[StructuredDataset, "avro", kwtypes(name=str, age=int)]
    df_literal_type = TypeEngine.to_literal_type(annotated_sd_type)
    sd_type = df_literal_type.structured_dataset_type
    assert sd_type is not None
    assert len(sd_type.columns) == 2
    for position, column_name in enumerate(("name", "age")):
        assert sd_type.columns[position].name == column_name
        assert sd_type.columns[position].literal_type.simple is not None
    assert sd_type.format == "avro"

    # Before TempEncoder is registered, the conversion is expected to raise ValueError.
    unencodable = annotated_sd_type(df)
    with pytest.raises(ValueError):
        TypeEngine.to_literal(ctx, unencodable, python_type=annotated_sd_type, expected=df_literal_type)

    StructuredDatasetTransformerEngine.register(TempEncoder(), default_for_type=False)
    encodable = annotated_sd_type(df)
    sd_literal = TypeEngine.to_literal(ctx, encodable, python_type=annotated_sd_type, expected=df_literal_type)
    assert sd_literal.scalar.structured_dataset.metadata.structured_dataset_type.format == "avro"

    @task
    def t1() -> Annotated[StructuredDataset, "avro"]:
        return StructuredDataset(dataframe=df)

    task_output = t1()
    assert task_output.file_format == "avro"
def test_to_literal():
    """Exercise StructuredDatasetTransformerEngine.to_literal on valid and invalid inputs.

    Covers a plain dataframe, a StructuredDataset carrying both a literal and a
    dataframe, an empty StructuredDataset, and a StructuredDataset given only a uri.
    """
    ctx = FlyteContextManager.current_context()
    lt = TypeEngine.to_literal_type(pd.DataFrame)
    df = generate_pandas()

    fdt = StructuredDatasetTransformerEngine()

    # A plain dataframe gets the PARQUET format (this check used to be duplicated verbatim).
    lit = fdt.to_literal(ctx, df, python_type=pd.DataFrame, expected=lt)
    assert lit.scalar.structured_dataset.metadata.structured_dataset_type.format == PARQUET

    # Carrying both a backing literal and a dataframe is rejected.
    sd_with_literal_and_df = StructuredDataset(df)
    sd_with_literal_and_df._literal_sd = lit
    with pytest.raises(ValueError, match="Shouldn't have specified both literal"):
        fdt.to_literal(ctx, sd_with_literal_and_df, python_type=StructuredDataset, expected=lt)

    # A dataset with neither dataframe nor uri is rejected.
    sd_with_nothing = StructuredDataset()
    with pytest.raises(ValueError, match="If dataframe is not specified"):
        fdt.to_literal(ctx, sd_with_nothing, python_type=StructuredDataset, expected=lt)

    # A uri-only dataset keeps its uri and takes the format from the expected type.
    sd_with_uri = StructuredDataset(uri="s3://some/extant/df.parquet")

    lt = TypeEngine.to_literal_type(Annotated[StructuredDataset, {}, "new-df-format"])
    lit = fdt.to_literal(ctx, sd_with_uri, python_type=StructuredDataset, expected=lt)
    assert lit.scalar.structured_dataset.uri == "s3://some/extant/df.parquet"
    assert lit.scalar.structured_dataset.metadata.structured_dataset_type.format == "new-df-format"
def test_dataclass_complex_transform(two_sample_inputs):
    """Round-trip a MyInput twice through literals, then convert a list of two inputs."""
    my_input = two_sample_inputs[0]
    my_input_2 = two_sample_inputs[1]

    ctx = FlyteContextManager.current_context()
    literal_type = TypeEngine.to_literal_type(MyInput)

    # Two consecutive round trips; the second starts from the first's output.
    current_value = my_input
    for _ in range(2):
        as_literal = TypeEngine.to_literal(ctx, current_value, MyInput, literal_type)
        assert as_literal.scalar.generic["apriori_config"] is not None

        current_value = TypeEngine.to_python_value(ctx, as_literal, MyInput)
        assert current_value.apriori_config is not None

    both_inputs = [my_input, my_input_2]
    list_literal_type = TypeEngine.to_literal_type(List[MyInput])
    list_literal = TypeEngine.to_literal(ctx, both_inputs, List[MyInput], list_literal_type)
    for position in (0, 1):
        element = list_literal.collection.literals[position]
        assert element.scalar.generic["apriori_config"] is not None
def test_types_annotated():
    """Literal types for annotated pd.DataFrame: columns, arrow schema, unsupported column."""
    columns_literal_type = TypeEngine.to_literal_type(Annotated[pd.DataFrame, my_cols])
    columns = columns_literal_type.structured_dataset_type.columns
    assert len(columns) == 4
    # Column 0 nests two map levels and column 1 two collection levels before the integer leaf.
    assert columns[0].literal_type.map_value_type.map_value_type.simple == SimpleType.INTEGER
    assert columns[1].literal_type.collection_type.collection_type.simple == SimpleType.INTEGER
    assert columns[2].literal_type.simple == SimpleType.INTEGER
    assert columns[3].literal_type.simple == SimpleType.STRING

    arrow_literal_type = TypeEngine.to_literal_type(Annotated[pd.DataFrame, PARQUET, arrow_schema])
    arrow_sd_type = arrow_literal_type.structured_dataset_type
    assert arrow_sd_type.external_schema_type == "arrow"
    assert "some_string" in str(arrow_sd_type.external_schema_bytes)

    none_column_type = Annotated[pd.DataFrame, kwtypes(a=None)]
    with pytest.raises(AssertionError, match="type None is currently not supported by StructuredDataset"):
        TypeEngine.to_literal_type(none_column_type)
def test_dataclass_transformer():
    """TestStruct yields a STRUCT literal type with the expected JSON schema as metadata."""
    inner_struct_schema = {
        "additionalProperties": False,
        "properties": {
            "a": {"format": "integer", "title": "a", "type": "number"},
            "b": {"default": None, "title": "b", "type": ["string", "null"]},
            "c": {
                "items": {"format": "integer", "title": "c", "type": "number"},
                "title": "c",
                "type": "array",
            },
        },
        "type": "object",
    }
    test_struct_schema = {
        "additionalProperties": False,
        "properties": {
            "m": {"additionalProperties": {"title": "m", "type": "string"}, "title": "m", "type": "object"},
            "s": {"$ref": "#/definitions/InnerstructSchema", "field_many": False, "type": "object"},
        },
        "type": "object",
    }
    expected_schema = {
        "$ref": "#/definitions/TeststructSchema",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "definitions": {
            "InnerstructSchema": inner_struct_schema,
            "TeststructSchema": test_struct_schema,
        },
    }

    def assert_is_struct(literal_type):
        assert literal_type is not None
        assert literal_type.simple is not None
        assert literal_type.simple == SimpleType.STRUCT

    transformer = DataclassTransformer()

    # The transformer used directly and the TypeEngine must produce the same result.
    from_transformer = transformer.get_literal_type(TestStruct)
    assert_is_struct(from_transformer)
    assert from_transformer.metadata is not None
    assert from_transformer.metadata == expected_schema

    from_engine = TypeEngine.to_literal_type(TestStruct)
    assert_is_struct(from_engine)
    assert from_engine.metadata is not None
    assert from_engine.metadata == expected_schema

    # The unsupported nested struct is still a STRUCT, but carries no metadata.
    unsupported = transformer.get_literal_type(UnsupportedNestedStruct)
    assert_is_struct(unsupported)
    assert unsupported.metadata is None
def test_protos():
    """Round-trip a ContainerError protobuf and reject a non-struct literal."""
    ctx = FlyteContext.current_context()
    proto_cls = errors_pb2.ContainerError

    message = proto_cls(code="code", message="message")
    proto_literal_type = TypeEngine.to_literal_type(proto_cls)
    assert proto_literal_type.simple == SimpleType.STRUCT
    assert proto_literal_type.metadata["pb_type"] == "flyteidl.core.errors_pb2.ContainerError"

    as_literal = TypeEngine.to_literal(ctx, message, proto_cls, proto_literal_type)
    restored = TypeEngine.to_python_value(ctx, as_literal, proto_cls)
    assert restored == message

    # Error path: an integer primitive literal cannot be read back as the protobuf.
    integer_literal = Literal(scalar=Scalar(primitive=Primitive(integer=4)))
    with pytest.raises(AssertionError):
        TypeEngine.to_python_value(ctx, integer_literal, proto_cls)
def test_str_input(folders_and_files_setup):
    """A MyInput whose main_product comes straight from the fixture still converts to a generic literal."""
    proxy_config = MyProxyConfiguration(splat_data_dir="/tmp/proxy_splat", apriori_file="/opt/config/a_file")
    proxy_params = MyProxyParameters(id="pp_id", job_i_step=1)

    # Intentionally passing in the wrong type
    wrongly_typed_product = folders_and_files_setup[0]
    apriori_config = MyAprioriConfiguration(
        static_data_dir=FlyteDirectory("gs://my-bucket/one"),
        external_data_dir=FlyteDirectory("gs://my-bucket/two"),
    )
    my_input = MyInput(
        main_product=wrongly_typed_product,  # noqa
        apriori_config=apriori_config,
        proxy_config=proxy_config,
        proxy_params=proxy_params,
    )

    ctx = FlyteContextManager.current_context()
    input_literal_type = TypeEngine.to_literal_type(MyInput)
    input_literal = TypeEngine.to_literal(ctx, my_input, MyInput, input_literal_type)
    assert input_literal.scalar.generic is not None
def transform_type(x: type, description: str = None) -> _interface_models.Variable:
    """Wrap a Python type in an interface ``Variable``.

    :param x: Python type handed to ``TypeEngine.to_literal_type``.
    :param description: Text stored as the variable's description.
    :return: A ``Variable`` built from the resulting literal type and ``description``.
    """
    literal_type = TypeEngine.to_literal_type(x)
    return _interface_models.Variable(type=literal_type, description=description)
def test_types_pandas():
    """A bare pd.DataFrame maps to a PARQUET structured dataset type with no columns."""
    dataframe_literal_type = TypeEngine.to_literal_type(pd.DataFrame)
    sd_type = dataframe_literal_type.structured_dataset_type
    assert sd_type is not None
    assert sd_type.format == PARQUET
    assert sd_type.columns == []
def test_remaining_prims():
    """A schema with datetime, timedelta and bool columns survives guess_python_type."""
    schema_type = FlyteSchema[kwtypes(my_dt=datetime, my_td=timedelta, my_b=bool)]
    original_literal_type = TypeEngine.to_literal_type(schema_type)

    # Literal type -> guessed python type -> literal type must give back the same literal type.
    guessed_type = TypeEngine.guess_python_type(original_literal_type)
    regenerated_literal_type = TypeEngine.to_literal_type(guessed_type)
    assert original_literal_type == regenerated_literal_type
def test_schema_back_and_forth():
    """A schema with int and str columns survives guess_python_type."""
    schema_type = FlyteSchema[kwtypes(TrackId=int, Name=str)]
    original_literal_type = TypeEngine.to_literal_type(schema_type)

    # Literal type -> guessed python type -> literal type must give back the same literal type.
    guessed_type = TypeEngine.guess_python_type(original_literal_type)
    regenerated_literal_type = TypeEngine.to_literal_type(guessed_type)
    assert original_literal_type == regenerated_literal_type
def test_interface():
    """Exercise LiteralsResolver lookups, type guessing, caching and type-hint updates."""

    def string_literal(text):
        return Literal(scalar=Scalar(primitive=Primitive(string_value=text)))

    def integer_literal(number):
        return Literal(scalar=Scalar(primitive=Primitive(integer=number)))

    def variable_of(literal_type):
        return interface_models.Variable(type=literal_type, description="")

    ctx = FlyteContextManager.current_context()
    plain_df_literal_type = TypeEngine.to_literal_type(pd.DataFrame)
    df = pd.DataFrame({"name": ["Tom", "Joseph"], "age": [20, 22]})

    annotated_sd_type = Annotated[StructuredDataset, kwtypes(name=str, age=int)]
    df_literal_type = TypeEngine.to_literal_type(annotated_sd_type)
    sd_type = df_literal_type.structured_dataset_type
    assert sd_type is not None
    assert len(sd_type.columns) == 2
    for position, column_name in enumerate(("name", "age")):
        assert sd_type.columns[position].name == column_name
        assert sd_type.columns[position].literal_type.simple is not None

    # The literal is created against the column-less dataframe type.
    dataset = annotated_sd_type(df)
    sd_literal = TypeEngine.to_literal(ctx, dataset, python_type=annotated_sd_type, expected=plain_df_literal_type)

    literal_map = {
        "my_map": Literal(
            map=LiteralMap(
                literals={
                    "k1": string_literal("v1"),
                    "k2": string_literal("2"),
                },
            )
        ),
        "my_list": Literal(collection=LiteralCollection(literals=[integer_literal(n) for n in (1, 2, 3)])),
        "val_a": integer_literal(21828),
        "my_df": sd_literal,
    }

    variable_map = {
        "my_map": variable_of(TypeEngine.to_literal_type(typing.Dict[str, str])),
        "my_list": variable_of(TypeEngine.to_literal_type(typing.List[int])),
        "val_a": variable_of(TypeEngine.to_literal_type(int)),
        "my_df": variable_of(df_literal_type),
    }

    resolver = LiteralsResolver(literal_map, variable_map=variable_map, ctx=ctx)
    assert resolver._ctx is ctx

    # Unknown keys raise ValueError from both access styles.
    with pytest.raises(ValueError):
        resolver["not"]  # noqa

    with pytest.raises(ValueError):
        resolver.get_literal("not")

    # Plain [] access works; the python type is guessed from the Flyte type.
    list_value = resolver["my_list"]
    assert list_value == [1, 2, 3]

    # get() works too, again guessing from the Flyte type.
    map_value = resolver.get("my_map")
    assert map_value == {
        "k1": "v1",
        "k2": "2",
    }

    # get_literal hands back the very Literal object that was stored.
    assert resolver.get_literal("my_df") is sd_literal

    guessed_df = resolver["my_df"]
    # Obtained by guessing, so there is no column information.
    assert len(guessed_df.metadata.structured_dataset_type.columns) == 0

    # A second lookup returns the same object.
    guessed_df_again = resolver["my_df"]
    assert guessed_df is guessed_df_again

    # Supply the annotated type as a hint and drop the stored native value
    # so the next get() converts the literal again.
    resolver.update_type_hints({"my_df": annotated_sd_type})
    del resolver._native_values["my_df"]
    hinted_df = resolver.get("my_df")
    # Converted with the user-specified type, so the column count is correct.
    assert len(hinted_df.metadata.structured_dataset_type.columns) == 2