Ejemplo n.º 1
0
def test_nonfunction_task_and_df_input():
    """Build an imperative workflow chaining a SQLite task into two reference
    tasks, then verify the serialized workflow's nodes and interface."""

    @reference_task(
        project="flytesnacks",
        domain="development",
        name="ref_t1",
        version="fast56d8ce2e373baf011f4d3532e45f0a9b",
    )
    def ref_t1(dataframe: pd.DataFrame, imputation_method: str = "median") -> pd.DataFrame:
        ...

    @reference_task(
        project="flytesnacks",
        domain="development",
        name="ref_t2",
        version="aedbd6fe44051c171fd966c280c5c3036f658831",
    )
    def ref_t2(dataframe: pd.DataFrame, split_mask: int, num_features: int) -> pd.DataFrame:
        ...

    builder = ImperativeWorkflow(name="core.feature_engineering.workflow.fe_wf")
    builder.add_workflow_input("sqlite_archive", FlyteFile[typing.TypeVar("sqlite")])

    query_task = SQLite3Task(
        name="dummy.sqlite.task",
        query_template="select * from data",
        inputs=kwtypes(),
        output_schema_type=FlyteSchema,
        task_config=SQLite3Config(uri="https://sample/data", compressed=True),
    )

    # Chain: sql -> ref_t1 -> ref_t2, then expose ref_t2's first output.
    sql_node = builder.add_task(query_task)
    t1_node = builder.add_task(
        ref_t1, dataframe=sql_node.outputs["results"], imputation_method="mean"
    )
    t2_node = builder.add_task(
        ref_t2, dataframe=t1_node.outputs["o0"], split_mask=24, num_features=15
    )
    builder.add_workflow_output("output_from_t3", t2_node.outputs["o0"], python_type=pd.DataFrame)

    spec = get_serializable(OrderedDict(), serialization_settings, builder)

    # Three nodes in total: the sql task plus both reference tasks.
    assert len(spec.template.nodes) == 3

    # Exactly one workflow input, serialized as a blob (the sqlite archive).
    assert len(spec.template.interface.inputs) == 1
    assert spec.template.interface.inputs["sqlite_archive"].type.blob is not None

    # Exactly one workflow output: a parquet-format structured dataset.
    assert len(spec.template.interface.outputs) == 1
    assert spec.template.interface.outputs["output_from_t3"].type.structured_dataset_type is not None
    assert spec.template.interface.outputs["output_from_t3"].type.structured_dataset_type == StructuredDatasetType(
        format="parquet"
    )
Ejemplo n.º 2
0
def test_execute_sqlite3_task(flyteclient, flyte_workflows_register,
                              flyte_remote_env):
    """Register an ad-hoc SQLite3 task on a live cluster, execute it, and
    sanity-check the columns of the returned dataframe."""
    remote = FlyteRemote(Config.auto(), PROJECT, "development")

    chinook_zip = "https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip"
    query = SQLite3Task(
        "basic_querying",
        query_template="select TrackId, Name from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        output_schema_type=FlyteSchema[kwtypes(TrackId=int, Name=str)],
        task_config=SQLite3Config(uri=chinook_zip, compressed=True),
    )

    registered = remote.register(query)
    execution = remote.execute(registered, inputs={"limit": 10}, wait=True)

    frame = execution.outputs["results"].open().all()
    # Compare by class name so this test avoids importing pandas directly.
    assert frame.__class__.__name__ == "DataFrame"
    assert "TrackId" in frame
    assert "Name" in frame
Ejemplo n.º 3
0
def test_task_serialization():
    """Serializing a SQLite3Task should yield the template-resolver container
    args, carry the raw query in the custom payload, and resolve an image."""
    query = SQLite3Task(
        "test",
        query_template="select TrackId, Name from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        output_schema_type=FlyteSchema[kwtypes(TrackId=int, Name=str)],
        task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
    )

    model = query.serialize_to_model(query.SERIALIZE_SETTINGS)

    expected_args = [
        "pyflyte-execute",
        "--inputs",
        "{{.input}}",
        "--output-prefix",
        "{{.outputPrefix}}",
        "--raw-output-data-prefix",
        "{{.rawOutputDataPrefix}}",
        "--resolver",
        "flytekit.core.python_customized_container_task.default_task_template_resolver",
        "--",
        "{{.taskTemplatePath}}",
        "flytekit.extras.sqlite3.task.SQLite3TaskExecutor",
    ]
    assert model.container.args == expected_args

    # The query itself travels in the task's custom dict.
    assert model.custom["query_template"] == "select TrackId, Name from tracks limit {{.inputs.limit}}"
    # A default container image must have been filled in.
    assert model.container.image != ""
Ejemplo n.º 4
0
def test_task_static():
    """A query with no declared inputs/outputs still runs and returns data."""
    no_schema_task = SQLite3Task(
        "test",
        query_template="select * from tracks",
        task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
    )

    # No output schema supplied, so no columns are declared.
    assert no_schema_task.output_columns is None

    frame = no_schema_task()
    assert frame is not None
Ejemplo n.º 5
0
def query_wf() -> int:
    """Run the sample chinook query inline and count the resulting columns."""
    frame = SQLite3Task(
        name="cookbook.sqlite3.sample_inline",
        query_template="select TrackId, Name from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        output_schema_type=FlyteSchema[kwtypes(TrackId=int, Name=str)],
        task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
    )(limit=100)
    return print_and_count_columns(df=frame)
Ejemplo n.º 6
0
def test_task_schema():
    """Declaring an output schema exposes the column list before execution."""
    query = SQLite3Task(
        "test",
        query_template="select TrackId, Name from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        output_schema_type=FlyteSchema[kwtypes(TrackId=int, Name=str)],
        task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
    )

    assert query.output_columns is not None
    result = query(limit=1)
    assert result is not None
Ejemplo n.º 7
0
def test_workflow():
    """A SQLite3Task's dataframe output can feed a downstream python task."""

    @task
    def count_rows(df: pandas.DataFrame) -> int:
        # Length of the first column == number of rows returned by the query.
        return len(df[df.columns[0]])

    query = SQLite3Task(
        "test",
        query_template="select * from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
    )

    @workflow
    def wf(limit: int) -> int:
        return count_rows(df=query(limit=limit))

    # Limiting to 5 rows must yield a count of 5.
    assert wf(limit=5) == 5
Ejemplo n.º 8
0
def test_task_schema():
    """Docs snippet: declare an output schema on a SQLite3Task and run it
    locally. The sqlite3_start/end markers anchor a literalinclude."""
    # sqlite3_start
    DB_LOCATION = "https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip"

    sql_task = SQLite3Task(
        "test",
        query_template="select TrackId, Name from tracks limit {{.inputs.limit}}",
        inputs=kwtypes(limit=int),
        output_schema_type=FlyteSchema[kwtypes(TrackId=int, Name=str)],
        task_config=SQLite3Config(uri=DB_LOCATION, compressed=True),
    )
    # sqlite3_end

    # A declared schema means the columns are known before execution.
    assert sql_task.output_columns is not None
    result = sql_task(limit=1)
    assert result is not None
Ejemplo n.º 9
0
    except client.exceptions.BucketAlreadyOwnedByYou:
        logger.info(f"Bucket {bucket_name} has already been created by you.")
        pass
    return bucket_name


# %%
# First task in the pipeline: the data source. Any task that fetches,
# generates, or reshapes data ready for feature ingestion could sit here,
# including arbitrary feature-engineering steps such as data imputation or
# univariate selection.
load_horse_colic_sql = SQLite3Task(
    name="sqlite3.load_horse_colic",
    query_template="select * from data",
    output_schema_type=FlyteSchema,
    task_config=SQLite3Config(uri=DATABASE_URI, compressed=True),
    # Cache the query result so repeated runs skip the download/query.
    metadata=TaskMetadata(cache=True, cache_version="1.0"),
)


# %%
# We define two tasks, namely ``store_offline`` and ``load_historical_features`` to store and retrieve the historical
# features.
#
# .. list-table:: Decoding the ``Feast`` Nomenclature
#    :widths: 25 25
#
Ejemplo n.º 10
0
import pandas

from flytekit import kwtypes, task, workflow
from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task

# https://www.sqlitetutorial.net/sqlite-sample-database/
from flytekit.types.schema import FlyteSchema

EXAMPLE_DB = "https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip"

# Logically part of test_task_static, but defined at module scope on purpose
# so that tracking of module-level task objects is exercised.
tk = SQLite3Task(
    "test",
    query_template="select * from tracks",
    task_config=SQLite3Config(uri=EXAMPLE_DB, compressed=True),
)


def test_task_static():
    """The module-level task declares no columns yet still produces data."""
    assert tk.output_columns is None

    frame = tk()
    assert frame is not None


def test_task_schema():
    # sqlite3_start
    DB_LOCATION = "https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip"
Ejemplo n.º 11
0
    name="great_expectations_task_schema",
    datasource_name="data",
    inputs=kwtypes(dataset=FlyteSchema),
    expectation_suite_name="sqlite.movies",
    data_connector_name="data_flytetype_data_connector",
    local_file_path="/tmp/test.parquet",
    context_root_dir=CONTEXT_ROOT_DIR,
)

# %%
# Fetch the DataFrame from our SQL database using the ``SQLite3Task``
# available within Flyte.
sql_to_df = SQLite3Task(
    name="greatexpectations.task.sqlite3",
    query_template="select * from movies",
    output_schema_type=FlyteSchema,
    task_config=SQLite3Config(uri=SQLITE_DATASET),
)

# %%
# Next, a task that validates the dataset and returns its column names.
@task(limits=Resources(mem="500Mi"))
def schema_task(dataset: pd.DataFrame) -> typing.List[str]:
    # Run the great-expectations validation on the incoming dataframe.
    schema_task_object(dataset=dataset)
    return list(dataset.columns)


# %%
# Finally, we define a workflow to fetch the DataFrame and validate it.
@workflow
def schema_wf() -> typing.List[str]:
    df = sql_to_df()