Beispiel #1
0
def test_ge_schema_runtimebatchrequest_sqlite_config():
    """Validate a SQLite-backed RuntimeBatchRequest through GreatExpectationsType.

    The task receives a SQL query string; the plugin config (``batch_identifiers``
    with no ``data_connector_query``) makes the plugin issue a RuntimeBatchRequest
    against the ``sqlite_data`` datasource. The task then runs the same query
    itself and returns the row count, which the workflow asserts.
    """
    ge_config = GreatExpectationsFlyteConfig(
        datasource_name="sqlite_data",
        expectation_suite_name="sqlite.movies",
        data_connector_name="sqlite_data_connector",
        data_asset_name="sqlite_data",
        batch_request_config=BatchRequestConfig(batch_identifiers={
            "pipeline_stage": "validation",
        }, ),
    )

    @task
    def my_task(sqlite_db: GreatExpectationsType[str, ge_config]) -> int:
        # Read the SQLite query results into a pandas DataFrame.
        con = sqlite3.connect(os.path.join("data/movies.sqlite"))
        # try/finally guarantees the connection is closed even when the
        # query fails (the original leaked the connection on exception).
        try:
            df = pd.read_sql_query("SELECT * FROM movies", con)
        finally:
            con.close()

        # Verify that the result of the SQL query landed in the DataFrame.
        return len(df)

    @workflow
    def my_wf() -> int:
        return my_task(sqlite_db="SELECT * FROM movies")

    result = my_wf()
    # Expected row count of the movies table in the sample database.
    assert result == 2736
Beispiel #2
0
 def my_task(
     dataframe: GreatExpectationsType[
         FlyteSchema,
         GreatExpectationsFlyteConfig(
             datasource_name="data",
             expectation_suite_name="test.demo",
             data_connector_name="data_flytetype_data_connector",
             batch_request_config=BatchRequestConfig(
                 data_connector_query={"limit": 10}),
             local_file_path="/tmp/test3.parquet",  # noqa: F722
         ), ]
 ) -> int:
     """Return the row count of the validated FlyteSchema dataframe."""
     # Validation already ran in the type engine; materialize the schema
     # contents and report how many rows it holds.
     full_frame = dataframe.open().all()
     return full_frame.shape[0]
Beispiel #3
0
def schema_task(
    dataframe: GreatExpectationsType[
        FlyteSchema,
        GreatExpectationsFlyteConfig(
            datasource_name="data",
            expectation_suite_name="test.demo",
            data_connector_name="data_flytetype_data_connector",
            batch_request_config=BatchRequestConfig(
                data_connector_query={"limit": 10}),
            local_file_path="/tmp/test.parquet",  # noqa: F722
            context_root_dir=CONTEXT_ROOT_DIR,
        ), ]
) -> int:
    """Return how many rows the validated dataframe contains."""
    # The type engine has already validated the data against ``test.demo``.
    row_count = dataframe.shape[0]
    return row_count
Beispiel #4
0
 def my_task(
     directory: GreatExpectationsType[
         str,
         GreatExpectationsFlyteConfig(
             datasource_name="data",
             expectation_suite_name="test.demo",
             data_connector_name="my_data_connector",
             batch_request_config=BatchRequestConfig(
                 data_connector_query={
                     "batch_filter_parameters": {
                         "year": "2019",
                         "month": "01",  # noqa: F722
                     },
                     "limit": 10,
                 }, ),
         ), ]
 ) -> str:
     """Echo the batch directory once Great Expectations validation passes.

     All validation work happens in the type engine via the batch filter
     (year 2019, month 01, at most 10 batches); the body just returns the
     input unchanged.
     """
     return directory
Beispiel #5
0
def simple_task(
    directory: GreatExpectationsType[
        str,
        GreatExpectationsFlyteConfig(
            datasource_name="data",
            expectation_suite_name="test.demo",
            data_connector_name="my_data_connector",
            batch_request_config=BatchRequestConfig(
                data_connector_query={
                    "batch_filter_parameters": {
                        "year": "2019",
                        "month": "01",  # noqa: F722
                    },
                    "limit": 10,
                }, ),
            context_root_dir=CONTEXT_ROOT_DIR,
        ), ]
) -> str:
    """Report a successful validation of the given batch *directory*."""
    # Validation is performed by the type engine before the body runs.
    message = f"Validation works for {directory}!"
    return message
Beispiel #6
0
def test_invalid_ge_batchrequest_pandas_config():
    """A batch filter matching no data must raise InvalidBatchRequestError."""
    # Filter for year 2020, for which the connector has no batches.
    bad_batch_config = BatchRequestConfig(
        data_connector_query={
            "batch_filter_parameters": {
                "year": "2020",
            },
        }
    )
    task_object = GreatExpectationsTask(
        name="test3",
        datasource_name="data",
        inputs=kwtypes(data=str),
        expectation_suite_name="test.demo",
        data_connector_name="my_data_connector",
        task_config=bad_batch_config,
    )

    # The plugin surfaces the underlying IndexError as InvalidBatchRequestError.
    with pytest.raises(InvalidBatchRequestError):
        task_object(data="my_assets")
Beispiel #7
0
def test_ge_runtimebatchrequest_sqlite_config():
    """Run a SQLite RuntimeBatchRequest where the dataset is a SQL query."""
    # batch_identifiers (without a data_connector_query) selects the
    # RuntimeBatchRequest code path in the plugin.
    runtime_batch_config = BatchRequestConfig(
        batch_identifiers={
            "pipeline_stage": "validation",
        },
    )
    task_object = GreatExpectationsTask(
        name="test4",
        datasource_name="sqlite_data",
        inputs=kwtypes(dataset=str),
        expectation_suite_name="sqlite.movies",
        data_connector_name="sqlite_data_connector",
        data_asset_name="sqlite_data",
        task_config=runtime_batch_config,
    )

    @workflow
    def runtime_sqlite_wf():
        # For a database-backed datasource the dataset must be a SQL query.
        task_object(dataset="SELECT * FROM movies")

    runtime_sqlite_wf()
Beispiel #8
0
def test_ge_batchrequest_pandas_config():
    """Validate a pandas asset selected via an ordinary BatchRequest filter."""
    # Restrict the request to January 2019 and at most 10 batches.
    batch_config = BatchRequestConfig(
        data_connector_query={
            "batch_filter_parameters": {
                "year": "2019",
                "month": "01",
            },
            "limit": 10,
        },
    )
    task_object = GreatExpectationsTask(
        name="test2",
        datasource_name="data",
        inputs=kwtypes(data=str),
        expectation_suite_name="test.demo",
        data_connector_name="my_data_connector",
        task_config=batch_config,
    )

    # "my_assets" is the asset name declared in the great_expectations.yml file.
    task_object(data="my_assets")
Beispiel #9
0
def test_ge_runtimebatchrequest_pandas_config():
    """Validate an in-memory pandas DataFrame via a RuntimeBatchRequest."""
    # batch_identifiers triggers the RuntimeBatchRequest path; the DataFrame
    # input becomes the batch's batch_data.
    runtime_config = BatchRequestConfig(
        batch_identifiers={
            "pipeline_stage": "validation",
        },
    )
    task_object = GreatExpectationsTask(
        name="test5",
        datasource_name="my_pandas_datasource",
        inputs=kwtypes(dataset=FlyteSchema),
        expectation_suite_name="test.demo",
        data_connector_name="my_runtime_data_connector",
        data_asset_name="pandas_data",
        task_config=runtime_config,
    )

    @workflow
    def runtime_pandas_wf(df: pd.DataFrame):
        task_object(dataset=df)

    runtime_pandas_wf(df=pd.read_csv("data/yellow_tripdata_sample_2019-01.csv"))
Beispiel #10
0
def test_ge_runtimebatchrequest_pandas_config():
    """Validate a pandas DataFrame through GreatExpectationsType with a RuntimeBatchRequest."""
    ge_config = GreatExpectationsFlyteConfig(
        datasource_name="my_pandas_datasource",
        expectation_suite_name="test.demo",
        data_connector_name="my_runtime_data_connector",
        data_asset_name="pandas_data",
        # batch_identifiers selects the RuntimeBatchRequest code path.
        batch_request_config=BatchRequestConfig(
            batch_identifiers={
                "pipeline_stage": "validation",
            },
        ),
    )

    @task
    def my_task(
            pandas_df: GreatExpectationsType[FlyteSchema, ge_config]) -> int:
        # Materialize the validated schema and report its row count.
        validated = pandas_df.open().all()
        return len(validated)

    @workflow
    def runtime_pandas_wf(df: pd.DataFrame):
        my_task(pandas_df=df)

    runtime_pandas_wf(
        df=pd.read_csv("data/yellow_tripdata_sample_2019-01.csv"))
Beispiel #11
0
def test_invalid_ge_schema_batchrequest_pandas_config():
    """A filter with no matching batches must raise InvalidBatchRequestError."""
    # Year 2020 matches no batch in the connector's data.
    invalid_batch_config = BatchRequestConfig(
        data_connector_query={
            "batch_filter_parameters": {
                "year": "2020",
            },
        }
    )
    ge_config = GreatExpectationsFlyteConfig(
        datasource_name="data",
        expectation_suite_name="test.demo",
        data_connector_name="my_data_connector",
        batch_request_config=invalid_batch_config,
    )

    @task
    def my_task(directory: GreatExpectationsType[str, ge_config]) -> str:
        return directory

    @workflow
    def my_wf():
        my_task(directory="my_assets")

    # The plugin surfaces the underlying IndexError as InvalidBatchRequestError.
    with pytest.raises(InvalidBatchRequestError):
        my_wf()
Beispiel #12
0
# .. note::
#   The plugin determines the type of request as ``RuntimeBatchRequest`` by analyzing the user-given data connector.
#
# We instantiate ``data_asset_name`` to associate it with the ``RuntimeBatchRequest``.
# The typical Great Expectations' batch_data (or) query is automatically populated with the dataset.
#
# .. note::
#   If you want to load a database table as a batch, your dataset has to be a SQL query.
runtime_ge_config = GreatExpectationsFlyteConfig(
    datasource_name="my_pandas_datasource",
    expectation_suite_name="test.demo",
    data_connector_name="my_runtime_data_connector",
    # The asset name ties the incoming dataset to the RuntimeBatchRequest.
    data_asset_name="validate_pandas_data",
    # batch_identifiers (rather than a data_connector_query) marks this
    # config as a RuntimeBatchRequest.
    batch_request_config=BatchRequestConfig(
        batch_identifiers={"pipeline_stage": "validation"},
    ),
    context_root_dir=CONTEXT_ROOT_DIR,
)


# %%
# We define a task to generate DataFrame from the CSV file.
@task
def runtime_to_df_task(csv_file: str) -> pd.DataFrame:
    """Load *csv_file* from the ``greatexpectations/data`` directory as a DataFrame."""
    csv_path = os.path.join("greatexpectations", "data", csv_file)
    return pd.read_csv(csv_path)


# %%