Example #1
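These snippets exercise Great Expectations' ConfiguredAssetSqlDataConnector against a SQLite test fixture. Each excerpt omits its module preamble; a plausible sketch of the shared imports, inferred from the identifiers used below:

import json
import random
from typing import List

import pytest
from ruamel.yaml import YAML

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import BatchDefinition, BatchRequest
from great_expectations.datasource.data_connector import (
    ConfiguredAssetSqlDataConnector,
)

yaml = YAML()  # the snippets call yaml.load(...) with a single argument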
def test_example_F(test_cases_for_sql_data_connector_sqlite_execution_engine):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    data_assets:
        table_partitioned_by_foreign_key__F:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: session_id
    """, )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_foreign_key__F"],
        "data_assets": {
            "table_partitioned_by_foreign_key__F": {
                "batch_definition_count":
                49,
                # TODO Abe 20201029 : These values should be sorted
                "example_data_references": [
                    {
                        "session_id": 3
                    },
                    {
                        "session_id": 2
                    },
                    {
                        "session_id": 4
                    },
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {
            "n_rows": 2,
            "batch_spec": {
                "table_name": "table_partitioned_by_foreign_key__F",
                "partition_definition": {
                    "session_id": 2
                },
                "splitter_method": "_split_on_column_value",
                "splitter_kwargs": {
                    "column_name": "session_id"
                },
            },
        },
    }
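For intuition, _split_on_column_value amounts to one batch per distinct value of the named column. A minimal self-contained sketch of that idea using plain sqlite3 (hypothetical table and rows, not the fixture above):

import sqlite3

# Hypothetical events table split on session_id: the splitter concept is
# one batch per distinct column value.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE events (session_id INTEGER, event TEXT)")
conn.executemany(
    "INSERT INTO events VALUES (?, ?)",
    [(1, "a"), (1, "b"), (2, "c"), (3, "d")],
)
distinct_values = [
    row[0] for row in conn.execute("SELECT DISTINCT session_id FROM events")
]
batches = {
    value: conn.execute(
        "SELECT * FROM events WHERE session_id = ?", (value,)
    ).fetchall()
    for value in distinct_values
}
assert len(batches) == 3  # one batch definition per distinct session_id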
Example #2
def test_example_A(test_cases_for_sql_data_connector_sqlite_execution_engine):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    data_assets:
        table_partitioned_by_date_column__A:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date

    """, )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_date_column__A"],
        "data_assets": {
            "table_partitioned_by_date_column__A": {
                "batch_definition_count":
                30,
                "example_data_references": [
                    {
                        "date": "2020-01-01"
                    },
                    {
                        "date": "2020-01-02"
                    },
                    {
                        "date": "2020-01-03"
                    },
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        "example_data_reference": {
            "n_rows": 8,
            "batch_spec": {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {
                    "date": "2020-01-02"
                },
                "splitter_method": "_split_on_column_value",
                "splitter_kwargs": {
                    "column_name": "date"
                },
            },
        },
    }
def test_basic_self_check(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    random.seed(0)
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            # table_name: events  # if table_name is omitted, it defaults to the asset name
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date
    """, )
    config["execution_engine"] = execution_engine

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_date_column__A"],
        "data_assets": {
            "table_partitioned_by_date_column__A": {
                "batch_definition_count":
                30,
                "example_data_references": [
                    {
                        "date": "2020-01-01"
                    },
                    {
                        "date": "2020-01-02"
                    },
                    {
                        "date": "2020-01-03"
                    },
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_date_column__A",
        #         "data_asset_name": "table_partitioned_by_date_column__A",
        #         "batch_identifiers": {"date": "2020-01-02"},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "date"},
        #     },
        # },
    }
def test_example_G(test_cases_for_sql_data_connector_sqlite_execution_engine):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_multiple_columns__G:
            splitter_method: _split_on_multi_column_values
            splitter_kwargs:
                column_names:
                    - y
                    - m
                    - d
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_multiple_columns__G"],
        "data_assets": {
            "table_partitioned_by_multiple_columns__G": {
                "batch_definition_count": 30,
                # TODO Abe 20201029: These values should be sorted
                "example_data_references": [
                    {"y": 2020, "m": 1, "d": 1},
                    {"y": 2020, "m": 1, "d": 2},
                    {"y": 2020, "m": 1, "d": 3},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_multiple_columns__G",
        #         "data_asset_name": "table_partitioned_by_multiple_columns__G",
        #         "batch_identifiers": {
        #             "y": 2020,
        #             "m": 1,
        #             "d": 2,
        #         },
        #         "splitter_method": "_split_on_multi_column_values",
        #         "splitter_kwargs": {"column_names": ["y", "m", "d"]},
        #     },
        # },
    }
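_split_on_multi_column_values generalizes the single-column split to the tuple of the named columns: one batch per distinct (y, m, d) combination. A small sketch with assumed data (30 daily rows, matching batch_definition_count above):

import datetime

# Assumed data: 30 consecutive days in January 2020, split on (y, m, d).
dates = [
    datetime.date(2020, 1, 1) + datetime.timedelta(days=i) for i in range(30)
]
combinations = {(d.year, d.month, d.day) for d in dates}
assert len(combinations) == 30  # one batch per distinct (y, m, d) tuple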
def test_example_B(test_cases_for_sql_data_connector_sqlite_execution_engine):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load("""
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_timestamp_column__B:
            splitter_method: _split_on_converted_datetime
            splitter_kwargs:
                column_name: timestamp
    """)
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names":
        ["table_partitioned_by_timestamp_column__B"],
        "data_assets": {
            "table_partitioned_by_timestamp_column__B": {
                "batch_definition_count":
                30,
                "example_data_references": [
                    {
                        "timestamp": "2020-01-01"
                    },
                    {
                        "timestamp": "2020-01-02"
                    },
                    {
                        "timestamp": "2020-01-03"
                    },
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_timestamp_column__B",
        #         "data_asset_name": "table_partitioned_by_timestamp_column__B",
        #         "batch_identifiers": {"timestamp": "2020-01-02"},
        #         "splitter_method": "_split_on_converted_datetime",
        #         "splitter_kwargs": {"column_name": "timestamp"},
        #     },
        # },
    }
def test_example_C(
    splitter_method_name_prefix,
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        f"""
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_regularly_spaced_incrementing_id_column__C:
            splitter_method: {splitter_method_name_prefix}split_on_divided_integer
            splitter_kwargs:
                column_name: id
                divisor: 10
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "table_partitioned_by_regularly_spaced_incrementing_id_column__C"
        ],
        "data_assets": {
            "table_partitioned_by_regularly_spaced_incrementing_id_column__C": {
                "batch_definition_count": 12,
                "example_data_references": [
                    {"id": 0},
                    {"id": 1},
                    {"id": 2},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 10,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
        #         "data_asset_name": "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
        #         "batch_identifiers": {"id": 1},
        #         "splitter_method": "_split_on_divided_integer",
        #         "splitter_kwargs": {"column_name": "id", "divisor": 10},
        #     },
        # },
    }
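The divided-integer splitter buckets rows by column_value // divisor. Assuming 120 regularly spaced ids (12 batches of 10 rows each, per the commented-out n_rows above), the arithmetic behind batch_definition_count == 12 is:

# Sketch: _split_on_divided_integer groups rows by id // divisor.
divisor = 10
ids = range(120)  # assumed id range for this table
groups = {i // divisor for i in ids}
assert len(groups) == 12  # matches batch_definition_count above
assert sorted(groups)[:3] == [0, 1, 2]  # the ids in example_data_references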
def test_more_complex_instantiation_of_ConfiguredAssetSqlDataConnector_include_schema_name_prefix_suffix(
    splitter_method_name_prefix,
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    my_data_connector: ConfiguredAssetSqlDataConnector = ConfiguredAssetSqlDataConnector(
        name="my_sql_data_connector",
        datasource_name="my_test_datasource",
        execution_engine="test_cases_for_sql_data_connector_sqlite_execution_engine",
        assets={
            "table_partitioned_by_date_column__A": {
                "splitter_method": f"{splitter_method_name_prefix}split_on_column_value",
                "splitter_kwargs": {"column_name": "date"},
                "include_schema_name": True,
                "schema_name": "main",
                "data_asset_name_prefix": "taxi__",
                "data_asset_name_suffix": "__asset",
            },
        },
    )
    assert (
        "taxi__main.table_partitioned_by_date_column__A__asset"
        in my_data_connector.assets
    )

    # schema_name provided, but include_schema_name is set to False
    with pytest.raises(ge_exceptions.DataConnectorError) as e:
        ConfiguredAssetSqlDataConnector(
            name="my_sql_data_connector",
            datasource_name="my_test_datasource",
            execution_engine="test_cases_for_sql_data_connector_sqlite_execution_engine",
            assets={
                "table_partitioned_by_date_column__A": {
                    "splitter_method": f"{splitter_method_name_prefix}split_on_column_value",
                    "splitter_kwargs": {"column_name": "date"},
                    "include_schema_name": False,
                    "schema_name": "main",
                    "data_asset_name_prefix": "taxi__",
                    "data_asset_name_suffix": "__asset",
                },
            },
        )
    assert (
        e.value.message
        == "ConfiguredAssetSqlDataConnector ran into an error while initializing Asset names. Schema main was specified, but 'include_schema_name' flag was set to False."
    )
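For reference, the resolved asset name asserted above is just the concatenation prefix + schema + "." + base name + suffix; a hypothetical illustration:

# Hypothetical illustration of how the resolved asset name is composed.
prefix, schema, base, suffix = (
    "taxi__",
    "main",
    "table_partitioned_by_date_column__A",
    "__asset",
)
assert (
    f"{prefix}{schema}.{base}{suffix}"
    == "taxi__main.table_partitioned_by_date_column__A__asset"
)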
def test_behavior_with_whole_table_splitter(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            splitter_method : "_split_on_whole_table"
            splitter_kwargs : {}
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)
    report_object = my_data_connector.self_check()
    print(json.dumps(report_object, indent=2))

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )
    )
    assert len(batch_definition_list) == 1
    assert batch_definition_list[0]["batch_identifiers"] == {}

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                data_connector_query={},
            )
        )
    )
    assert len(batch_definition_list) == 1
    assert batch_definition_list[0]["batch_identifiers"] == {}

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                data_connector_query={"batch_filter_parameters": {}},
            )
        )
    )
    assert len(batch_definition_list) == 1
    assert batch_definition_list[0]["batch_identifiers"] == {}
Example #9
def test_get_batch_definition_list_from_batch_request(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    data_assets:
        table_partitioned_by_date_column__A:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date

    """, )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)
    my_data_connector._refresh_data_references_cache()

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request={"partition_identifiers": {"date": "2020-01-01"}},
            )
        )
    )
    assert len(batch_definition_list) == 1

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                partition_request={"partition_identifiers": {}},
            )
        )
    )
    assert len(batch_definition_list) == 30

    # Note: Abe 20201109: It would be nice to put in safeguards for mistakes like this.
    # In this case, "date" should go inside "partition_identifiers".
    # Currently, the method ignores "date" entirely, and matches on too many partitions.
    # I don't think this is unique to ConfiguredAssetSqlDataConnector.
    # with pytest.raises(DataConnectorError) as e:
    #     batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
    #         batch_request=BatchRequest(
    #             datasource_name="FAKE_Datasource_NAME",
    #             data_connector_name="my_sql_data_connector",
    #             data_asset_name="table_partitioned_by_date_column__A",
    #             partition_request={
    #                 "partition_identifiers" : {},
    #                 "date" : "2020-01-01",
    #             }
    #     ))
    # assert "Unmatched key" in e.value.message

    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )
    )
    assert len(batch_definition_list) == 30

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
            )
        )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(datasource_name="FAKE_Datasource_NAME")
        )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest()
        )
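The safeguard suggested in the commented-out note above could be approximated by validating request keys up front; a hypothetical sketch (not the library's actual API):

# Hypothetical key check of the kind the note above asks for: reject
# top-level keys that should have gone inside "partition_identifiers".
ALLOWED_PARTITION_REQUEST_KEYS = {"partition_identifiers"}

def validate_partition_request(partition_request: dict) -> None:
    unmatched = set(partition_request) - ALLOWED_PARTITION_REQUEST_KEYS
    if unmatched:
        raise ValueError(f"Unmatched key(s): {sorted(unmatched)}")

validate_partition_request({"partition_identifiers": {}})  # passes
try:
    validate_partition_request(
        {"partition_identifiers": {}, "date": "2020-01-01"}
    )
except ValueError as e:
    print(e)  # Unmatched key(s): ['date']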
Example #10
def test_example_E(test_cases_for_sql_data_connector_sqlite_execution_engine):
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    data_assets:
        table_partitioned_by_incrementing_batch_id__E:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: batch_id
    """, )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_incrementing_batch_id__E"],
        "data_assets": {
            "table_partitioned_by_incrementing_batch_id__E": {
                "batch_definition_count": 11,
                "example_data_references": [
                    {"batch_id": 0},
                    {"batch_id": 1},
                    {"batch_id": 2},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 9,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_incrementing_batch_id__E",
        #         "data_asset_name": "table_partitioned_by_incrementing_batch_id__E",
        #         "partition_definition": {"batch_id": 1},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "batch_id"},
        #     },
        # },
    }
Example #11
                "class_name": "SqlAlchemyExecutionEngine",
                "connection_string": connection_string,
            },
        )

        # 2. Set sampler in data connector config
        data_connector_name: str = "test_data_connector"
        data_asset_name: str = table_name  # Read from generated table name
        column_name: str = taxi_splitting_test_cases.test_column_name
        data_connector: ConfiguredAssetSqlDataConnector = (
            ConfiguredAssetSqlDataConnector(
                name=data_connector_name,
                datasource_name=datasource_name,
                execution_engine=context.datasources[datasource_name].execution_engine,
                assets={
                    data_asset_name: {
                        "sampling_method": test_case.sampling_method_name,
                        "sampling_kwargs": test_case.sampling_kwargs,
                    }
                },
            )
        )

        # 3. Check if resulting batches are as expected
        # using data_connector.get_batch_definition_list_from_batch_request()
        batch_request: BatchRequest = BatchRequest(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
        )
        batch_definition_list: List[BatchDefinition] = (
            data_connector.get_batch_definition_list_from_batch_request(