def test_schema_validation():
    """Schema validation of SQL Server guestbook.entries succeeds for every column."""
    # The same connection is used as both source and target (self-validation).
    mssql_conn = {
        "source_type": "MSSQL",
        "host": SQL_SERVER_HOST,
        "user": SQL_SERVER_USER,
        "password": SQL_SERVER_PASSWORD,
        "port": 1433,
        "database": "guestbook",
    }

    schema_config = {
        consts.CONFIG_SOURCE_CONN: mssql_conn,
        consts.CONFIG_TARGET_CONN: mssql_conn,
        consts.CONFIG_TYPE: "Schema",
        consts.CONFIG_SCHEMA_NAME: "dbo",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_FORMAT: "table",
    }

    report = data_validation.DataValidation(schema_config, verbose=True).execute()

    # Every row of the validation report must be marked successful.
    for record in report.to_dict(orient="records"):
        assert record["validation_status"] == consts.VALIDATION_STATUS_SUCCESS
# Example #2
def test_count_validator(count_config):
    """Aggregate validations from count_config yield positive source values."""
    report = data_validation.DataValidation(count_config, verbose=True).execute()

    def source_agg(validation_name):
        # First source aggregate value recorded for the named validation.
        return report[report["validation_name"] == validation_name][
            "source_agg_value"
        ].values[0]

    assert float(source_agg("count")) > 0
    assert float(source_agg("count_string_col")) > 0
    assert float(source_agg("avg_float_col")) > 0
    # The max timestamp must parse and lie after the Unix epoch.
    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    parsed_max = datetime.datetime.strptime(
        source_agg("max_timestamp_col"), "%Y-%m-%d %H:%M:%S%z"
    )
    assert parsed_max > epoch
    assert float(source_agg("min_int_col")) > 0
# Example #3
def test_count_validator():
    """Citibike aggregates are positive and source/target totals agree."""
    report = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True).execute()

    def source_agg(validation_name):
        # First source aggregate value recorded for the named validation.
        return report[report["validation_name"] == validation_name][
            "source_agg_value"
        ].values[0]

    # Each configured aggregate must come back strictly positive.
    for name in (
        "count",
        "count_tripduration",
        "avg_tripduration",
        "max_birth_year",
        "min_birth_year",
    ):
        assert float(source_agg(name)) > 0

    # Totals across all validations must match between source and target.
    source_total = report["source_agg_value"].astype(float).sum()
    target_total = report["target_agg_value"].astype(float).sum()
    assert source_total == target_total
# Example #4
def test_numeric_types():
    """Each numeric aggregate matches exactly between source and target."""
    validator = data_validation.DataValidation(CONFIG_NUMERIC_AGG_VALID, verbose=True)
    report = validator.execute()

    for record in report.to_dict(orient="records"):
        source_value = float(record["source_agg_value"])
        target_value = float(record["target_agg_value"])
        assert source_value == target_value
def test_mysql_count_invalid_host():
    """Count validation matches source/target; tolerates an unreachable MySQL."""
    try:
        report = data_validation.DataValidation(
            CONFIG_COUNT_VALID,
            verbose=False,
        ).execute()
        assert report["source_agg_value"][0] == report["target_agg_value"][0]
    except exceptions.DataClientConnectionFailure:
        # Local Testing will not work for MySQL
        pass
def test_schema_validation():
    """Schema validation reports success everywhere; tolerates no local MySQL."""
    try:
        report = data_validation.DataValidation(
            CONFIG_SCHEMA_VALID,
            verbose=False,
        ).execute()

        # All validation rows must carry the success status.
        for record in report.to_dict(orient="records"):
            status = record["validation_status"]
            assert status == consts.VALIDATION_STATUS_SUCCESS
    except exceptions.DataClientConnectionFailure:
        # Local Testing will not work for MySQL
        pass
# Example #7
def test_grouped_count_validator(grouped_config):
    """Grouped count validation yields one matching row per unique partition."""
    report = data_validation.DataValidation(grouped_config, verbose=True).execute()
    count_rows = list(report[report["validation_name"] == "count"].iterrows())

    # Check that all partitions are unique.
    unique_partitions = frozenset(report["group_by_columns"])
    assert len(count_rows) == len(unique_partitions)
    assert len(count_rows) > 1
    assert report["source_agg_value"].sum() == report["target_agg_value"].sum()

    # Every partition count is positive and identical on both sides.
    for _, count_row in count_rows:
        assert float(count_row["source_agg_value"]) > 0
        assert count_row["source_agg_value"] == count_row["target_agg_value"]
# Example #8
def test_postgres_count(cloud_sql):
    """Test count validation on Postgres instance"""
    # Source and target share the same connection (self-validation).
    column_config = {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        # Validation type.
        consts.CONFIG_TYPE: "Column",
        # Table under validation.
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            # COUNT(*) across the whole table.
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
            # COUNT of the guestname column.
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: "guestname",
                consts.CONFIG_TARGET_COLUMN: "guestname",
                consts.CONFIG_FIELD_ALIAS: "count_guestname",
            },
            # SUM of the entryid column.
            {
                consts.CONFIG_TYPE: "sum",
                consts.CONFIG_SOURCE_COLUMN: "entryid",
                consts.CONFIG_TARGET_COLUMN: "entryid",
                consts.CONFIG_FIELD_ALIAS: "sum_entryid",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }

    report = data_validation.DataValidation(
        column_config,
        verbose=False,
    ).execute()

    assert report["source_agg_value"].equals(report["target_agg_value"])
    # Aggregates are reported as strings, hence the lexicographic ordering.
    assert sorted(list(report["source_agg_value"])) == ["28", "7", "7"]
def test_sql_server_count(cloud_sql):
    """Test count validation on SQL Server instance"""
    # The same connection serves as both source and target.
    mssql_conn = {
        "source_type": "MSSQL",
        "host": SQL_SERVER_HOST,
        "user": SQL_SERVER_USER,
        "password": SQL_SERVER_PASSWORD,
        "port": 1433,
        "database": "guestbook",
    }

    column_config = {
        consts.CONFIG_SOURCE_CONN: mssql_conn,
        consts.CONFIG_TARGET_CONN: mssql_conn,
        # Validation type.
        consts.CONFIG_TYPE: "Column",
        # Table under validation.
        consts.CONFIG_SCHEMA_NAME: "dbo",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_AGGREGATES: [
            # COUNT(*) across the whole table.
            {
                consts.CONFIG_TYPE: "count",
                consts.CONFIG_SOURCE_COLUMN: None,
                consts.CONFIG_TARGET_COLUMN: None,
                consts.CONFIG_FIELD_ALIAS: "count",
            },
        ],
        consts.CONFIG_FORMAT: "table",
    }

    report = data_validation.DataValidation(
        column_config,
        verbose=False,
    ).execute()
    assert report["source_agg_value"][0] == report["target_agg_value"][0]
# Example #10
def test_schema_validation(cloud_sql):
    """Test schema validation on Postgres instance"""
    schema_config = {
        consts.CONFIG_SOURCE_CONN: CONN,
        consts.CONFIG_TARGET_CONN: CONN,
        consts.CONFIG_TYPE: "Schema",
        consts.CONFIG_SCHEMA_NAME: "public",
        consts.CONFIG_TABLE_NAME: "entries",
        consts.CONFIG_FORMAT: "table",
    }

    report = data_validation.DataValidation(
        schema_config,
        verbose=False,
    ).execute()

    # Every column comparison in the report must have succeeded.
    for record in report.to_dict(orient="records"):
        assert record["validation_status"] == consts.VALIDATION_STATUS_SUCCESS
# Example #11
def test_execute(module_under_test, fs):
    """Schema validator flags only the renamed column and passes the rest."""
    row_count = 1
    src_rows = _generate_fake_data(rows=row_count, second_range=0)
    _create_table_file(SOURCE_TABLE_FILE_PATH, _get_fake_json_data(src_rows))

    # Target data renames "id" -> "id_new", so both sides gain one unmatched column.
    tgt_rows = _generate_fake_data(
        rows=row_count, second_range=0, rename_columns={"id": "id_new"}
    )
    _create_table_file(TARGET_TABLE_FILE_PATH, _get_fake_json_data(tgt_rows))

    client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True)
    report = client.schema_validator.execute()
    fail_rows = report[
        report["validation_status"].str.contains(consts.VALIDATION_STATUS_FAIL)
    ]

    # One report row per source column, plus one for the unmatched target column.
    assert len(report) == len(src_rows[0]) + 1
    assert report["source_agg_value"].astype(float).sum() == 7
    assert report["target_agg_value"].astype(float).sum() == 7
    assert report.labels[0] == SAMPLE_SCHEMA_CONFIG[consts.CONFIG_LABELS]
    assert fail_rows["source_column_name"].to_list() == ["id", "N/A"]
    assert fail_rows["target_column_name"].to_list() == ["N/A", "id_new"]
# Example #12
    def validation_function(project):
        """Run a COUNT(*) validation on the public Citibike data in *project*."""
        from data_validation import data_validation
        from data_validation.result_handlers import bigquery as bqhandler

        BQ_CONN = {"source_type": "BigQuery", "project_id": project}

        GROUPED_CONFIG_COUNT_VALID = {
            # The same BigQuery connection is both source and target.
            "source_conn": BQ_CONN,
            "target_conn": BQ_CONN,
            # Validation type.
            "type": "Column",
            # Table under validation.
            "schema_name": "bigquery-public-data.new_york_citibike",
            "table_name": "citibike_trips",
            "aggregates": [
                {
                    "field_alias": "count",
                    # Tool will run a 'COUNT *' as the default aggregation
                    "source_column": None,
                    "target_column": None,
                    "type": "count",
                },
            ],
        }

        # Persist validation results to BigQuery in the same project.
        handler = bqhandler.BigQueryResultHandler.get_handler_for_project(project)
        validator = data_validation.DataValidation(
            GROUPED_CONFIG_COUNT_VALID,
            verbose=True,
            result_handler=handler,
        )
        validator.execute()
# Example #13
def test_schema_validation():
    """Every record of the schema validation report is successful."""
    report = data_validation.DataValidation(
        CONFIG_SCHEMA_VALIDATION, verbose=True
    ).execute()

    for record in report.to_dict(orient="records"):
        status = record["validation_status"]
        assert status == consts.VALIDATION_STATUS_SUCCESS
# Example #14
def test_count_validator():
    """Teradata count validation returns a positive, matching count."""
    report = data_validation.DataValidation(TERADATA_CONFIG, verbose=True).execute()
    source_count = report["source_agg_value"][0]
    assert int(source_count) > 0
    assert source_count == report["target_agg_value"][0]
# Example #15
            "target_column": "tripduration",
            "type": "sum",
        },
        {
            "field_alias": "max__tripduration",
            "source_column": "tripduration",
            "target_column": "tripduration",
            "type": "max",
        },
        {
            "field_alias": "min__tripduration",
            "source_column": "tripduration",
            "target_column": "tripduration",
            "type": "min",
        },
        {
            "field_alias": "avg__tripduration",
            "source_column": "tripduration",
            "target_column": "tripduration",
            "type": "avg",
        },
    ],
}

# Persist validation results to BigQuery under PROJECT_ID, then run the
# validation defined by CONFIG_COUNT_VALID and keep the report DataFrame.
result_handler = bqhandler.BigQueryResultHandler.get_handler_for_project(
    PROJECT_ID)
validator = data_validation.DataValidation(CONFIG_COUNT_VALID,
                                           verbose=True,
                                           result_handler=result_handler)
df = validator.execute()
# Example #16
def test_timestamp_aggs():
    """Timestamp aggregates agree between source and target."""
    report = data_validation.DataValidation(CONFIG_TIMESTAMP_AGGS).execute()
    for record in report.to_dict(orient="records"):
        assert record["source_agg_value"] == record["target_agg_value"]
# Example #17
def validate(config):
    """Run Data Validation against the supplied config."""
    result_frame = data_validation.DataValidation(config).execute()
    # Normalize the raw report before handing it back to the caller.
    return _clean_dataframe(result_frame)