Code Example #1
def test_config_variables_in_test_yaml_config(mock_emit,
                                              empty_data_context_stats_enabled,
                                              sa):
    context: DataContext = empty_data_context_stats_enabled

    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets",
                     "test_cases_for_sql_data_connector.db"),
    )

    context.save_config_variable("db_file", db_file)
    context.save_config_variable("data_connector_name",
                                 "my_very_awesome_data_connector")
    context.save_config_variable("suffix", "__whole_table")
    context.save_config_variable("sampling_n", "10")

    print(context.config_variables)

    first_config = """
class_name: SimpleSqlalchemyDatasource
connection_string: sqlite:///${db_file}

introspection:
    ${data_connector_name}:
        data_asset_name_suffix: ${suffix}
        sampling_method: _sample_using_limit
        sampling_kwargs:
            n: ${sampling_n}
"""

    my_datasource = context.test_yaml_config(first_config)
    assert ("test_cases_for_sql_data_connector.db"
            in my_datasource.execution_engine.connection_string)
    assert mock_emit.call_count == 1
    # Substitute anonymized names since it changes for each run
    anonymized_datasource_name = mock_emit.call_args_list[0][0][0][
        "event_payload"]["anonymized_name"]
    anonymized_data_connector_name = mock_emit.call_args_list[0][0][0][
        "event_payload"]["anonymized_data_connectors"][0]["anonymized_name"]
    expected_call_args_list = [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name":
                anonymized_datasource_name,
                "parent_class":
                "SimpleSqlalchemyDatasource",
                "anonymized_execution_engine": {
                    "parent_class": "SqlAlchemyExecutionEngine"
                },
                "anonymized_data_connectors": [{
                    "anonymized_name":
                    anonymized_data_connector_name,
                    "parent_class":
                    "InferredAssetSqlDataConnector",
                }],
            },
            "success": True,
        }),
    ]
    assert mock_emit.call_args_list == expected_call_args_list

    report_object = context.test_yaml_config(first_config,
                                             return_mode="report_object")
    print(json.dumps(report_object, indent=2))
    assert report_object["data_connectors"]["count"] == 1
    assert set(report_object["data_connectors"].keys()) == {
        "count",
        "my_very_awesome_data_connector",
    }
    assert mock_emit.call_count == 2
    expected_call_args_list.append(
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name":
                anonymized_datasource_name,
                "parent_class":
                "SimpleSqlalchemyDatasource",
                "anonymized_execution_engine": {
                    "parent_class": "SqlAlchemyExecutionEngine"
                },
                "anonymized_data_connectors": [{
                    "anonymized_name":
                    anonymized_data_connector_name,
                    "parent_class":
                    "InferredAssetSqlDataConnector",
                }],
            },
            "success": True,
        }), )
    assert mock_emit.call_args_list == expected_call_args_list
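
Every example below resolves its fixture paths with the same helper, file_relative_path (imported from great_expectations.data_context.util, as shown in Code Example #13). For reference, a minimal sketch of the behaviour these tests rely on follows: the helper joins a relative path onto the directory of the calling module, so fixture loading does not depend on the current working directory. The sketch is illustrative and is not copied from the library's source.

import os


def file_relative_path(dunderfile: str, relative_path: str) -> str:
    # Resolve relative_path against the directory that contains dunderfile.
    # Example: file_relative_path(__file__, "../test_sets/Titanic.csv") points
    # at the fixture stored next to the calling test module, regardless of cwd.
    return os.path.join(os.path.dirname(dunderfile), relative_path)
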
Code Example #2
def test_DataContext_raises_error_on_missing_config_version_aka_version_zero_with_v2_config(
):
    local_dir = file_relative_path(
        __file__, os.path.join(BASE_DIR, "version_2-0_but_no_version_defined"))
    with pytest.raises(ge_exceptions.InvalidDataContextConfigError):
        DataContext(local_dir)
Code Example #3
def test_project_upgrade_with_exception(v10_project_directory, caplog):
    # test project upgrade that requires manual steps

    # copy v2 yml
    shutil.copy(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/great_expectations_v1_basic_with_exception"
            ".yml",
        ),
        os.path.join(v10_project_directory, "great_expectations.yml"),
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["project", "upgrade", "-d", v10_project_directory],
        input="\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/test_project_upgrade_with_exception_expected_stdout.fixture",
            )) as f:
        expected_stdout = f.read()
        expected_stdout = expected_stdout.replace("GE_PROJECT_DIR",
                                                  v10_project_directory)
        assert stdout == expected_stdout

    expected_project_tree_str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
        validations/
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str = gen_directory_tree_str(v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number not incremented
    assert (DataContext.get_ge_config_version(
        context_root_dir=v10_project_directory) == 1)

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/UpgradeHelperV11_basic_upgrade_with_exception_log"
                ".json",
            )) as f:
        expected_upgrade_log_dict = json.load(f)
        expected_upgrade_log_str = json.dumps(expected_upgrade_log_dict)
        expected_upgrade_log_str = expected_upgrade_log_str.replace(
            "GE_PROJECT_DIR", v10_project_directory)
        expected_upgrade_log_str = expected_upgrade_log_str.replace(
            "GE_PATH",
            os.path.split(great_expectations.__file__)[0])
        expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
            f"{v10_project_directory}/uncommitted/logs/project_upgrades/UpgradeHelperV11_20190926T134241.000000Z"
            f".json") as f:
        obs_upgrade_log_dict = json.load(f)
        obs_upgrade_log_dict["exceptions"][0]["exception_message"] = ""

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
Code Example #4
def test_cli_init_on_existing_project_with_no_uncommitted_dirs_answering_yes_to_fixing_them(
    mock_webbrowser,
    caplog,
    tmp_path_factory,
):
    """
    This test walks through the onboarding experience.

    The user just checked an existing project out of source control and does
    not yet have an uncommitted directory.
    """
    root_dir = tmp_path_factory.mktemp("hiya")
    root_dir = str(root_dir)
    os.makedirs(os.path.join(root_dir, "data"))
    data_folder_path = os.path.join(root_dir, "data")
    data_path = os.path.join(root_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(
        __file__, os.path.join("..", "test_sets", "Titanic.csv"))
    shutil.copy(fixture_path, data_path)

    # Create a new project from scratch that we will use for the test in the next step

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", root_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(
            data_folder_path, data_path),
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/"
        .format(root_dir) in mock_webbrowser.call_args[0][0])

    assert "Great Expectations is now set up." in stdout

    context = DataContext(os.path.join(root_dir, DataContext.GE_DIR))
    uncommitted_dir = os.path.join(context.root_directory, "uncommitted")
    shutil.rmtree(uncommitted_dir)
    assert not os.path.isdir(uncommitted_dir)

    # Test the second invocation of init
    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
            UserWarning,
            match="Warning. An existing `great_expectations.yml` was found"):
        result = runner.invoke(cli, ["init", "-d", root_dir],
                               input="Y\nn\n",
                               catch_exceptions=False)
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Great Expectations added some missing files required to run." in stdout
    assert "You may see new files in" in stdout

    assert "OK. You must run" not in stdout
    assert "great_expectations init" not in stdout
    assert "to fix the missing files!" not in stdout
    assert "Would you like to build & view this project's Data Docs!?" in stdout

    assert os.path.isdir(uncommitted_dir)
    config_var_path = os.path.join(uncommitted_dir, "config_variables.yml")
    assert os.path.isfile(config_var_path)
    with open(config_var_path) as f:
        assert f.read() == CONFIG_VARIABLES_TEMPLATE

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code Example #5
def test_DataContext_raises_error_on_config_not_found():
    local_dir = file_relative_path(__file__, os.path.join(BASE_DIR, ""))
    with pytest.raises(ge_exceptions.ConfigNotFoundError):
        DataContext(local_dir)
Code Example #6
def test_cli_init_on_new_project(
    mock_emit, mock_webbrowser, caplog, tmp_path_factory, monkeypatch
):
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    os.makedirs(os.path.join(project_dir, "data"))
    data_folder_path = os.path.join(project_dir, "data")
    data_path = os.path.join(project_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(data_folder_path, data_path),
        catch_exceptions=False,
    )
    stdout = result.output
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert len(stdout) < 6000, "CLI output is unreasonably long."
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "What are you processing your files with" in stdout
    assert "Enter the path of a data file (relative or absolute, s3a:// and gs:// paths are ok too)" in stdout
    assert "Name the new Expectation Suite [Titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Done generating example Expectation Suite" in stdout
    assert "Great Expectations is now set up" in stdout

    assert os.path.isdir(os.path.join(project_dir, "great_expectations"))
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    with open(config_path) as config_file:
        config = yaml.load(config_file)
    data_source_class = config["datasources"]["data__dir"]["data_asset_type"]["class_name"]
    assert data_source_class == "PandasDataset"

    obs_tree = gen_directory_tree_str(os.path.join(project_dir, "great_expectations"))

    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", obs_tree
    )
    # print(guid_safe_obs_tree)
    assert (
        guid_safe_obs_tree
        == """great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
    expectations/
        Titanic/
            warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    Titanic/
                        warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    Titanic/
                        warning/
                            20190926T134241.000000Z/
                                20190926T134241.000000Z/
                                    foobarbazguid.html
        validations/
            Titanic/
                warning/
                    20190926T134241.000000Z/
                        20190926T134241.000000Z/
                            foobarbazguid.json
"""
    )

    assert mock_emit.call_count == 9
    assert mock_emit.call_args_list[1] == mock.call(
        {"event_payload": {}, "event": "cli.init.create", "success": True}
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code Example #7
def titanic_dataset():
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    batch_df = PandasDataset(df)

    return batch_df
Code Example #8
def taxicab_context():
    return DataContext(context_root_dir=file_relative_path(
        __file__, "./configs/great_expectations_taxicab_context.yml"))
Code Example #9
def test_file_format_map_output():
    incomplete_file_path = file_relative_path(
        __file__, "../test_sets/toy_data_incomplete.csv")
    incomplete_file_dat = ge.data_asset.FileDataAsset(incomplete_file_path)
    null_file_path = file_relative_path(__file__, "../test_sets/null_file.csv")
    null_file_dat = ge.data_asset.FileDataAsset(null_file_path)
    white_space_path = file_relative_path(__file__,
                                          "../test_sets/white_space.txt")
    white_space_dat = ge.data_asset.FileDataAsset(white_space_path)

    # Boolean Expectation Output
    expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BOOLEAN_ONLY",
        include_config=False,
    )
    expected_result = ExpectationValidationResult(success=False)
    assert expected_result == expectation

    # Empty File Expectations
    expectation = null_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        include_config=False,
    )
    expected_result = ExpectationValidationResult(
        success=None,
        result={
            "element_count": 0,
            "missing_count": 0,
            "missing_percent": None,
            "unexpected_count": 0,
            "unexpected_percent": None,
            "unexpected_percent_nonmissing": None,
            "partial_unexpected_list": [],
        },
    )

    assert expected_result == expectation

    # White Space File
    expectation = white_space_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        result_format="BASIC",
        include_config=False)
    expected_result = ExpectationValidationResult(
        success=None,
        result={
            "element_count": 11,
            "missing_count": 11,
            "missing_percent": 100.0,
            "unexpected_count": 0,
            "unexpected_percent": 0,
            "unexpected_percent_nonmissing": None,
            "partial_unexpected_list": [],
        },
    )

    assert expected_result == expectation

    # Complete Result Format
    expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="COMPLETE",
        include_config=False,
    )

    expected_result = ExpectationValidationResult(
        success=False,
        result={
            "element_count":
            9,
            "missing_count":
            2,
            "missing_percent": (2 / 9 * 100),
            "unexpected_count":
            3,
            "unexpected_percent": (3 / 9 * 100),
            "unexpected_percent_nonmissing": (3 / 7 * 100),
            "partial_unexpected_list": ["A,C,1\n", "B,1,4\n", "A,1,4\n"],
            "partial_unexpected_counts": [
                {
                    "value": "A,1,4\n",
                    "count": 1
                },
                {
                    "value": "A,C,1\n",
                    "count": 1
                },
                {
                    "value": "B,1,4\n",
                    "count": 1
                },
            ],
            "partial_unexpected_index_list": [0, 3, 5],
            "unexpected_list": ["A,C,1\n", "B,1,4\n", "A,1,4\n"],
            "unexpected_index_list": [0, 3, 5],
        },
    )

    assert expected_result == expectation

    # Invalid Result Format
    with pytest.raises(ValueError):
        expectation = incomplete_file_dat.expect_file_line_regex_match_count_to_equal(
            regex=r",\S",
            expected_count=3,
            skip=1,
            result_format="JOKE",
            include_config=False,
        )
Code Example #10
def bobby_columnar_table_multi_batch():
    """
    # TODO: <Alex>ALEX -- Add DocString</Alex>
    """

    verbose_profiler_config_file_path: str = file_relative_path(
        __file__, "bobby_user_workflow_verbose_profiler_config.yml"
    )
    verbose_profiler_config: str
    with open(verbose_profiler_config_file_path) as f:
        verbose_profiler_config = f.read()

    my_row_count_range_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"min_value": 7505, "max_value": 8495},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "table.row_count",
                            "domain_kwargs": {},
                        },
                        "num_batches": 2,
                    },
                },
            },
        ),
    ]

    my_column_ranges_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "VendorID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "VendorID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "VendorID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "VendorID",
                    "min_value": 4,
                    "max_value": 4,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "passenger_count"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "passenger_count",
                    "min_value": 0,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "passenger_count"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "passenger_count",
                    "min_value": 6,
                    "max_value": 6,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "trip_distance"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "trip_distance",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "trip_distance"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "trip_distance",
                    "min_value": 37.62,
                    "max_value": 57.85,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "RatecodeID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "RatecodeID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "RatecodeID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "RatecodeID",
                    "min_value": 5,
                    "max_value": 6,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "PULocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "PULocationID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "PULocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "PULocationID",
                    "min_value": 265,
                    "max_value": 265,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "DOLocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "DOLocationID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "DOLocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "DOLocationID",
                    "min_value": 265,
                    "max_value": 265,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "payment_type"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "payment_type",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "payment_type"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "payment_type",
                    "min_value": 4,
                    "max_value": 4,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "fare_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "fare_amount",
                    "min_value": -51.84,
                    "max_value": -21.16,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "fare_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "fare_amount",
                    "min_value": 228.94,
                    "max_value": 2990.05,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "extra"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "extra",
                    "min_value": -36.53,
                    "max_value": -1.18,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "extra"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "extra",
                    "min_value": 4.51,
                    "max_value": 6.99,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "mta_tax"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "mta_tax",
                    "min_value": -0.5,
                    "max_value": -0.5,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "mta_tax"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "mta_tax",
                    "min_value": 0.69,
                    "max_value": 37.32,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "tip_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tip_amount",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "tip_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tip_amount",
                    "min_value": 46.84,
                    "max_value": 74.86,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "tolls_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tolls_amount",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "tolls_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tolls_amount",
                    "min_value": 26.4,
                    "max_value": 497.67,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "improvement_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "improvement_surcharge",
                    "min_value": -0.3,
                    "max_value": -0.3,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "improvement_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "improvement_surcharge",
                    "min_value": 0.3,
                    "max_value": 0.3,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "total_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "total_amount",
                    "min_value": -52.66,
                    "max_value": -24.44,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "total_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "total_amount",
                    "min_value": 550.18,
                    "max_value": 2992.47,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "congestion_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "congestion_surcharge",
                    "min_value": -2.49,
                    "max_value": -0.01,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "congestion_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "congestion_surcharge",
                    "min_value": 0.01,
                    "max_value": 2.49,
                    "mostly": 1.0,
                },
            },
        ),
    ]

    expectation_configurations: List[ExpectationConfiguration] = []

    expectation_configurations.extend(
        my_row_count_range_rule_expectation_configurations_oneshot_sampling_method
    )
    expectation_configurations.extend(
        my_column_ranges_rule_expectation_configurations_oneshot_sampling_method
    )

    expectation_suite_name_oneshot_sampling_method: str = (
        "bobby_columnar_table_multi_batch_oneshot_sampling_method"
    )
    expected_expectation_suite_oneshot_sampling_method: ExpectationSuite = (
        ExpectationSuite(
            expectation_suite_name=expectation_suite_name_oneshot_sampling_method
        )
    )
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in expectation_configurations:
        expected_expectation_suite_oneshot_sampling_method.add_expectation(
            expectation_configuration
        )

    yaml = YAML()
    profiler_config: dict = yaml.load(verbose_profiler_config)
    expected_expectation_suite_oneshot_sampling_method.add_citation(
        comment="Suite created by Rule-Based Profiler with the configuration included.",
        profiler_config=profiler_config,
    )

    return {
        "profiler_config": verbose_profiler_config,
        "test_configuration_oneshot_sampling_method": {
            "expectation_suite_name": expectation_suite_name_oneshot_sampling_method,
            "expected_expectation_suite": expected_expectation_suite_oneshot_sampling_method,
        },
    }
Code Example #11
def test_validate():

    with open(
            file_relative_path(__file__,
                               "./test_sets/titanic_expectations.json")) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        results = my_df.validate(catch_exceptions=False)

    with open(
            file_relative_path(
                __file__,
                "./test_sets/titanic_expected_data_asset_validate_results.json"
            )) as f:
        expected_results = expectationSuiteValidationResultSchema.loads(
            f.read())

    del results.meta["great_expectations.__version__"]

    assert expected_results == results

    # Now, change the results and ensure they are no longer equal
    results.results[0] = ExpectationValidationResult()
    assert expected_results != results

    # Finally, confirm that only_return_failures works
    # and does not affect the "statistics" field.
    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        validation_results = my_df.validate(only_return_failures=True)
        del validation_results.meta["great_expectations.__version__"]

    expected_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "titanic",
            "run_id": "19551105T000000.000000Z",
            "batch_kwargs": {
                "ge_batch_id": "1234"
            },
            "batch_markers": {},
            "batch_parameters": {},
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_in_set",
                    kwargs={
                        "column": "PClass",
                        "value_set": ["1st", "2nd", "3rd"]
                    },
                ),
                success=False,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "partial_unexpected_index_list": [456],
                    "unexpected_count": 1,
                    "unexpected_list": ["*"],
                    "unexpected_percent": 0.07616146230007616,
                    "element_count": 1313,
                    "missing_percent": 0.0,
                    "partial_unexpected_counts": [{
                        "count": 1,
                        "value": "*"
                    }],
                    "partial_unexpected_list": ["*"],
                    "unexpected_percent_nonmissing": 0.07616146230007616,
                    "missing_count": 0,
                    "unexpected_index_list": [456],
                },
            )
        ],
        success=expected_results.success,  # unaffected
        statistics=expected_results["statistics"],  # unaffected
    )
    assert expected_results == validation_results
Code Example #12
def __init__(self,
             requirements_relative_base_dir: str = "../../../") -> None:
    self._requirements_relative_base_dir = file_relative_path(
        __file__, requirements_relative_base_dir)
    self._dev_requirements_prefix: str = "requirements-dev"
Code Example #13
import os
import shutil
import subprocess
import sys

import pytest

from assets.scripts.build_gallery import execute_shell_command
from great_expectations.data_context.util import file_relative_path

integration_test_matrix = [
    {
        "name": "pandas_two_batch_requests_two_validators",
        "base_dir": file_relative_path(__file__, "../../"),
        "data_context_dir":
        "tests/integration/fixtures/yellow_trip_data_pandas_fixture/great_expectations",
        "data_dir": "tests/test_sets/taxi_yellow_trip_data_samples",
        "user_flow_script":
        "tests/integration/fixtures/yellow_trip_data_pandas_fixture/two_batch_requests_two_validators.py",
        "expected_stderrs": "",
        "expected_stdouts": "",
    },
]


def idfn(test_configuration):
    return test_configuration.get("name")


@pytest.mark.docs
@pytest.mark.integration
Code Example #14
def test_snowflake_key_pair_credentials(mock_prompt,
                                        basic_sqlalchemy_datasource):
    database_key_path_pass = file_relative_path(
        __file__, "../../test_fixtures/database_key_test.p8")

    mock_prompt.side_effect = [
        "3",
        "user",
        "ABCD.us-east-1",
        "default_db",
        "default_schema",
        "xsmall",
        "public",
        database_key_path_pass,
        "test123",
    ]

    credentials = _collect_snowflake_credentials(None)

    assert credentials == {
        "drivername": "snowflake",
        "database": "default_db",
        "host": "ABCD.us-east-1",
        "private_key_path": database_key_path_pass,
        "private_key_passphrase": "test123",
        "query": {
            "role": "public",
            "schema": "default_schema",
            "warehouse": "xsmall"
        },
        "username": "******",
    }

    # making sure with the correct params the key is read correctly
    basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
        "snowflake", deepcopy(credentials))

    # check that with a bad pass phrase an informative message is returned to the user
    credentials["private_key_passphrase"] = "bad_pass"
    with pytest.raises(DatasourceKeyPairAuthBadPassphraseError) as e:
        basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
            "snowflake", deepcopy(credentials))

    assert "passphrase incorrect" in e.value.message

    # check that with no pass the key is read correctly
    database_key_path_no_pass = file_relative_path(
        __file__, "../../test_fixtures/database_key_test_no_pass.p8")
    credentials["private_key_path"] = database_key_path_no_pass
    credentials["private_key_passphrase"] = ""
    (
        sqlalchemy_uri,
        create_engine_kwargs,
    ) = basic_sqlalchemy_datasource._get_sqlalchemy_key_pair_auth_url(
        "snowflake", deepcopy(credentials))

    assert (
        str(sqlalchemy_uri) ==
        "snowflake://[email protected]/default_db?role=public&schema=default_schema&warehouse=xsmall"
    )
    assert create_engine_kwargs.get("connect_args", {}).get(
        "private_key", "")  # check that the private_key is not empty
Code Example #15
def test_v2_to_v3_project_upgrade_without_manual_steps(
        v20_project_directory_with_v30_configuration_and_no_checkpoints,
        caplog):
    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        [
            "-c",
            v20_project_directory_with_v30_configuration_and_no_checkpoints,
            "--v3-api",
            "project",
            "upgrade",
        ],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/test_v2_to_v3_project_upgrade_without_manual_steps_expected_stdout.fixture",
            )) as f:
        expected_stdout: str = f.read()
        expected_stdout = expected_stdout.replace(
            "GE_PROJECT_DIR",
            v20_project_directory_with_v30_configuration_and_no_checkpoints,
        )
        assert stdout == expected_stdout

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            styles/
                data_docs_custom_styles.css
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV13_20210119T132639.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(
        startpath=v20_project_directory_with_v30_configuration_and_no_checkpoints
    )
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number incremented
    assert (
        DataContext.get_ge_config_version(
            context_root_dir=v20_project_directory_with_v30_configuration_and_no_checkpoints
        )
        == 3.0
    )

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/UpgradeHelperV13_upgrade_without_manual_steps_log.json",
            )) as f:
        expected_upgrade_log_dict: dict = json.load(f)
        expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
        expected_upgrade_log_str = expected_upgrade_log_str.replace(
            "GE_PROJECT_DIR",
            v20_project_directory_with_v30_configuration_and_no_checkpoints,
        )
        expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
            f"{v20_project_directory_with_v30_configuration_and_no_checkpoints}/uncommitted/logs/project_upgrades/UpgradeHelperV13_20210119T132639.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
Code Example #16
def test_expectation_suite_filedata_asset():
    # Load in data files
    file_path = file_relative_path(__file__,
                                   "../test_sets/toy_data_complete.csv")

    # Create FileDataAsset objects
    f_dat = ge.data_asset.FileDataAsset(file_path)

    # Set up expectations
    f_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        catch_exceptions=True,
    )

    f_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S",
        expected_max_count=2,
        skip=1,
        result_format="SUMMARY",
        include_config=True,
    )

    # Test basic config output
    complete_config = f_dat.get_expectation_suite()
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs=ExpectationKwargs(expected_count=3, regex=",\\S", skip=1),
        )
    ] == complete_config.expectations

    # Include result format kwargs
    complete_config2 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=False)
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_be_between",
            kwargs={
                "expected_max_count": 2,
                "regex": ",\\S",
                "result_format": "SUMMARY",
                "skip": 1,
            },
        ),
    ] == complete_config2.expectations

    # Discard Failing Expectations
    complete_config3 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=True)

    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        )
    ] == complete_config3.expectations
Code Example #17
def _load_script_template() -> str:
    with open(file_relative_path(__file__,
                                 "checkpoint_script_template.py")) as f:
        template = f.read()
    return template
Code Example #18
def test_pandas_source_read_csv(
    data_context_parameterized_expectation_suite, tmp_path_factory
):
    basedir = tmp_path_factory.mktemp("test_create_pandas_datasource")
    shutil.copy(file_relative_path(__file__, "../test_sets/unicode.csv"), basedir)
    data_context_parameterized_expectation_suite.add_datasource(
        "mysource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        reader_options={"encoding": "utf-8"},
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
            }
        },
    )

    data_context_parameterized_expectation_suite.create_expectation_suite(
        expectation_suite_name="unicode"
    )
    batch = data_context_parameterized_expectation_suite.get_batch(
        data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource", "subdir_reader", "unicode"
        ),
        "unicode",
    )
    assert len(batch["Μ"]) == 1
    assert "😁" in list(batch["Μ"])

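    # A second datasource created without reader_options should still read the UTF-8 file with pandas defaults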
    data_context_parameterized_expectation_suite.add_datasource(
        "mysource2",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
            }
        },
    )

    batch = data_context_parameterized_expectation_suite.get_batch(
        data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource2", "subdir_reader", "unicode"
        ),
        "unicode",
    )
    assert "😁" in list(batch["Μ"])

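    # A third datasource configured for UTF-16 should fail to decode the UTF-8 fixture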
    data_context_parameterized_expectation_suite.add_datasource(
        "mysource3",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(basedir),
                "reader_options": {"encoding": "utf-16"},
            }
        },
    )

    with pytest.raises(UnicodeError, match="UTF-16 stream does not start with BOM"):
        batch = data_context_parameterized_expectation_suite.get_batch(
            data_context_parameterized_expectation_suite.build_batch_kwargs(
                "mysource3", "subdir_reader", "unicode"
            ),
            "unicode",
        )

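    # Overriding the encoding in batch_kwargs with an unknown codec should raise LookupError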
    with pytest.raises(LookupError, match="unknown encoding: blarg"):
        batch_kwargs = data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource3", "subdir_reader", "unicode"
        )
        batch_kwargs.update({"reader_options": {"encoding": "blarg"}})
        batch = data_context_parameterized_expectation_suite.get_batch(
            batch_kwargs=batch_kwargs, expectation_suite_name="unicode"
        )

    with pytest.raises(LookupError, match="unknown encoding: blarg"):
        batch = data_context_parameterized_expectation_suite.get_batch(
            expectation_suite_name="unicode",
            batch_kwargs=data_context_parameterized_expectation_suite.build_batch_kwargs(
                "mysource",
                "subdir_reader",
                "unicode",
                reader_options={"encoding": "blarg"},
            ),
        )

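    # An explicit UTF-8 override supplied through build_batch_kwargs still reads the file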
    batch = data_context_parameterized_expectation_suite.get_batch(
        batch_kwargs=data_context_parameterized_expectation_suite.build_batch_kwargs(
            "mysource2",
            "subdir_reader",
            "unicode",
            reader_options={"encoding": "utf-8"},
        ),
        expectation_suite_name="unicode",
    )
    assert "😁" in list(batch["Μ"])
Code Example #19
def copy_relative_path(relative_src, dest):
    shutil.copy(file_relative_path(__file__, relative_src), dest)
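
A minimal usage sketch, assuming the helper is called from a pytest test: it stages a fixture file into pytest's built-in tmp_path directory before the test reads it. The test name is hypothetical; the fixture path mirrors paths used in the surrounding examples.

def test_stage_titanic_fixture(tmp_path):
    # Copy the Titanic fixture (resolved relative to this test module) into tmp_path.
    copy_relative_path("../test_sets/Titanic.csv", str(tmp_path))
    assert (tmp_path / "Titanic.csv").exists()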
Code Example #20
def test_expect_file_line_regex_match_count_to_be_between():

    # Invalid File Path
    joke_file_path = "joke.txt"
    assert not os.path.isfile(joke_file_path)
    joke_dat = ge.data_asset.FileDataAsset(joke_file_path)

    with pytest.raises(IOError):
        joke_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=4, skip=1
        )

    complete_file_path = file_relative_path(
        __file__, "../test_sets/toy_data_complete.csv"
    )
    file_dat = ge.data_asset.FileDataAsset(complete_file_path)

    # Invalid Skip Parameter
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=4, skip=2.4
        )

    # Invalid Regex
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=2, expected_min_count=1, expected_max_count=8, skip=2
        )

    # Non-integer min value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=1.3, expected_max_count=8, skip=1
        )

    # Negative min value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=-2, expected_max_count=8, skip=1
        )

    # Non-integer max value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count="foo", skip=1
        )

    # Negative max value
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=0, expected_max_count=-1, skip=1
        )

    # Min count more than max count
    with pytest.raises(ValueError):
        file_dat.expect_file_line_regex_match_count_to_be_between(
            regex=r",\S", expected_min_count=4, expected_max_count=3, skip=1
        )

    # Count does not fall in range
    fail_trial = file_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S", expected_min_count=9, expected_max_count=12, skip=1
    )

    assert not fail_trial.success
    assert fail_trial.result["unexpected_percent"] == 100
    assert fail_trial.result["missing_percent"] == 0

    # Count does fall in range
    success_trial = file_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S", expected_min_count=0, expected_max_count=4, skip=1
    )

    assert success_trial.success
    assert success_trial.result["unexpected_percent"] == 0
    assert success_trial.result["missing_percent"] == 0
Code Example #21
def test_profiler_all_expectation_types(titanic_data_context,
                                        possible_expectations_set):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    context = titanic_data_context
    df = ge.read_csv(
        file_relative_path(
            __file__,
            "../test_sets/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01.csv",
        ))
    batch_df = ge.dataset.PandasDataset(df)

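    # Columns excluded from profiling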
    ignored_columns = [
        "pickup_location_id",
        "dropoff_location_id",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "congestion_surcharge",
    ]
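    # Map each profiler semantic type to the columns it should drive expectations for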
    semantic_types = {
        "datetime": ["pickup_datetime", "dropoff_datetime"],
        "numeric": ["total_amount", "passenger_count"],
        "value_set": [
            "payment_type",
            "rate_code_id",
            "store_and_fwd_flag",
            "passenger_count",
        ],
        "boolean": ["store_and_fwd_flag"],
    }

    profiler = UserConfigurableProfiler(
        batch_df,
        semantic_types_dict=semantic_types,
        ignored_columns=ignored_columns,
        primary_or_compound_key=[
            "vendor_id",
            "pickup_datetime",
            "dropoff_datetime",
            "trip_distance",
            "pickup_location_id",
            "dropoff_location_id",
        ],
    )

    assert profiler.column_info.get("rate_code_id")
    suite = profiler.build_suite()
    assert len(suite.expectations) == 46
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    unexpected_expectations = {
        "expect_column_values_to_be_unique",
        "expect_column_values_to_be_null",
    }
    assert expectations_from_suite == {
        i
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator("action_list_operator",
                                              assets_to_validate=[batch_df])

    assert results["success"]
Code Example #22
    def test_validate_distribution_parameters(self):
        D = ge.read_csv(
            file_relative_path(
                __file__,
                "../test_sets/fixed_distributional_test_dataset.csv"))

        # ------ p_value ------
        with self.assertRaises(ValueError):
            # p_value is 0
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 1], p_value=0)
        with self.assertRaises(ValueError):
            # p_value negative
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 1], p_value=-0.1)
        with self.assertRaises(ValueError):
            # p_value is 1
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 1], p_value=1)

        with self.assertRaises(ValueError):
            # p_value greater than 1
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 1], p_value=1.1)
        with self.assertRaises(ValueError):
            # params is none
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=None)

        # ---- std_dev ------
        with self.assertRaises(ValueError):
            # std_dev is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params={
                    "mean": 0,
                    "std_dev": 0
                })
        with self.assertRaises(ValueError):
            # std_dev is negative, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params={
                    "mean": 0,
                    "std_dev": -1
                })
        with self.assertRaises(ValueError):
            # std_dev is 0, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 0])
        with self.assertRaises(ValueError):
            # std_dev is negative, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, -1])

        # ------- beta ------
        with self.assertRaises(ValueError):
            # beta, alpha is 0, dict params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={
                    "alpha": 0,
                    "beta": 0.1
                })
        with self.assertRaises(ValueError):
            # beta, alpha is negative, dict params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={
                    "alpha": -1,
                    "beta": 0.1
                })
        with self.assertRaises(ValueError):
            # beta, beta is 0, dict params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={
                    "alpha": 0.1,
                    "beta": 0
                })
        with self.assertRaises(ValueError):
            # beta, beta is negative, dict params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={
                    "alpha": 0,
                    "beta": -1
                })
        with self.assertRaises(ValueError):
            # beta, alpha is 0, list params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[0, 0.1])
        with self.assertRaises(ValueError):
            # beta, alpha is negative, list params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[-1, 0.1])
        with self.assertRaises(ValueError):
            # beta, beta is 0, list params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[0.1, 0])
        with self.assertRaises(ValueError):
            # beta, beta is negative, list params
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[0.1, -1])

        with self.assertRaises(ValueError):
            # beta, missing alpha, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={"beta": 0.1})
        with self.assertRaises(ValueError):
            # beta, missing beta, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params={"alpha": 0.1})
        with self.assertRaises(ValueError):
            # beta, missing beta, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[1])
        with self.assertRaises(ValueError):
            # beta, missing beta, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "beta", distribution="beta", params=[1, 1, 1, 1, 1])

        # ------ Gamma -------
        with self.assertRaises(ValueError):
            # gamma, alpha is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params={"alpha": 0})
        with self.assertRaises(ValueError):
            # gamma, alpha is negative, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params={"alpha": -1})
        with self.assertRaises(ValueError):
            # gamma, alpha is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params={"alpha": 0})
        with self.assertRaises(ValueError):
            # gamma, alpha is missing, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params={})
        with self.assertRaises(ValueError):
            # gamma, alpha is missing, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params=[])
        with self.assertRaises(ValueError):
            # gamma, alpha is 0, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params=[0])
        with self.assertRaises(ValueError):
            # gamma, alpha is negative, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params=[-1])
        with self.assertRaises(ValueError):
            # gamma, too many arguments, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "gamma", distribution="gamma", params=[1, 1, 1, 1])

        # ----- chi2 --------
        with self.assertRaises(ValueError):
            # chi2, df is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params={"df": 0})
        with self.assertRaises(ValueError):
            # chi2, df is negative, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params={"df": -1})
        with self.assertRaises(ValueError):
            # chi2, df is missing, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params={})
        with self.assertRaises(ValueError):
            # chi2, df is 0, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params=[0])
        with self.assertRaises(ValueError):
            # chi2, df is negative, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params=[-1])
        with self.assertRaises(ValueError):
            # chi2, df is missing, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params=[])
        with self.assertRaises(ValueError):
            # chi2, too many parameters, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "chi2", distribution="chi2", params=[1, 1, 1, 5])
        # ----- norm ------
        with self.assertRaises(ValueError):
            # norm, too many arguments, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "norm", distribution="norm", params=[0, 1, 500])

        # ----- uniform -----
        with self.assertRaises(ValueError):
            # uniform, scale is 0, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "uniform", distribution="uniform", params=[0, 0])
        with self.assertRaises(ValueError):
            # uniform, scale is negative, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "uniform", distribution="uniform", params=[0, -1])
        with self.assertRaises(ValueError):
            # uniform, scale is negative, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "uniform",
                distribution="uniform",
                params={
                    "loc": 0,
                    "scale": -1
                })
        with self.assertRaises(ValueError):
            # uniform, scale is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "uniform",
                distribution="uniform",
                params={
                    "loc": 0,
                    "scale": 0
                })

        with self.assertRaises(ValueError):
            # uniform, too many parameters, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "uniform", distribution="uniform", params=[0, 1, 500])

        # --- expon ---
        with self.assertRaises(ValueError):
            # expon, scale is 0, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential", distribution="expon", params=[0, 0])
        with self.assertRaises(ValueError):
            # expon, scale is negative, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential", distribution="expon", params=[0, -1])
        with self.assertRaises(ValueError):
            # expon, scale is 0, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential",
                distribution="expon",
                params={
                    "loc": 0,
                    "scale": 0
                })
        with self.assertRaises(ValueError):
            # expon, scale is negative, dict
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential",
                distribution="expon",
                params={
                    "loc": 0,
                    "scale": -1
                })
        with self.assertRaises(ValueError):
            # expon, too many parameters, list
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential", distribution="expon", params=[0, 1, 500])

        # --- misc ---
        with self.assertRaises(AttributeError):
            # non-supported distribution
            D.expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
                "exponential", distribution="fakedistribution", params=[0, 1])
Code Example #23
def test_get_batch_list_from_new_style_datasource_with_file_system_datasource_configured_assets(
        empty_data_context, tmp_path_factory):
    context = empty_data_context

    base_directory = str(
        tmp_path_factory.mktemp(
            "test_get_batch_list_from_new_style_datasource_with_file_system_datasource_configured_assets"
        ))

    titanic_asset_base_directory_path: str = os.path.join(
        base_directory, "data")
    os.makedirs(titanic_asset_base_directory_path)

    titanic_csv_source_file_path: str = file_relative_path(
        __file__, "../test_sets/Titanic.csv")
    titanic_csv_destination_file_path: str = str(
        os.path.join(base_directory, "data/Titanic_19120414_1313.csv"))
    shutil.copy(titanic_csv_source_file_path,
                titanic_csv_destination_file_path)

    config = yaml.load(
        f"""
class_name: Datasource

execution_engine:
    class_name: PandasExecutionEngine

data_connectors:
    my_data_connector:
        class_name: ConfiguredAssetFilesystemDataConnector
        base_directory: {base_directory}
        glob_directive: "*.csv"

        default_regex:
            pattern: (.+)\\.csv
            group_names:
                - name
        assets:
            Titanic:
                base_directory: {titanic_asset_base_directory_path}
                pattern: (.+)_(\\d+)_(\\d+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - size
    """, )

    context.add_datasource(
        "my_datasource",
        **config,
    )

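    # Request the single Titanic batch by its partition identifiers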
    batch_request: Union[dict, BatchRequest] = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_data_connector",
        "data_asset_name": "Titanic",
        "partition_request": {
            "batch_identifiers": {
                "name": "Titanic",
                "timestamp": "19120414",
                "size": "1313",
            }
        },
    }
    batch_list: List[Batch] = context.get_batch_list(**batch_request)

    assert len(batch_list) == 1

    batch: Batch = batch_list[0]
    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == "Titanic"
    assert batch.batch_definition["partition_definition"] == {
        "name": "Titanic",
        "timestamp": "19120414",
        "size": "1313",
    }
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.data.dataframe.shape == (1313, 7)
Code Example #24
    def test_infer_distribution_parameters(self):
        D = ge.read_csv(
            file_relative_path(
                __file__,
                "../test_sets/fixed_distributional_test_dataset.csv"))

        with self.assertRaises(TypeError):
            ge.dataset.util.infer_distribution_parameters(
                data=D.norm,
                distribution="norm",
                params=["wrong_param_format"])
        t = ge.dataset.util.infer_distribution_parameters(data=D.norm_std,
                                                          distribution="norm",
                                                          params=None)
        self.assertEqual(t["mean"], D.norm_std.mean())
        self.assertEqual(t["std_dev"], D.norm_std.std())
        self.assertEqual(t["loc"], 0)
        self.assertEqual(t["scale"], 1)

        # beta
        t = ge.dataset.util.infer_distribution_parameters(data=D.beta,
                                                          distribution="beta")
        self.assertEqual(
            t["alpha"],
            (t["mean"]**2) * (((1 - t["mean"]) / t["std_dev"]**2) -
                              (1 / t["mean"])),
            "beta dist, alpha infer",
        )
        self.assertEqual(t["beta"], t["alpha"] * ((1 / t["mean"]) - 1),
                         "beta dist, beta infer")

        # gamma
        t = ge.dataset.util.infer_distribution_parameters(data=D.gamma,
                                                          distribution="gamma")
        self.assertEqual(t["alpha"], D.gamma.mean())

        # uniform distributions
        t = ge.dataset.util.infer_distribution_parameters(
            data=D.uniform, distribution="uniform")
        self.assertEqual(t["min"], min(D.uniform), "uniform, min infer")
        self.assertEqual(t["max"],
                         max(D.uniform) - min(D.uniform), "uniform, max infer")

        uni_loc = 5
        uni_scale = 10
        t = ge.dataset.util.infer_distribution_parameters(
            data=D.uniform,
            distribution="uniform",
            params={
                "loc": uni_loc,
                "scale": uni_scale
            },
        )
        self.assertEqual(t["min"], uni_loc, "uniform, min infer")
        self.assertEqual(t["max"], uni_scale, "uniform, max infer")

        # unsupported distribution
        with self.assertRaises(AttributeError):
            ge.dataset.util.infer_distribution_parameters(
                data=D.norm, distribution="fakedistribution")

        # chi2
        t = ge.dataset.util.infer_distribution_parameters(data=D.chi2,
                                                          distribution="chi2")
        self.assertEqual(t["df"], D.chi2.mean())
Code Example #25
def test_DataContext_raises_error_on_unparsable_yaml_file():
    local_dir = file_relative_path(__file__, os.path.join(BASE_DIR, "bad_yml"))
    with pytest.raises(ge_exceptions.InvalidConfigurationYamlError):
        DataContext(local_dir)
Code Example #26
def test_basic_project_upgrade(v10_project_directory, caplog):
    # test project upgrade that requires no manual steps

    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        ["-c", v10_project_directory, "--v3-api", "project", "upgrade"],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/test_basic_project_upgrade_expected_stdout.fixture",
            )) as f:
        expected_stdout: str = f.read()
        expected_stdout = expected_stdout.replace("GE_PROJECT_DIR",
                                                  v10_project_directory)
        assert stdout == expected_stdout

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
                UpgradeHelperV13_20190926T134241.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(
        startpath=v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number incremented
    assert (DataContext.get_ge_config_version(
        context_root_dir=v10_project_directory) == 3.0)

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/UpgradeHelperV11_basic_upgrade_log.json",
            )) as f:
        expected_upgrade_log_dict: dict = json.load(f)
        expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
        expected_upgrade_log_str = expected_upgrade_log_str.replace(
            "GE_PROJECT_DIR", v10_project_directory)
        expected_upgrade_log_dict: dict = json.loads(expected_upgrade_log_str)

    with open(
            f"{v10_project_directory}/uncommitted/logs/project_upgrades/UpgradeHelperV11_20190926T134241.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
Code Example #27
def test_cli_init_on_new_project_with_broken_excel_file_try_again_with_different_file(
    mock_webbrowser, caplog, tmp_path_factory
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    os.makedirs(os.path.join(project_dir, "data"))
    data_path = os.path.join(project_dir, "data", "broken_excel_file.xls")
    fixture_path = file_relative_path(__file__, "../test_sets/broken_excel_file.xls")
    data_path_2 = os.path.join(project_dir, "data", "Titanic.csv")
    fixture_path_2 = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)
    shutil.copy(fixture_path_2, data_path_2)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n1\n1\n{}\n\n{}\n".format(data_path, data_path_2),
        catch_exceptions=False,
    )
    stdout = result.output
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert len(stdout) < 3000, "CLI output is unreasonably long."
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "What are you processing your files with" in stdout
    assert "Enter the path (relative or absolute) of a data file" in stdout
    assert "Cannot load file." in stdout
    assert (
        "- Please check the file and try again or select a different data file."
        in stdout
    )
    assert (
        "- Error: Unsupported format, or corrupt file: Expected BOF record; found b'PRODUCTI'"
        in stdout
    )
    assert "Try again? [Y/n]:" in stdout
    assert "[{}]:".format(data_path) in stdout

    assert "Name the new expectation suite [Titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert (
        "A new Expectation suite 'Titanic.warning' was added to your project" in stdout
    )
    assert "Great Expectations is now set up" in stdout

    assert os.path.isdir(os.path.join(project_dir, "great_expectations"))
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    with open(config_path) as config_file:
        config = yaml.load(config_file)
    data_source_class = config["datasources"]["files_datasource"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "PandasDataset"

    obs_tree = gen_directory_tree_str(os.path.join(project_dir, "great_expectations"))

    # Instead of monkey patching datetime, just regex out the time directories
    date_safe_obs_tree = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", obs_tree)
    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", date_safe_obs_tree
    )
    assert (
        guid_safe_obs_tree
        == """great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        Titanic/
            warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    Titanic/
                        warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    Titanic/
                        warning/
                            9999.9999/
                                foobarbazguid.html
        validations/
            Titanic/
                warning/
                    9999.9999/
                        foobarbazguid.json
"""
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)
Code Example #28
def test_project_upgrade_with_manual_steps(v10_project_directory, caplog, sa,
                                           postgresql_engine):
    # This test requires sqlalchemy because it includes database backends configured
    # test project upgrade that requires manual steps

    # copy v2 yml
    shutil.copy(
        file_relative_path(
            __file__,
            "../../test_fixtures/upgrade_helper/great_expectations_v1_needs_manual_upgrade.yml",
        ),
        os.path.join(v10_project_directory, "great_expectations.yml"),
    )

    runner: CliRunner = CliRunner(mix_stderr=False)
    result: Result = runner.invoke(
        cli,
        ["-c", v10_project_directory, "--v3-api", "project", "upgrade"],
        input="\n",
        catch_exceptions=False,
    )
    stdout: str = result.stdout

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/test_project_upgrade_with_manual_steps_expected_stdout.fixture",
            )) as f:
        expected_stdout: str = f.read()
        expected_stdout = expected_stdout.replace("GE_PROJECT_DIR",
                                                  v10_project_directory)
        assert stdout == expected_stdout

    pycache_dir_path: str = os.path.join(v10_project_directory, "plugins",
                                         "custom_store_backends",
                                         "__pycache__")
    try:
        shutil.rmtree(pycache_dir_path)
    except FileNotFoundError:
        pass

    expected_project_tree_str: str = """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
        .gitkeep
    expectations/
        .ge_store_backend_id
        .gitkeep
    notebooks/
        .gitkeep
    plugins/
        custom_store_backends/
            __init__.py
            my_custom_store_backend.py
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                expectations/
                    .gitkeep
                static/
                    .gitkeep
                validations/
                    diabetic_data/
                        warning/
                            20200430T191246.763896Z/
                                20200430T191246.763896Z/
                                    c3b4c5df224fef4b1a056a0f3b93aba5.html
        logs/
            project_upgrades/
                UpgradeHelperV11_20190926T134241.000000Z.json
        validations/
            .ge_store_backend_id
            diabetic_data/
                warning/
                    20200430T191246.763896Z/
                        20200430T191246.763896Z/
                            c3b4c5df224fef4b1a056a0f3b93aba5.json
"""
    obs_project_tree_str: str = gen_directory_tree_str(
        startpath=v10_project_directory)
    assert obs_project_tree_str == expected_project_tree_str
    # make sure config number not incremented
    assert (DataContext.get_ge_config_version(
        context_root_dir=v10_project_directory) == 1.0)

    with open(
            file_relative_path(
                __file__,
                "../../test_fixtures/upgrade_helper/UpgradeHelperV11_manual_steps_upgrade_log.json",
            )) as f:
        expected_upgrade_log_dict: dict = json.load(f)
        expected_upgrade_log_str: str = json.dumps(expected_upgrade_log_dict)
        expected_upgrade_log_str = expected_upgrade_log_str.replace(
            "GE_PROJECT_DIR", v10_project_directory)
        expected_upgrade_log_dict = json.loads(expected_upgrade_log_str)

    with open(
            f"{v10_project_directory}/uncommitted/logs/project_upgrades/UpgradeHelperV11_20190926T134241.000000Z.json"
    ) as f:
        obs_upgrade_log_dict: dict = json.load(f)

    assert obs_upgrade_log_dict == expected_upgrade_log_dict
Code Example #29
    def copy_static_assets(self, static_assets_source_dir=None):
        """
        Copies static assets, using a special "static_assets" backend store that accepts variable-length tuples as
        keys, with no filepath_template.
        """
        file_exclusions = [".DS_Store"]
        dir_exclusions = []

        if not static_assets_source_dir:
            static_assets_source_dir = file_relative_path(
                __file__, os.path.join("..", "..", "render", "view", "static"))

        # If `static_assets_source_dir` contains the string ".zip", then we try to extract (unzip)
        # the static files. If the unzipping is successful, that means that Great Expectations is
        # installed into a zip file (see PEP 273) and we need to run this function again
        if ".zip" in static_assets_source_dir.lower():
            unzip_destdir = tempfile.mkdtemp()
            unzipped_ok = self._unzip_assets(static_assets_source_dir,
                                             unzip_destdir)
            if unzipped_ok:
                return self.copy_static_assets(unzip_destdir)

        for item in os.listdir(static_assets_source_dir):
            # Directory
            if os.path.isdir(os.path.join(static_assets_source_dir, item)):
                if item in dir_exclusions:
                    continue
                # Recurse
                new_source_dir = os.path.join(static_assets_source_dir, item)
                self.copy_static_assets(new_source_dir)
            # File
            else:
                # Copy file over using static assets store backend
                if item in file_exclusions:
                    continue
                source_name = os.path.join(static_assets_source_dir, item)
                with open(source_name, "rb") as f:
                    # Only use path elements starting from static/ for key
                    store_key = tuple(
                        os.path.normpath(source_name).split(os.sep))
                    store_key = store_key[store_key.index("static"):]
                    content_type, content_encoding = guess_type(item,
                                                                strict=False)

                    if content_type is None:
                        # Use GE-known content-type if possible
                        if source_name.endswith(".otf"):
                            content_type = "font/opentype"
                        else:
                            # fallback
                            logger.warning(
                                "Unable to automatically determine content_type for {}"
                                .format(source_name))
                            content_type = "text/html; charset=utf8"

                    if not isinstance(self.store_backends["static_assets"],
                                      GeCloudStoreBackend):
                        self.store_backends["static_assets"].set(
                            store_key,
                            f.read(),
                            content_encoding=content_encoding,
                            content_type=content_type,
                        )
Code Example #30
def test_requirements_files():
    """requirements.txt should be a subset of requirements-dev.txt"""

    with open(file_relative_path(__file__, "../requirements.txt")) as req:
        requirements = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev.txt")) as req:
        requirements_dev = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-util.txt")) as req:
        requirements_dev_util = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-spark.txt")) as req:
        requirements_dev_spark = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(
        file_relative_path(__file__, "../requirements-dev-sqlalchemy.txt")
    ) as req:
        requirements_dev_sqlalchemy = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-test.txt")) as req:
        requirements_dev_test = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-build.txt")) as req:
        requirements_dev_build = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    with open(file_relative_path(__file__, "../requirements-dev-publish.txt")) as req:
        requirements_dev_publish = {
            f'{line.name}{"".join(line.specs[0])}' for line in rp.parse(req)
        }

    assert requirements <= requirements_dev

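    # The dev-requirement subsets checked below should not overlap with one another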
    assert requirements_dev_util.intersection(requirements_dev_spark) == set()
    assert requirements_dev_util.intersection(requirements_dev_sqlalchemy) == set()
    assert requirements_dev_util.intersection(requirements_dev_test) == set()
    assert requirements_dev_util.intersection(requirements_dev_build) == set()

    assert requirements_dev_spark.intersection(requirements_dev_sqlalchemy) == set()
    assert requirements_dev_spark.intersection(requirements_dev_test) == set()
    assert requirements_dev_spark.intersection(requirements_dev_build) == set()

    assert requirements_dev_sqlalchemy.intersection(requirements_dev_test) == set()
    assert requirements_dev_sqlalchemy.intersection(requirements_dev_build) == set()

    assert requirements_dev_test.intersection(requirements_dev_build) == set()

    assert requirements_dev_publish.intersection(requirements_dev_test) == set()
    assert requirements_dev_publish.intersection(requirements_dev_build) == set()

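    # Everything in requirements-dev.txt should be covered by the base requirements plus the subset files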
    assert (
        requirements_dev
        - (
            requirements
            | requirements_dev_util
            | requirements_dev_sqlalchemy
            | requirements_dev_spark
            | requirements_dev_test
            | requirements_dev_build
            | requirements_dev_publish
        )
        == set()
    )