Example #1
def library_install_load_check(
    python_import_name: str, pip_library_name: str
) -> Union[int, None]:
    """
    Dynamically load a module from strings, attempt a pip install or raise a helpful error.

    :return: True if the library was loaded successfully, False otherwise

    Args:
        pip_library_name: name of the library to load
        python_import_name (str): a module to import to verify installation
    """
    if is_library_loadable(library_name=python_import_name):
        return None

    confirm_prompt: str = f"""Great Expectations relies on the library `{python_import_name}` to connect to your data, \
but the package `{pip_library_name}` containing this library is not installed.
    Would you like Great Expectations to try to execute `pip install {pip_library_name}` for you?"""
    continuation_message: str = f"""\nOK, exiting now.
    - Please execute `pip install {pip_library_name}` before trying again."""
    pip_install_confirmed = toolkit.confirm_proceed_or_exit(
        confirm_prompt=confirm_prompt,
        continuation_message=continuation_message,
        exit_on_no=True,
        exit_code=1,
    )

    if not pip_install_confirmed:
        cli_message(continuation_message)
        sys.exit(1)

    status_code: int = execute_shell_command_with_progress_polling(
        f"pip install {pip_library_name}"
    )

    # project_distribution: Distribution = get_project_distribution()
    # if project_distribution:
    #     project_name: str = project_distribution.metadata['Name']
    #     version: str = project_distribution.metadata['Version']
    #
    # pkg_resources.working_set = pkg_resources.WorkingSet._build_master()

    working_set: WorkingSet = pkg_resources.working_set
    # noinspection SpellCheckingInspection
    distr: Distribution = pkg_resources.get_distribution(pip_library_name)
    working_set.add_entry(distr.key)

    library_loadable: bool = is_library_loadable(library_name=python_import_name)

    if status_code == 0 and library_loadable:
        return 0

    if not library_loadable:
        cli_message(
            f"""<red>ERROR: Great Expectations relies on the library `{pip_library_name}` to connect to your data.</red>
        - Please execute `pip install {pip_library_name}` before trying again."""
        )
        return 1

    return status_code
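
A minimal usage sketch for the helper above (the package names are illustrative assumptions, not taken from the snippet): call it with the importable module name and the matching pip package, then branch on the returned status.

    # Hypothetical caller; "psycopg2" / "psycopg2-binary" are example names only.
    status = library_install_load_check(
        python_import_name="psycopg2", pip_library_name="psycopg2-binary"
    )
    if status not in (None, 0):
        cli_message(f"<red>Installation failed with exit code {status}.</red>")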
Example #2
        config_variables.yml
        data_docs/
        validations/
            .ge_store_backend_id
""")

    assert_no_logging_messages_or_tracebacks(my_caplog, result)


@pytest.mark.xfail(
    reason="This command is not yet implemented for the modern API",
    run=True,
    strict=True,
)
@pytest.mark.skipif(
    is_library_loadable(library_name="sqlalchemy"),
    reason="requires sqlalchemy to NOT be installed",
)
def test_init_install_sqlalchemy(caplog, tmp_path_factory, monkeypatch):
    """WARNING: THIS TEST IS AWFUL AND WE HATE IT."""
    # This test is as much about changing the entire test environment with side effects as it is about actually testing
    # the observed behavior.
    library_import_name = "sqlalchemy"
    library_name = "sqlalchemy"

    cli_input = "\n\n2\nn\n"

    basedir = tmp_path_factory.mktemp("test_cli_init_diff")

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(basedir)
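
These tests are gated on is_library_loadable, which reports whether an optional dependency can be imported. A rough standard-library approximation of that check (a sketch, not the actual Great Expectations implementation):

    import importlib.util

    def is_library_loadable_sketch(library_name: str) -> bool:
        # True if the top-level module can be found on the current path,
        # without actually importing it.
        return importlib.util.find_spec(library_name) is not None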
Example #3
class TestIO(unittest.TestCase):
    def test_read_csv(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_csv(
            script_path + "/test_sets/Titanic.csv",
        )

    def test_read_json(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_json(
            script_path + "/test_sets/test_json_data_file.json",
        )
        assert df["x"][0] == "i"
        assert isinstance(df, PandasDataset)
        assert sorted(list(df.keys())) == ["x", "y", "z"]

        df = ge.read_json(
            script_path + "/test_sets/nested_test_json_data_file.json",
            accessor_func=lambda x: x["data"],
        )
        assert df["x"][0] == "i"
        assert isinstance(df, PandasDataset)
        assert sorted(list(df.keys())) == ["x", "y", "z"]

    @pytest.mark.skipif(
        not is_library_loadable(library_name="openpyxl"),
        reason="GE uses pandas to read excel files, which requires openpyxl",
    )
    def test_read_excel(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_excel(
            script_path + "/test_sets/Titanic_multi_sheet.xlsx",
        )
        assert df["Name"][0] == "Allen, Miss Elisabeth Walton"
        assert isinstance(df, PandasDataset)

        # Note that pandas changed the parameter name from sheetname to sheet_name.
        # We test with both options so that both old and new pandas versions are covered.
        pandas_version = pd.__version__
        if re.match(r"0\.2[012]\.", pandas_version) is not None:
            dfs_dict = ge.read_excel(
                script_path + "/test_sets/Titanic_multi_sheet.xlsx", sheetname=None
            )

        else:
            dfs_dict = ge.read_excel(
                script_path + "/test_sets/Titanic_multi_sheet.xlsx", sheet_name=None
            )
        assert isinstance(dfs_dict, dict)
        assert list(dfs_dict.keys()) == ["Titanic_1", "Titanic_2", "Titanic_3"]
        assert isinstance(dfs_dict["Titanic_1"], PandasDataset)
        assert dfs_dict["Titanic_1"]["Name"][0] == "Allen, Miss Elisabeth Walton"

    def test_read_table(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_table(script_path + "/test_sets/Titanic.csv", sep=",")
        assert df["Name"][0] == "Allen, Miss Elisabeth Walton"
        assert isinstance(df, PandasDataset)

    def test_read_feather(self):
        pandas_version = re.match(r"(\d+)\.(\d+)\..+", pd.__version__)
        if pandas_version is None:
            raise ValueError("Unrecognized pandas version!")
        else:
            pandas_major_version = int(pandas_version.group(1))
            pandas_minor_version = int(pandas_version.group(2))
            if pandas_major_version == 0 and pandas_minor_version < 25:
                pytest.skip("Skipping because of old pandas version.")

        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_feather(script_path + "/test_sets/Titanic.feather")
        assert df["Name"][0] == "Allen, Miss Elisabeth Walton"
        assert isinstance(df, PandasDataset)

    def test_read_parquet(self):
        """
        This test is unusual, because on travis (but only on travis), we have observed problems importing pyarrow,
        which breaks this test (since it requires pyarrow available).

        The issue seems to be related to a binary compatibility issue with the installed/available version of numpy:
        pyarrow 0.10 requires numpy >= 1.14.

        Since pyarrow is not in our actual requirements, we are not going to adjust up the required numpy version.
        """

        # Skip this test if the available version of pandas is older than 0.23: prior versions
        # are not compatible with the pyarrow versions used here (and read_parquet itself was
        # only added in pandas 0.21.0).
        pandas_version = re.match(r"(\d+)\.(\d+)\..+", pd.__version__)
        if pandas_version is None:
            raise ValueError("Unrecognized pandas version!")
        else:
            pandas_major_version = int(pandas_version.group(1))
            pandas_minor_version = int(pandas_version.group(2))
            if pandas_major_version == 0 and pandas_minor_version < 23:
                pytest.skip("Pandas version < 23 is no longer compatible with pyarrow")

        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_parquet(script_path + "/test_sets/Titanic.parquet")
        assert df["Name"][1] == "Allen, Miss Elisabeth Walton"
        assert isinstance(df, PandasDataset)

    def test_read_pickle(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_pickle(
            script_path + "/test_sets/Titanic.pkl",
        )
        assert df["Name"][0] == "Allen, Miss Elisabeth Walton"
        assert isinstance(df, PandasDataset)
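
The regex-based pandas version gates in test_read_feather and test_read_parquet could also be written with packaging.version; the helper below is an illustrative alternative (an assumption, not part of the original tests).

    import pandas as pd
    import pytest
    from packaging.version import Version

    def skip_if_pandas_older_than(minimum: str) -> None:
        # Skip the calling test when the installed pandas predates the given version.
        if Version(pd.__version__) < Version(minimum):
            pytest.skip(f"Requires pandas >= {minimum}, found {pd.__version__}.")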
Example #4
        for i in possible_expectations_set if i not in unexpected_expectations
    }

    ignored_included_columns_overlap = [
        i for i in columns_with_expectations if i in taxi_data_ignored_columns
    ]
    assert len(ignored_included_columns_overlap) == 0

    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[taxi_validator_pandas])

    assert results["success"]


@pytest.mark.skipif(
    not is_library_loadable(library_name="pyspark"),
    reason="requires pyspark to be installed",
)
def test_profiler_all_expectation_types_spark(
    titanic_data_context_modular_api,
    taxi_validator_spark,
    possible_expectations_set,
    taxi_data_semantic_types,
    taxi_data_ignored_columns,
):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected for Spark.
    """
    context = titanic_data_context_modular_api
Example #5
            UserWarning,
            match=r"Setting result format to COMPLETE for a SqlAlchemyDataset can be dangerous",
    ):
        unexpected_count_df.expect_column_values_to_be_in_set(
            "a",
            value_set=[1],
            result_format={
                "result_format": "COMPLETE",
                "partial_unexpected_count": 2
            },
        )


@pytest.mark.skipif(
    is_library_loadable(library_name="sqlalchemy_redshift"),
    reason="sqlalchemy_redshift must not be installed",
)
def test_dataset_attempt_allowing_relative_error_when_redshift_library_not_installed(
    sa,
):
    engine = sa.create_engine("sqlite://")
    dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")

    assert isinstance(dataset, SqlAlchemyDataset)
    assert dataset.attempt_allowing_relative_error() is False


def test_expect_compound_columns_to_be_unique(sa):
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({
Example #6
    assert (data_context_config["datasources"][name]["batch_kwargs_generators"]
            ["default"]["reader_options"]["sep"] == "|")

    # Note that pipe is special in yml, so let's also check to see that it was properly serialized
    with open(
        os.path.join(
            data_context_parameterized_expectation_suite.root_directory,
            "great_expectations.yml",
        )
    ) as configfile:
        lines = configfile.readlines()
        assert "          sep: '|'\n" in lines
        assert "          header: false\n" in lines


@pytest.mark.skipif(
    not is_library_loadable(library_name="pyarrow")
    and not is_library_loadable(library_name="fastparquet"),
    reason="pyarrow and fastparquet are not installed",
)
def test_standalone_spark_parquet_datasource(
        test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a sparksession exists
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_parquet_folder_connection_path,
            }
        },
    )
    assert slack_action.run(
        validation_result_suite_identifier=validation_result_suite_id,
        validation_result_suite=ExpectationSuiteValidationResult(
            success=True,
            results=[],
            statistics={
                "successful_expectations": [],
                "evaluated_expectations": [],
            },
        ),
        data_asset=None,
    ) == {"slack_notification_result": "none required"}


@pytest.mark.skipif(
    not is_library_loadable(library_name="pypd"),
    reason="pypd is not installed",
)
@mock.patch("pypd.EventV2")
def test_PagerdutyAlertAction(
    data_context_parameterized_expectation_suite,
    validation_result_suite,
    validation_result_suite_id,
):
    api_key = "test"
    routing_key = "test"

    pagerduty_action = PagerdutyAlertAction(
        data_context=data_context_parameterized_expectation_suite,
        api_key=api_key,
        routing_key=routing_key,
    _ = SparkDFDataset(sdf, persist=True)
    sdf.persist.assert_called_once()

    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf, persist=False)
    sdf.persist.assert_not_called()

    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf)
    sdf.persist.assert_called_once()


@pytest.mark.skipif(
    not is_library_loadable(library_name="pyspark"),
    reason="pyspark must be installed",
)
@pytest.fixture
def test_dataframe(spark_session):
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField(
            "address",
            StructType([
                StructField("street", StringType(), True),
                StructField("city", StringType(), True),
                StructField("house_number", IntegerType(), True),
Example #9
import pandas as pd
import pytest

from great_expectations.core.expectation_validation_result import (
    ExpectationValidationResult,
)
from great_expectations.self_check.util import (
    build_pandas_validator_with_data,
    build_sa_validator_with_data,
)
from great_expectations.util import is_library_loadable


@pytest.mark.skipif(
    not is_library_loadable(library_name="pyathena"),
    reason="pyathena is not installed",
)
def test_expect_column_values_to_be_in_type_list_dialect_pyathena_string(sa):
    from pyathena import sqlalchemy_athena

    df = pd.DataFrame({"col": ["test_val1", "test_val2"]})
    validator = build_sa_validator_with_data(df, "sqlite")

    # Monkey-patch dialect for testing purposes.
    validator.execution_engine.dialect_module = sqlalchemy_athena

    result = validator.expect_column_values_to_be_in_type_list(
        "col", type_list=["string", "boolean"])

    assert result == ExpectationValidationResult(
        success=True,
        expectation_config={
Example #10
        "introspection": {
            "whole_table": {
                "data_asset_name_suffix": "__whole_table"
            }
        },
        "module_name": "great_expectations.datasource",
        "name": "my_datasource",
    }]
    obs = context.config_variables
    # remove the instance guid
    obs.pop("instance_id")
    assert obs == {}


@pytest.mark.skipif(
    not is_library_loadable(library_name="psycopg2"),
    reason="psycopg2 is not installed",
)
def test_sanitize_yaml_and_save_datasource_works_with_credentials(
    sa,
    empty_data_context,
):
    context = empty_data_context
    yaml_snippet = """
name: foo_datasource
class_name: SimpleSqlalchemyDatasource
credentials:
  host: localhost
  port: '5432'
  username: user
  password: pass