def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a sparksession exists
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_parquet_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet')
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet'),
                                       "limit": 2
                                   })
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == 1
    assert dataset.spark_df.count() == 2
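The Parquet tests above assume a test_parquet_folder_connection_path conftest fixture that is not shown on this page. A minimal sketch of what such a fixture could look like, with the file name and column values inferred from the assertions above (the real conftest may differ):

import os

import pandas as pd
import pytest


@pytest.fixture
def test_parquet_folder_connection_path(tmp_path_factory):
    # Hypothetical reconstruction: write test.parquet with an integer col_1
    # holding five rows, so head()['col_1'] == 1 and count() == 5 hold above.
    # DataFrame.to_parquet requires pyarrow or fastparquet.
    path = str(tmp_path_factory.mktemp("parquet_context"))
    pd.DataFrame({"col_1": [1, 2, 3, 4, 5]}).to_parquet(
        os.path.join(path, "test.parquet"))
    return path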
Example #2
def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    assert spark_session  # Ensure a sparksession exists
    datasource = SparkDFDatasource('SparkParquet', generators={
        "subdir_reader": {
            "class_name": "SubdirReaderBatchKwargsGenerator",
            "base_directory": test_parquet_folder_connection_path
        }
    })

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet')
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 5

    # Limit should also work
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_parquet_folder_connection_path,
                                                            'test.parquet'),
                                       "limit": 2
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == 1
    assert batch.data.count() == 2
Example #3
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv,
                                         test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names(
    )["subdir_reader"]["names"] == [("test", "file")]
    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(test_folder_connection_path_csv, "test.csv"),
            "reader_options": {
                "header": True
            },
        })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
Example #4
def test_spark_datasource_processes_dataset_options(
        test_folder_connection_path_csv, test_backends, empty_data_context):
    context: DataContext = empty_data_context
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader",
                                                 data_asset_name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = BridgeValidator(
        batch,
        ExpectationSuite(expectation_suite_name="foo", data_context=context))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    datasource = SparkDFDatasource('SparkParquet',
                                   base_directory=test_folder_connection_path)
    assert datasource.get_available_data_asset_names() == {
        "default": set(['test'])
    }
    dataset = datasource.get_batch('test', header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pyspark_skip = pytest.importorskip("pyspark")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource', base_directory=basepath)

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        })
        assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        }, reader_method="blarg")
        assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
        }, reader_method="excel")
        assert "Unsupported reader: excel" in exc.value.message

    dataset = datasource.get_data_asset("idonotlooklikeacsvbutiam.notrecognized", batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
        },
        reader_method="csv", reader_options={'header': True})
    assert dataset.spark_df.head()["a"] == "1"
def test_invalid_reader_sparkdf_datasource(tmp_path_factory):
    pytest.importorskip("pyspark")
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource('mysparksource',
                                   batch_kwargs_generators={
                                       "subdir_reader": {
                                           "class_name":
                                           "SubdirReaderBatchKwargsGenerator",
                                           "base_directory": basepath
                                       }
                                   })

    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path":
                os.path.join(basepath,
                             "idonotlooklikeacsvbutiam.notrecognized")
            })
        assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path":
                os.path.join(basepath,
                             "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method":
                "blarg"
            })
        assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path":
                os.path.join(basepath,
                             "idonotlooklikeacsvbutiam.notrecognized"),
                "reader_method":
                "excel"
            })
        assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path":
            os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method":
            "csv",
            "reader_options": {
                'header': True
            }
        })
    assert batch.data.head()["a"] == "1"
Example #8
def test_invalid_reader_sparkdf_datasource(tmp_path_factory, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    basepath = str(tmp_path_factory.mktemp("test_invalid_reader_sparkdf_datasource"))
    datasource = SparkDFDatasource(
        "mysparksource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )

    with open(
        os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"), "w"
    ) as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
            }
        )
        assert "Unable to determine reader for path" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "blarg",
            }
        )
        assert "Unknown reader method: blarg" in exc.value.message

    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch(
            batch_kwargs={
                "path": os.path.join(
                    basepath, "idonotlooklikeacsvbutiam.notrecognized"
                ),
                "reader_method": "excel",
            }
        )
        assert "Unknown reader: excel" in exc.value.message

    batch = datasource.get_batch(
        batch_kwargs={
            "path": os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
            "reader_method": "csv",
            "reader_options": {"header": True},
        }
    )
    assert batch.data.head()["a"] == "1"
Example #9
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource('PandasCSV', generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #10
def _add_spark_datasource(datasource_name: str, dataset: AbstractDataSet,
                          ge_context: DataContext) -> str:
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)

    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        })

    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors))

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
Example #11
def _add_spark_datasource(context):
    path = click.prompt(
        msg_prompt_filesys_enter_base_path,
        # default='/data/',
        type=click.Path(exists=True,
                        file_okay=False,
                        dir_okay=True,
                        readable=True),
        show_default=True)
    if path.startswith("./"):
        path = path[2:]

    if path.endswith("/"):
        path = path[:-1]
    default_data_source_name = os.path.basename(path) + "__dir"
    data_source_name = click.prompt(msg_prompt_datasource_name,
                                    default=default_data_source_name,
                                    show_default=True)

    configuration = SparkDFDatasource.build_configuration(
        base_directory=os.path.join("..", path))
    context.add_datasource(name=data_source_name,
                           class_name='SparkDFDatasource',
                           **configuration)
    return data_source_name
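The same registration can be done without the interactive prompts, using only the calls that appear above. A sketch that assumes an already-initialized DataContext named context and an illustrative data directory:

import os

from great_expectations.datasource import SparkDFDatasource

# "data" is a placeholder for whatever base directory the prompt would return.
configuration = SparkDFDatasource.build_configuration(
    base_directory=os.path.join("..", "data"))
context.add_datasource(name="data__dir",
                       class_name="SparkDFDatasource",
                       **configuration)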
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pyspark_skip = pytest.importorskip("pyspark")
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)
    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path,
                                                            'test.csv')
                                   },
                                   reader_options={"header": True})
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
Example #13
def test_spark_config():
    source = SparkDFDatasource()
    conf = source.spark.sparkContext.getConf().getAll()
    # Without specifying any spark_config values we get defaults
    assert ("spark.app.name", "pyspark-shell") in conf

    source = SparkDFDatasource(spark_config={
        "spark.app.name": "great_expectations",
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "128m"
    })

    # Test that our values were set
    conf = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", "great_expectations") in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "128m") in conf
Example #14
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource('PandasCSV', batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #15
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    pyspark_skip = pytest.importorskip("pyspark")
    datasource = SparkDFDatasource('SparkParquet',
                                   generators={"subdir_reader": {
                                        "class_name": "SubdirReaderBatchKwargsGenerator",
                                        "base_directory": test_folder_connection_path
                                        }
                                    }
    )

    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path,
                                                            'test.csv'),
                                       "reader_options": {"header": True}
                                   })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == '1'
Example #16
def test_spark_config(test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    source = SparkDFDatasource()
    conf = source.spark.sparkContext.getConf().getAll()
    # Without specifying any spark_config values we get defaults
    assert ("spark.app.name", "pyspark-shell") in conf

    source = SparkDFDatasource(spark_config={
        "spark.app.name": "great_expectations",
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "128m"
    })

    # Test that our values were set
    conf = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", "great_expectations") in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "128m") in conf
def test_spark_config_datasource(spark_session_v012):
    name: str = "great_expectations-ds-config"
    spark_config: Dict[str, str] = {
        "spark.app.name": name,
        "spark.sql.catalogImplementation": "hive",
        "spark.executor.memory": "768m",
        # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
    }
    source: SparkDFDatasource = SparkDFDatasource(spark_config=spark_config)
    spark_session: SparkSession = source.spark
    # noinspection PyProtectedMember
    sc_stopped: bool = spark_session.sparkContext._jsc.sc().isStopped()
    assert not sc_stopped

    # Test that our values were set
    conf: List[tuple] = source.spark.sparkContext.getConf().getAll()
    assert ("spark.app.name", name) in conf
    assert ("spark.sql.catalogImplementation", "hive") in conf
    assert ("spark.executor.memory", "768m") in conf
Example #18
def _add_spark_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "spark"},
        success=True,
    )

    if not _verify_pyspark_dependent_modules():
        return None

    if passthrough_generator_only:
        datasource_name = "files_spark_datasource"

        # configuration = SparkDFDatasource.build_configuration(batch_kwargs_generators={
        #     "default": {
        #         "class_name": "PassthroughGenerator",
        #     }
        # }
        # )
        configuration = SparkDFDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        ).strip()
        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = SparkDFDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )
        configuration["class_name"] = "SparkDFDatasource"
        configuration["module_name"] = "great_expectations.datasource"
        errors = DatasourceConfigSchema().validate(configuration)
        if len(errors) != 0:
            raise ge_exceptions.GreatExpectationsError(
                "Invalid Datasource configuration: {}".format(errors)
            )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )
    toolkit.confirm_proceed_or_exit()

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def basic_sparkdf_datasource():
    return SparkDFDatasource("basic_sparkdf_datasource")
def basic_sparkdf_datasource(test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip(
            "Spark has not been enabled, so this test must be skipped.")
    return SparkDFDatasource("basic_sparkdf_datasource")