import os
import shutil

import pandas as pd
import pytest

# NOTE: the import paths below are an assumption pinned to the
# great_expectations release these tests were written against; several of
# these names (Batch, BatchMarkers, BatchId, BatchFingerprint, safe_mmkdir)
# moved or were removed in later releases.
from great_expectations.core.batch import Batch, BatchMarkers
from great_expectations.data_context import DataContext
from great_expectations.data_context.util import file_relative_path, safe_mmkdir
from great_expectations.dataset import PandasDataset
from great_expectations.datasource import PandasDatasource
from great_expectations.datasource.types import (
    BatchFingerprint,
    BatchId,
    PathBatchKwargs,
)
from great_expectations.util import gen_directory_tree_str, nested_update


def test_standalone_pandas_datasource(test_folder_connection_path):
    datasource = PandasDatasource("PandasCSV",
                                  base_directory=test_folder_connection_path)

    assert datasource.get_available_data_asset_names() == {"default": ["test"]}
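    # The standalone datasource auto-configures a "default" generator over
    # base_directory; build by hand the batch kwargs we expect it to yield.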
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))

    # Get the default generator (a subdir reader over base_directory)
    generator = datasource.get_generator()
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra reader kwargs...
    # Note that we use get_data_asset, NOT get_batch, here: this datasource is
    # standalone (not attached to a DataContext), so there is no batch concept.
    dataset = datasource.get_data_asset("test",
                                        generator_name="default",
                                        batch_kwargs=auto_batch_kwargs,
                                        sep=",",
                                        header=0,
                                        index_col=0)
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)
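# A minimal sketch of the fixture these tests assume (the real fixture lives
# in the suite's conftest): a temporary directory holding one "test.csv"
# whose "col_1" holds [1, 2, 3, 4, 5].
@pytest.fixture()
def test_folder_connection_path(tmp_path_factory):
    df = pd.DataFrame(
        {"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    path = str(tmp_path_factory.mktemp("csv_context"))
    # pandas writes the index as column 0, which the tests skip via index_col=0
    df.to_csv(os.path.join(path, "test.csv"))
    return path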
def test_read_limit(test_folder_connection_path):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )

    batch_kwargs = PathBatchKwargs({
        "path": os.path.join(str(test_folder_connection_path), "test.csv"),
        "reader_options": {"sep": ",", "header": 0, "index_col": 0},
    })
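    # process_batch_parameters translates generic batch parameters into
    # datasource-specific batch_kwargs; for the pandas datasource the limit
    # presumably becomes a reader option such as nrows, and nested_update
    # merges it into the nested reader_options dict instead of replacing it.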
    nested_update(batch_kwargs, datasource.process_batch_parameters(limit=1))

    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

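    # batch_markers record metadata captured at load time (such as a load
    # timestamp), distinct from the batch_kwargs that specify the data.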
    # A datasource should always return a batch with typed batch_kwargs and
    # batch_markers
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)
# The same read-limit check, written against the older standalone
# get_data_asset API.
def test_read_limit_standalone(test_folder_connection_path):
    datasource = PandasDatasource("PandasCSV",
                                  base_directory=test_folder_connection_path)
    dataset = datasource.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=PathBatchKwargs({
            "path": os.path.join(str(test_folder_connection_path),
                                 "test.csv"),
            "limit": 1,
        }),
        reader_options={"sep": ",", "header": 0, "index_col": 0},
    )
    assert isinstance(dataset, PandasDataset)
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return an object with a typed batch_id
    assert isinstance(dataset.batch_kwargs, PathBatchKwargs)
    assert isinstance(dataset.batch_id, BatchId)
    assert isinstance(dataset.batch_fingerprint, BatchFingerprint)
# The same standalone-datasource checks, using an explicitly configured batch
# kwargs generator.
def test_standalone_pandas_datasource_with_generator(test_folder_connection_path):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )

    assert datasource.get_available_data_asset_names() == {
        "subdir_reader": {
            "names": [("test", "file")],
            "is_complete_list": True
        }
    }
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))

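    # The configured generator should yield batch kwargs that point at the
    # same file as the manually constructed kwargs above.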
    generator = datasource.get_batch_kwargs_generator("subdir_reader")
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    auto_batch_kwargs.update(
        {"reader_options": {"sep": ",", "header": 0, "index_col": 0}})
    batch = datasource.get_batch(batch_kwargs=auto_batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()
    assert len(dataset) == 5

    # A datasource should always return a batch with typed batch_kwargs and
    # batch_markers
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)
def test_render_full_static_site_from_empty_project(tmp_path_factory,
                                                    filesystem_csv_3):

    # TODO : Use a standard test fixture
    # TODO : Have that test fixture copy a directory, rather than building a new one from scratch

    base_dir = str(tmp_path_factory.mktemp("project_dir"))
    project_dir = os.path.join(base_dir, "project_path")
    os.mkdir(project_dir)

    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "data/titanic"))
    shutil.copy(file_relative_path(__file__, "../test_sets/Titanic.csv"),
                str(os.path.join(project_dir, "data/titanic/Titanic.csv")))

    os.makedirs(os.path.join(project_dir, "data/random"))
    shutil.copy(os.path.join(filesystem_csv_3, "f1.csv"),
                str(os.path.join(project_dir, "data/random/f1.csv")))
    shutil.copy(os.path.join(filesystem_csv_3, "f2.csv"),
                str(os.path.join(project_dir, "data/random/f2.csv")))

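    # gen_directory_tree_str renders a directory as an indented tree string,
    # giving compact snapshot-style assertions on the project layout.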
    assert gen_directory_tree_str(project_dir) == """\
project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
"""

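    # DataContext.create scaffolds a great_expectations/ directory (config,
    # notebooks, plugins, uncommitted/) inside the project, as verified by the
    # tree assertion below.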
    context = DataContext.create(project_dir)
    ge_directory = os.path.join(project_dir, "great_expectations")
    context.add_datasource("titanic",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           generators={
                               "subdir_reader": {
                                   "class_name":
                                   "SubdirReaderBatchKwargsGenerator",
                                   "base_directory":
                                   os.path.join(project_dir, "data/titanic/")
                               }
                           })

    context.add_datasource("random",
                           module_name="great_expectations.datasource",
                           class_name="PandasDatasource",
                           generators={
                               "subdir_reader": {
                                   "class_name":
                                   "SubdirReaderBatchKwargsGenerator",
                                   "base_directory":
                                   os.path.join(project_dir, "data/random/")
                               }
                           })

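    # Profiling runs BasicDatasetProfiler over each data asset in the
    # datasource and stores one validation result per profiled batch.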
    context.profile_datasource("titanic")

    # Replicate the batch id of the batch that will be profiled: the id is
    # used as the file name of the corresponding validation result.
    titanic_profiled_batch_id = PathBatchKwargs({
        "path": os.path.join(project_dir, "data/titanic/Titanic.csv"),
        "datasource": "titanic",
    }).to_id()

    tree_str = gen_directory_tree_str(project_dir)
    assert tree_str == """project_path/
    data/
        random/
            f1.csv
            f2.csv
        titanic/
            Titanic.csv
    great_expectations/
        .gitignore
        great_expectations.yml
        expectations/
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler.json
        notebooks/
            pandas/
                validation_playground.ipynb
            spark/
                validation_playground.ipynb
            sql/
                validation_playground.ipynb
        plugins/
            custom_data_docs/
                renderers/
                styles/
                    data_docs_custom_styles.css
                views/
        uncommitted/
            config_variables.yml
            data_docs/
            samples/
            validations/
                titanic/
                    subdir_reader/
                        Titanic/
                            BasicDatasetProfiler/
                                profiling/
                                    {}.json
""".format(titanic_profiled_batch_id)

    context.profile_datasource("random")
    context.build_data_docs()

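    # Replicate the batch ids for the two assets profiled in the "random"
    # datasource; all three ids appear as file names in the rendered docs.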
    f1_profiled_batch_id = PathBatchKwargs({
        "path": os.path.join(project_dir, "data/random/f1.csv"),
        "datasource": "random",
    }).to_id()

    f2_profiled_batch_id = PathBatchKwargs({
        "path": os.path.join(project_dir, "data/random/f2.csv"),
        "datasource": "random",
    }).to_id()

    data_docs_dir = os.path.join(project_dir,
                                 "great_expectations/uncommitted/data_docs")
    observed = gen_directory_tree_str(data_docs_dir)
    assert observed == """\
data_docs/
    local_site/
        index.html
        expectations/
            random/
                subdir_reader/
                    f1/
                        BasicDatasetProfiler.html
                    f2/
                        BasicDatasetProfiler.html
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler.html
        static/
            fonts/
                HKGrotesk/
                    HKGrotesk-Bold.otf
                    HKGrotesk-BoldItalic.otf
                    HKGrotesk-Italic.otf
                    HKGrotesk-Light.otf
                    HKGrotesk-LightItalic.otf
                    HKGrotesk-Medium.otf
                    HKGrotesk-MediumItalic.otf
                    HKGrotesk-Regular.otf
                    HKGrotesk-SemiBold.otf
                    HKGrotesk-SemiBoldItalic.otf
            images/
                favicon.ico
                glossary_scroller.gif
                iterative-dev-loop.png
                logo-long-vector.svg
                logo-long.png
                short-logo-vector.svg
                short-logo.png
                validation_failed_unexpected_values.gif
            styles/
                data_docs_custom_styles_template.css
                data_docs_default_styles.css
        validations/
            random/
                subdir_reader/
                    f1/
                        BasicDatasetProfiler/
                            profiling/
                                {0:s}.html
                    f2/
                        BasicDatasetProfiler/
                            profiling/
                                {1:s}.html
            titanic/
                subdir_reader/
                    Titanic/
                        BasicDatasetProfiler/
                            profiling/
                                {2:s}.html
""".format(f1_profiled_batch_id, f2_profiled_batch_id,
           titanic_profiled_batch_id)

    # Save the rendered data_docs locally for inspection. copytree requires
    # that the destination not exist yet, so clear any output left over from
    # a previous run first.
    safe_mmkdir("./tests/data_context/output")
    if os.path.isdir("./tests/data_context/output/data_docs"):
        shutil.rmtree("./tests/data_context/output/data_docs")
    shutil.copytree(os.path.join(ge_directory, "uncommitted/data_docs/"),
                    "./tests/data_context/output/data_docs")