Example #1
def test_sqlalchemy_datasource_processes_dataset_options(
        test_db_connection_string):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string})
    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": False})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False

    batch_kwargs = datasource.process_batch_parameters(
        dataset_options={"caching": True})
    batch_kwargs["query"] = "select * from table_1;"
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is True

    batch_kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {
            "caching": False
        },
    }
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch,
                          ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
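
Example #1 exercises dataset_options handling on a SqlAlchemyDatasource: caching can be passed through process_batch_parameters or embedded directly in the batch_kwargs. The snippets on this page assume imports such as SqlAlchemyDatasource, SparkDFDatasource, Validator, and ExpectationSuite from the great_expectations package; the exact module paths vary across releases. They also assume a test_db_connection_string pytest fixture pointing at a database that contains table_1. A minimal sketch of such a fixture, using a throwaway SQLite file (the fixture body and table contents here are assumptions, not part of the original suite):

import os

import pandas as pd
import pytest
from sqlalchemy import create_engine


@pytest.fixture
def test_db_connection_string(tmp_path):
    # Build a throwaway SQLite database containing table_1,
    # so "select * from table_1;" in the test has something to query.
    db_path = os.path.join(tmp_path, "test.db")
    engine = create_engine(f"sqlite:///{db_path}")
    pd.DataFrame({"col_1": [1, 2, 3], "col_2": ["a", "b", "c"]}).to_sql(
        "table_1", engine, index=False
    )
    return f"sqlite:///{db_path}"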
Example #2
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    datasource = SparkDFDatasource('PandasCSV', generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
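
Examples #2 and #3 (the latter shown next) are two revisions of the same test: the generators= keyword was renamed batch_kwargs_generators= in later Great Expectations releases, and #3 adds a skip when the Spark backend is unavailable. Despite the pandas-flavored test name, both construct a SparkDFDatasource, which is why they can assert on the Spark-only _persist attribute. Both rely on a test_folder_connection_path fixture pointing at a directory containing test.csv, since build_batch_kwargs("subdir_reader", name="test") resolves that name against the generator's base_directory. A minimal sketch of such a fixture (the CSV contents are an assumption):

import os

import pandas as pd
import pytest


@pytest.fixture
def test_folder_connection_path(tmp_path):
    # SubdirReaderBatchKwargsGenerator resolves name="test" to
    # <base_directory>/test.csv, so write one small CSV there.
    df = pd.DataFrame({"col_1": [1, 2], "col_2": ["a", "b"]})
    df.to_csv(os.path.join(tmp_path, "test.csv"), index=False)
    return str(tmp_path)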
Example #3
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path, test_backends):
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource('PandasCSV', batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path
            }
        }
    )
    batch_kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    batch_kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(batch_kwargs)
    validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
    dataset = validator.get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
Example #4
    def _run_suite(
        self,
        dataset_name: str,
        dataset_path: Optional[str],
        df: Any,
        target_expectation_suite_name: str,
        run_id: str,
    ):
        target_suite = self.expectation_context.get_expectation_suite(
            target_expectation_suite_name)
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_kwargs = {"datasource": generate_datasource_name(dataset_name)}

        if dataset_path:
            data_asset_name, _ = os.path.splitext(
                os.path.basename(dataset_path))
            batch_kwargs["path"] = str(dataset_path)
            batch_kwargs["data_asset_name"] = data_asset_name

        batch = Batch(
            "kedro",
            batch_kwargs=BatchKwargs(batch_kwargs),
            data=df,
            batch_parameters=None,
            batch_markers=batch_markers,
            data_context=self.expectation_context,
        )

        try:
            v = Validator(
                batch=batch,
                expectation_suite=target_suite,
            )
        except ValueError:
            raise UnsupportedDataSet

        validator_dataset_batch = v.get_dataset()
        return self.expectation_context.run_validation_operator(
            "action_list_operator", [validator_dataset_batch], run_id=run_id)
Example #5
def _run_suite(self, dataset, target_expectation_suite_name, run_id):
    class_name = self._get_ge_class_name(dataset)
    target_suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name)
    df = dataset.load()
    batch = Batch(
        'kedro', BatchKwargs({
            'path': 'kedro',
            'datasource': 'kedro'
        }), df, None,
        BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        }), self.expectation_context)
    v = Validator(batch, target_suite, {
        'module_name': 'great_expectations.dataset',
        'class_name': class_name
    })
    vgdf = v.get_dataset()
    self.expectation_context.run_validation_operator(
        'action_list_operator', [vgdf], run_id=run_id)
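
Example #5 is a leaner variant of the same _run_suite idea: it passes a {'module_name', 'class_name'} mapping as the Validator's third positional argument (the expectation engine in this era of the API), so get_dataset() wraps the loaded DataFrame in the named great_expectations.dataset class. _get_ge_class_name is again a project helper; a hedged sketch of the mapping it might perform (the type check below is an assumption):

def _get_ge_class_name(self, dataset) -> str:
    # Hypothetical mapping from the Kedro dataset's type to the
    # great_expectations.dataset class that should wrap its data.
    if "Spark" in type(dataset).__name__:
        return "SparkDFDataset"
    return "PandasDataset"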