def test_sqlalchemy_datasource_processes_dataset_options(test_db_connection_string):
    """Verify SqlAlchemyDatasource honors the ``caching`` dataset option.

    Covers both delivery paths: options produced by
    ``process_batch_parameters`` and options embedded directly in the
    batch_kwargs dict.
    """
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy", credentials={"url": test_db_connection_string}
    )

    def _dataset_for(kwargs):
        # Build a batch, wrap it in a validator, and return the underlying dataset.
        batch = datasource.get_batch(kwargs)
        validator = Validator(batch, ExpectationSuite(expectation_suite_name="foo"))
        return validator.get_dataset()

    # caching disabled via process_batch_parameters
    kwargs = datasource.process_batch_parameters(dataset_options={"caching": False})
    kwargs["query"] = "select * from table_1;"
    assert _dataset_for(kwargs).caching is False

    # caching enabled via process_batch_parameters
    kwargs = datasource.process_batch_parameters(dataset_options={"caching": True})
    kwargs["query"] = "select * from table_1;"
    assert _dataset_for(kwargs).caching is True

    # caching disabled via dataset_options placed directly in batch_kwargs
    kwargs = {
        "query": "select * from table_1;",
        "dataset_options": {"caching": False},
    }
    assert _dataset_for(kwargs).caching is False
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path):
    """Check that ``dataset_options`` in batch_kwargs reach the built dataset.

    NOTE(review): despite the "pandas" name, this exercises SparkDFDatasource —
    the test name looks stale; confirm and consider renaming. Another test
    with this exact name elsewhere in the file will shadow this one.
    """
    datasource = SparkDFDatasource(
        'PandasCSV',
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    # Both caching and persist should be forwarded into the dataset.
    kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(kwargs)
    suite = ExpectationSuite(expectation_suite_name="foo")
    dataset = Validator(batch, suite).get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def test_pandas_datasource_processes_dataset_options(test_folder_connection_path, test_backends):
    """Check that ``dataset_options`` in batch_kwargs reach the built dataset.

    Skipped entirely when the Spark backend is not enabled for this run.
    """
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")
    datasource = SparkDFDatasource(
        'PandasCSV',
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    kwargs = datasource.build_batch_kwargs("subdir_reader", name="test")
    # Both caching and persist should be forwarded into the dataset.
    kwargs["dataset_options"] = {"caching": False, "persist": False}
    batch = datasource.get_batch(kwargs)
    suite = ExpectationSuite(expectation_suite_name="foo")
    dataset = Validator(batch, suite).get_dataset()
    assert dataset.caching is False
    assert dataset._persist is False
def _run_suite(
    self,
    dataset_name: str,
    dataset_path: Optional[str],
    df: Any,
    target_expectation_suite_name: str,
    run_id: str,
):
    """Validate ``df`` against the named expectation suite.

    Builds a Batch around ``df``, wraps it in a Validator, and runs the
    ``action_list_operator`` validation operator over the resulting dataset.

    Raises:
        UnsupportedDataSet: if the Validator rejects ``df``'s type.
    """
    suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name
    )
    markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    kwargs = {"datasource": generate_datasource_name(dataset_name)}
    if dataset_path:
        # Data asset name is the file's base name without its extension.
        asset_name, _ = os.path.splitext(os.path.basename(dataset_path))
        kwargs["path"] = str(dataset_path)
        kwargs["data_asset_name"] = asset_name
    batch = Batch(
        "kedro",
        batch_kwargs=BatchKwargs(kwargs),
        data=df,
        batch_parameters=None,
        batch_markers=markers,
        data_context=self.expectation_context,
    )
    try:
        validator = Validator(
            batch=batch,
            expectation_suite=suite,
        )
    except ValueError:
        # Validator raises ValueError for data it cannot wrap in a dataset.
        raise UnsupportedDataSet
    return self.expectation_context.run_validation_operator(
        "action_list_operator", [validator.get_dataset()], run_id=run_id
    )
def _run_suite(self, dataset, target_expectation_suite_name, run_id):
    """Load ``dataset`` and validate it against the target expectation suite.

    The loaded dataframe is wrapped in a Batch and a Validator (using the GE
    dataset class resolved for ``dataset``), then the ``action_list_operator``
    validation operator is run over the resulting dataset.
    """
    class_name = self._get_ge_class_name(dataset)
    suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name
    )
    df = dataset.load()
    markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    batch = Batch(
        'kedro',
        BatchKwargs({'path': 'kedro', 'datasource': 'kedro'}),
        df,
        None,
        markers,
        self.expectation_context,
    )
    validator = Validator(
        batch,
        suite,
        {
            'module_name': 'great_expectations.dataset',
            'class_name': class_name,
        },
    )
    self.expectation_context.run_validation_operator(
        'action_list_operator', [validator.get_dataset()], run_id=run_id
    )