def test_read_limit(test_folder_connection_path_csv):
    """Passing limit=1 through process_batch_parameters truncates the batch to one row."""
    source = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    kwargs = PathBatchKwargs(
        {
            "path": os.path.join(str(test_folder_connection_path_csv), "test.csv"),
            "reader_options": {"sep": ","},
        }
    )
    # Fold the limit-derived reader options into the hand-built kwargs.
    nested_update(kwargs, source.process_batch_parameters(limit=1))

    limited_batch = source.get_batch(batch_kwargs=kwargs)
    assert isinstance(limited_batch, Batch)

    frame = limited_batch.data
    assert (frame["col_1"] == [1]).all()
    assert len(frame) == 1

    # A datasource should always return an object with a typed batch_id.
    assert isinstance(limited_batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(limited_batch.batch_markers, BatchMarkers)
def test_standalone_pandas_datasource(test_folder_connection_path):
    """A standalone PandasDatasource lists its assets and returns typed batch identifiers."""
    source = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path)
    assert source.get_available_data_asset_names() == {"default": ["test"]}

    hand_built_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))
    # The default (subdir_path) generator should yield the same path we built by hand.
    default_generator = source.get_generator()
    generated_kwargs = default_generator.yield_batch_kwargs("test")
    assert hand_built_kwargs["path"] == generated_kwargs["path"]

    # Include some extra kwargs...
    # Note that we are using get_data_asset NOT get_batch here, since we are
    # standalone (no batch concept).
    asset = source.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=generated_kwargs,
        sep=",",
        header=0,
        index_col=0,
    )
    assert isinstance(asset, PandasDataset)
    assert (asset["col_1"] == [1, 2, 3, 4, 5]).all()

    # A datasource should always return an object with a typed batch_id.
    assert isinstance(asset.batch_kwargs, PathBatchKwargs)
    assert isinstance(asset.batch_id, BatchId)
    assert isinstance(asset.batch_fingerprint, BatchFingerprint)
def test_process_batch_parameters():
    """limit maps to the reader's nrows option; dataset_options pass through untouched."""
    limit_kwargs = PandasDatasource("test").process_batch_parameters(limit=1)
    assert limit_kwargs == {"reader_options": {"nrows": 1}}

    option_kwargs = PandasDatasource("test").process_batch_parameters(
        dataset_options={"caching": False}
    )
    assert option_kwargs == {"dataset_options": {"caching": False}}
def test_read_limit(test_folder_connection_path):
    """A limit of 1 inside batch_kwargs restricts the loaded asset to a single row."""
    source = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path)
    limited_kwargs = PathBatchKwargs(
        {
            "path": os.path.join(str(test_folder_connection_path), "test.csv"),
            "limit": 1,
        }
    )
    asset = source.get_data_asset(
        "test",
        generator_name="default",
        batch_kwargs=limited_kwargs,
        reader_options={'sep': ",", 'header': 0, 'index_col': 0},
    )
    assert isinstance(asset, PandasDataset)
    assert (asset["col_1"] == [1]).all()
    assert len(asset) == 1

    # A datasource should always return an object with a typed batch_id.
    assert isinstance(asset.batch_kwargs, PathBatchKwargs)
    assert isinstance(asset.batch_id, BatchId)
    assert isinstance(asset.batch_fingerprint, BatchFingerprint)
def _add_pandas_datasource(datasource_name: str, dataset: AbstractDataSet, ge_context: DataContext) -> str:
    """Register a PandasDatasource for *dataset*'s parent directory in *ge_context*.

    Args:
        datasource_name: Name under which the datasource is added.
        dataset: Data set whose ``_filepath`` parent directory is exposed.
        ge_context: Great Expectations data context to register with.

    Returns:
        The datasource name that was registered.

    Raises:
        ge_exceptions.GreatExpectationsError: If the built configuration fails
            schema validation.
    """
    from great_expectations.datasource import PandasDatasource

    path = str(dataset._filepath.parent)
    if path.startswith("./"):
        path = path[2:]
    configuration = PandasDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                # great_expectations.yml lives one directory below the project
                # root, so the base directory is expressed relative to it.
                "base_directory": os.path.join("..", path),
            }
        }
    )
    configuration["class_name"] = "PandasDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # BUG FIX: ``errors`` is a dict of validation messages; the previous
        # "{0:s}" format spec only accepts str and raised TypeError here,
        # hiding the real validation failure. Use str() conversion instead.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {0!s}".format(errors))
    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def _add_pandas_datasource(context):
    """Prompt for a base directory and register it as a PandasDatasource on *context*.

    Returns the chosen datasource name.
    """
    path = click.prompt(
        msg_prompt_filesys_enter_base_path,
        type=click.Path(exists=True, file_okay=False, dir_okay=True, readable=True),
        show_default=True,
    )
    if path.startswith("./"):
        path = path[2:]

    # Default the datasource name to the directory's basename plus a suffix.
    basenamepath = path[:-1] if path.endswith("/") else path
    default_data_source_name = os.path.basename(basenamepath) + "__dir"
    data_source_name = click.prompt(
        msg_prompt_datasource_name,
        default=default_data_source_name,
        show_default=True,
    )

    # The context's yml lives one level below the project root, hence "..".
    configuration = PandasDatasource.build_configuration(
        base_directory=os.path.join("..", path))
    context.add_datasource(
        name=data_source_name,
        class_name='PandasDatasource',
        **configuration,
    )
    return data_source_name
def test_pandas_datasource_processes_dataset_options(
        test_folder_connection_path):
    """dataset_options in batch_kwargs propagate to the validated dataset (caching off)."""
    source = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    kwargs = source.build_batch_kwargs("subdir_reader", name="test")
    kwargs["dataset_options"] = {"caching": False}

    loaded_batch = source.get_batch(kwargs)
    suite = ExpectationSuite(expectation_suite_name="foo")
    checked = Validator(loaded_batch, suite).get_dataset()
    # The caching flag from dataset_options must survive into the dataset.
    assert checked.caching is False
def test_standalone_pandas_datasource(test_folder_connection_path):
    """A generator-configured PandasDatasource lists assets and yields typed batches."""
    source = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path,
            }
        },
    )
    expected_assets = {
        "subdir_reader": {
            "names": [("test", "file")],
            "is_complete_list": True,
        }
    }
    assert source.get_available_data_asset_names() == expected_assets

    hand_built_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path), "test.csv"))
    # The generator's kwargs should point at the same file we located by hand.
    generator = source.get_batch_kwargs_generator("subdir_reader")
    generated_kwargs = generator.yield_batch_kwargs("test")
    assert hand_built_kwargs["path"] == generated_kwargs["path"]

    # Include some extra kwargs...
    generated_kwargs.update(
        {"reader_options": {
            "sep": ",",
            "header": 0,
            "index_col": 0
        }})
    loaded_batch = source.get_batch(batch_kwargs=generated_kwargs)
    assert isinstance(loaded_batch, Batch)

    frame = loaded_batch.data
    assert (frame["col_1"] == [1, 2, 3, 4, 5]).all()
    assert len(frame) == 5

    # A datasource should always return an object with a typed batch_id.
    assert isinstance(loaded_batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(loaded_batch.batch_markers, BatchMarkers)
def test_invalid_reader_pandas_datasource(tmp_path_factory):
    """Unrecognized extensions fail without a reader_method; an explicit one succeeds."""
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    source = PandasDatasource(
        "mypandassource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": basepath,
            }
        },
    )
    # CSV content hidden behind an extension no reader is registered for.
    odd_path = os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized")
    with open(odd_path, "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    # No reader_method and no recognizable extension: cannot infer a reader.
    with pytest.raises(BatchKwargsError) as exc:
        source.get_batch(batch_kwargs={"path": odd_path})
    assert "Unable to determine reader for path" in exc.value.message

    # An unknown reader_method is rejected explicitly.
    with pytest.raises(BatchKwargsError) as exc:
        source.get_batch(
            batch_kwargs={
                "path": odd_path,
                "reader_method": "blarg",
            })
    assert "Unknown reader method: blarg" in exc.value.message

    # Naming a valid reader_method lets the file load despite its extension.
    loaded_batch = source.get_batch(
        batch_kwargs={
            "path": odd_path,
            "reader_method": "read_csv",
            "reader_options": {
                "header": 0
            },
        })
    assert loaded_batch.data["a"][0] == 1
def test_standalone_pandas_datasource(test_folder_connection_path):
    """A base-directory PandasDatasource finds its asset and loads the CSV via get_batch."""
    source = PandasDatasource('PandasCSV', base_directory=test_folder_connection_path)
    assert source.get_available_data_asset_names() == {"default": {"test"}}

    hand_built_kwargs = source.build_batch_kwargs(
        os.path.join(str(test_folder_connection_path), "test.csv"))
    # Get the default (subdir_path) generator.
    generator = source.get_generator()
    generated_kwargs = generator.yield_batch_kwargs("test")
    assert hand_built_kwargs["path"] == generated_kwargs["path"]

    # Include some extra reader kwargs when fetching the batch.
    loaded = source.get_batch(
        "test",
        batch_kwargs=generated_kwargs,
        sep=",",
        header=0,
        index_col=0,
    )
    assert isinstance(loaded, PandasDataset)
    assert (loaded["col_1"] == [1, 2, 3, 4, 5]).all()
def test_invalid_reader_pandas_datasource(tmp_path_factory):
    """Unrecognized extensions fail without a reader_method; an explicit one succeeds.

    BUG FIX: the assertions previously read ``exc.message``, but pytest's
    ``ExceptionInfo`` object has no ``message`` attribute — the raised
    exception is reached via ``exc.value`` (as the newer variant of this
    test does), so the old asserts raised AttributeError instead of checking
    the error text.
    """
    basepath = str(
        tmp_path_factory.mktemp("test_invalid_reader_pandas_datasource"))
    datasource = PandasDatasource('mypandassource', base_directory=basepath)

    # CSV content hidden behind an extension no reader is registered for.
    with open(os.path.join(basepath, "idonotlooklikeacsvbutiam.notrecognized"),
              "w") as newfile:
        newfile.write("a,b\n1,2\n3,4\n")

    # No reader_method and no recognizable extension: cannot infer a reader.
    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized",
                             expectation_suite_name="default",
                             batch_kwargs={
                                 "path": os.path.join(
                                     basepath,
                                     "idonotlooklikeacsvbutiam.notrecognized")
                             })
    assert "Unable to determine reader for path" in exc.value.message

    # An unknown reader_method is rejected explicitly.
    with pytest.raises(BatchKwargsError) as exc:
        datasource.get_batch("idonotlooklikeacsvbutiam.notrecognized",
                             expectation_suite_name="default",
                             batch_kwargs={
                                 "path": os.path.join(
                                     basepath,
                                     "idonotlooklikeacsvbutiam.notrecognized")
                             },
                             reader_method="blarg")
    assert "Unknown reader method: blarg" in exc.value.message

    # Naming a valid reader_method lets the file load despite its extension.
    dataset = datasource.get_batch(
        "idonotlooklikeacsvbutiam.notrecognized",
        expectation_suite_name="default",
        batch_kwargs={
            "path": os.path.join(basepath,
                                 "idonotlooklikeacsvbutiam.notrecognized")
        },
        reader_method="csv",
        header=0)
    assert dataset["a"][0] == 1
def test_infer_default_options_partial_functions(reader_fn):
    """A bare partial of a reader infers the same default options as the reader itself."""
    source = PandasDatasource()
    wrapped_reader = partial(reader_fn)
    expected = source._infer_default_options(reader_fn, {})
    assert source._infer_default_options(wrapped_reader, {}) == expected
def basic_pandas_datasource():
    """Fixture: a minimally configured PandasDatasource."""
    fixture_source = PandasDatasource("basic_pandas_datasource")
    return fixture_source
def test_process_batch_parameters():
    """A limit parameter is translated into the pandas reader's nrows option."""
    produced = PandasDatasource("test").process_batch_parameters(limit=1)
    expected = {"reader_options": {"nrows": 1}}
    assert produced == expected
def _add_pandas_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    """Add a Pandas datasource to *context* through the CLI flow.

    Args:
        context: Great Expectations data context to register with.
        passthrough_generator_only: When True, add a generic "files_datasource"
            with no batch-kwargs generator; otherwise prompt for a directory
            and wire up a SubdirReaderBatchKwargsGenerator.
        prompt_for_datasource_name: When True (and prompting for a path), ask
            the user to confirm/override the derived datasource name.

    Returns:
        The name of the datasource that was added.

    Raises:
        ge_exceptions.GreatExpectationsError: If the built configuration fails
            schema validation.
    """
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "pandas"},
        success=True,
    )
    if passthrough_generator_only:
        datasource_name = "files_datasource"
        configuration = PandasDatasource.build_configuration()
    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        )
        if path.startswith("./"):
            path = path[2:]
        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path
        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )
        configuration = PandasDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    # great_expectations.yml lives one level below the project
                    # root, so the generator path is relative to it.
                    "base_directory": os.path.join("..", path),
                }
            }
        )
    configuration["class_name"] = "PandasDatasource"
    configuration["module_name"] = "great_expectations.datasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # BUG FIX: ``errors`` is a dict of validation messages; the previous
        # "{:s}" format spec only accepts str and raised TypeError here,
        # hiding the real validation failure. Use str() conversion instead.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {!s}".format(errors)
        )
    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), " "),
        )
    )
    toolkit.confirm_proceed_or_exit(
        continuation_message="Okay, exiting now. To learn more about adding datasources, run great_expectations "
        "datasource --help or visit https://docs.greatexpectations.io/"
    )
    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name