def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    """Smoke-test parquet reads through SparkDFDatasource, including the limit batch kwarg."""
    # Ensure a sparksession exists
    assert spark_session

    datasource = SparkDFDatasource(
        "SparkParquet",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_parquet_folder_connection_path,
            }
        },
    )
    names = datasource.get_available_data_asset_names()["subdir_reader"]["names"]
    assert names == [("test", "file")]

    # The same parquet file backs both the plain and the limited batch below.
    parquet_path = os.path.join(test_parquet_folder_connection_path, "test.parquet")

    batch = datasource.get_batch(batch_kwargs={"path": parquet_path})
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == 1
    assert batch.data.count() == 5

    # Limit should also work
    limited_batch = datasource.get_batch(
        batch_kwargs={"path": parquet_path, "limit": 2}
    )
    assert isinstance(limited_batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert limited_batch.data.head()["col_1"] == 1
    assert limited_batch.data.count() == 2
def test_standalone_spark_parquet_datasource(test_parquet_folder_connection_path, spark_session):
    """Smoke-test parquet reads via the legacy base_directory SparkDFDatasource API."""
    assert spark_session  # Ensure a sparksession exists

    datasource = SparkDFDatasource(
        "SparkParquet", base_directory=test_parquet_folder_connection_path
    )
    assert datasource.get_available_data_asset_names() == {"default": ["test"]}

    # Both batches below read the same file; only the limit differs.
    parquet_path = os.path.join(test_parquet_folder_connection_path, "test.parquet")

    dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={"path": parquet_path},
    )
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()["col_1"] == 1
    assert dataset.spark_df.count() == 5

    # Limit should also work
    limited_dataset = datasource.get_batch(
        "test",
        expectation_suite_name="default",
        batch_kwargs={"path": parquet_path, "limit": 2},
    )
    assert isinstance(limited_dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert limited_dataset.spark_df.head()["col_1"] == 1
    assert limited_dataset.spark_df.count() == 2
def test_standalone_spark_csv_datasource(test_folder_connection_path_csv, test_backends):
    """Smoke-test CSV reads through SparkDFDatasource; skipped without the Spark backend."""
    if "SparkDFDataset" not in test_backends:
        pytest.skip("Spark has not been enabled, so this test must be skipped.")

    datasource = SparkDFDatasource(
        "SparkParquet",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )
    asset_names = datasource.get_available_data_asset_names()
    assert asset_names["subdir_reader"]["names"] == [("test", "file")]

    csv_path = os.path.join(test_folder_connection_path_csv, "test.csv")
    batch = datasource.get_batch(
        batch_kwargs={
            "path": csv_path,
            "reader_options": {"header": True},
        }
    )
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()["col_1"] == "1"
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    """Smoke-test CSV reads via the legacy base_directory SparkDFDatasource API."""
    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)
    # Set literal instead of set([...]) (flake8-comprehensions C405); same value.
    assert datasource.get_available_data_asset_names() == {
        "default": {'test'}
    }
    dataset = datasource.get_batch('test', header=True)
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    """Smoke-test CSV reads through SparkDFDatasource; skips when pyspark is absent."""
    # importorskip returns the imported module, which this test never uses —
    # don't bind it to a dead local.
    pytest.importorskip("pyspark")

    datasource = SparkDFDatasource('SparkParquet', base_directory=test_folder_connection_path)
    assert datasource.get_available_data_asset_names() == {
        "default": ['test']
    }
    dataset = datasource.get_batch('test',
                                   expectation_suite_name="default",
                                   batch_kwargs={
                                       "path": os.path.join(test_folder_connection_path, 'test.csv')
                                   },
                                   reader_options={"header": True})
    assert isinstance(dataset, SparkDFDataset)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert dataset.spark_df.head()['col_1'] == '1'
def test_standalone_spark_csv_datasource(test_folder_connection_path):
    """Smoke-test CSV reads via the subdir_reader generator; skips when pyspark is absent."""
    # importorskip's return value (the pyspark module) is unused — call it for
    # its skip side effect only instead of binding a dead local.
    pytest.importorskip("pyspark")

    datasource = SparkDFDatasource('SparkParquet',
                                   generators={"subdir_reader": {
                                       "class_name": "SubdirReaderBatchKwargsGenerator",
                                       "base_directory": test_folder_connection_path
                                   }
                                   }
                                   )
    assert datasource.get_available_data_asset_names()["subdir_reader"]["names"] == [('test', 'file')]
    batch = datasource.get_batch(batch_kwargs={
        "path": os.path.join(test_folder_connection_path, 'test.csv'),
        "reader_options": {"header": True}
    })
    assert isinstance(batch, Batch)
    # NOTE: below is a great example of CSV vs. Parquet typing: pandas reads content as string, spark as int
    assert batch.data.head()['col_1'] == '1'