def test_materialize_dataset_hadoop_config(tmpdir_factory):
    """Test that using materialize_dataset does not alter the hadoop_config"""
    path = tmpdir_factory.mktemp('data').strpath
    tmp_url = "file://" + path

    # With pyspark < 2.4 this test does not verify that parquet.enable.summary-metadata is restored
    spark = SparkSession.builder.getOrCreate()
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()

    parquet_metadata_level = "COMMON_ONLY"
    parquet_row_group_check = 100

    # Set the parquet summary metadata level and the minimum row group size check
    hadoop_config.set('parquet.summary.metadata.level', parquet_metadata_level)
    hadoop_config.setInt('parquet.row-group.size.row.check.min', parquet_row_group_check)
    assert hadoop_config.get('parquet.summary.metadata.level') == str(
        parquet_metadata_level)
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)

    create_test_dataset(tmp_url, range(10), spark=spark)
    assert not os.path.exists(os.path.join(path, "_metadata"))

    # Check that they are back to the original values after writing the dataset
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
    assert hadoop_config.get('parquet.summary.metadata.level') == str(
        parquet_metadata_level)
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)

    # Other options should return to being unset
    assert hadoop_config.get('parquet.block.size') is None
    assert hadoop_config.get('parquet.block.size.row.check.min') is None
    spark.stop()
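# The tests above exercise create_test_dataset, which presumably wraps the write in
# petastorm's materialize_dataset context manager. A minimal sketch of that write path
# follows, assuming the standard materialize_dataset/dict_to_spark_row API;
# _write_sketch, its arguments and the row_group_size_mb value are illustrative only.
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import dict_to_spark_row


def _write_sketch(spark, output_url, schema, row_dicts):
    # materialize_dataset saves the hadoop configuration on entry and restores it on
    # exit, which is the behaviour asserted by the hadoop_config tests above.
    with materialize_dataset(spark, output_url, schema, row_group_size_mb=256):
        rows = [dict_to_spark_row(schema, d) for d in row_dicts]
        spark.createDataFrame(rows, schema.as_spark_schema()) \
            .coalesce(1) \
            .write.mode('overwrite').parquet(output_url)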
def test_ngram_delta_small_threshold_tf():
    """Test that a small delta threshold yields no ngrams when timestamp gaps exceed it (TF path)."""
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
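# Why the readers in the two small-threshold tests yield nothing: the dataset ids are
# 0, 5, 10, ..., so consecutive timestamps differ by 5, which exceeds delta_threshold=1,
# and no ngram can be formed. A hedged sketch of that check; _ngram_gap_ok is a
# hypothetical helper for illustration, not petastorm code.
def _ngram_gap_ok(timestamps, delta_threshold=1):
    # An ngram is only emitted when every consecutive timestamp gap is within the threshold.
    return all(b - a <= delta_threshold for a, b in zip(timestamps, timestamps[1:]))


assert _ngram_gap_ok([0, 1])        # a gap of 1 would be accepted
assert not _ngram_gap_ok([0, 5])    # a gap of 5 > 1, so these tests expect no ngrams at all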
def test_materialize_dataset_hadoop_config(synthetic_dataset):
    """Test that using materialize_dataset does not alter the hadoop_config"""
    spark = SparkSession.builder.getOrCreate()
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()

    parquet_summary_metadata = False
    parquet_row_group_check = 100

    # Set the parquet summary metadata flag and the minimum row group size check
    hadoop_config.setBoolean('parquet.enable.summary-metadata', parquet_summary_metadata)
    hadoop_config.setInt('parquet.row-group.size.row.check.min', parquet_row_group_check)
    assert hadoop_config.get('parquet.enable.summary-metadata') == str(
        parquet_summary_metadata).lower()
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)

    destination = synthetic_dataset.path + '_moved'
    create_test_dataset('file://{}'.format(destination), range(10), spark=spark)

    # Check that they are back to the original values after writing the dataset
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
    assert hadoop_config.get('parquet.enable.summary-metadata') == str(
        parquet_summary_metadata).lower()
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)

    # Other options should return to being unset
    assert hadoop_config.get('parquet.block.size') is None
    assert hadoop_config.get('parquet.block.size.row.check.min') is None
    spark.stop()
    rmtree(destination)
def generate_dataset_for_legacy_test():
    """Generates a test dataset and stores it in the petastorm/tests/data/legacy/x.x.x folder.
    The version number is acquired automatically from petastorm.__version__"""
    dataset_name = petastorm.__version__
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'legacy', dataset_name)
    url = 'file://' + path
    create_test_dataset(url, range(100))
def test_materialize_with_summary_metadata(tmpdir_factory):
    """Verify that the _metadata summary file appears when requested"""
    path = tmpdir_factory.mktemp('data').strpath
    tmp_url = "file://" + path

    spark = SparkSession.builder.getOrCreate()
    create_test_dataset(tmp_url, range(10), spark=spark, use_summary_metadata=True)

    assert os.path.exists(os.path.join(path, "_metadata"))
    spark.stop()
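# For context: use_summary_metadata=True is presumably forwarded by create_test_dataset
# to materialize_dataset, e.g. (a sketch under that assumption, not the actual helper body):
#
#     with materialize_dataset(spark, tmp_url, TestSchema, use_summary_metadata=True):
#         ...  # write the dataframe
#
# which makes the parquet writer emit the _metadata summary file asserted above.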
def setUpClass(cls):
    """Initializes the dataset once per test class. All tests in this class will use the same fake dataset."""
    # Write a fake dataset to this location
    cls._dataset_dir = mkdtemp('test_metadata_read')
    cls._dataset_url = 'file://{}'.format(cls._dataset_dir)
    cls._dataset_dicts = create_test_dataset(cls._dataset_url, range(ROWS_COUNT))
def dataset_num_files_1(tmpdir_factory):
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(99), num_files=1)
    dataset = SyntheticDataset(url=url, path=path, data=data)
    return dataset
def dataset_0_3_8_10_11_20_23(tmpdir_factory):
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    ids = [0, 3, 8, 10, 11, 20, 23]
    data = create_test_dataset(url, ids, num_files=1)
    dataset = SyntheticDataset(url=url, path=path, data=data)
    return dataset
def make_test_metadata(path):
    """
    Use test_common to make a dataset for the TestSchema.

    :param path: path to store the test dataset
    :return: resulting dataset as a dictionary
    """
    assert path, 'Please supply a nonempty path to store test dataset.'
    return create_test_dataset('file://{}'.format(path), range(ROWS_COUNT))
def setUpClass(cls):
    """Initializes the dataset once per test class. All tests in this class will use the same fake dataset."""
    # Write a fake dataset to this location
    cls._dataset_dir = mkdtemp('end_to_end_petastorm')
    cls._dataset_url = 'file://{}'.format(cls._dataset_dir)
    ROWS_COUNT = 1000
    cls._dataset_dicts = create_test_dataset(cls._dataset_url, range(ROWS_COUNT))

    # Remove crc files due to https://issues.apache.org/jira/browse/HADOOP-7199
    for crc_file in glob.glob(cls._dataset_dir + '/.*.crc'):
        os.remove(crc_file)
def test_ngram_delta_small_threshold():
    """Test that a small delta threshold yields no ngrams when timestamp gaps exceed it."""
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
        with Reader(schema_fields=ngram, dataset_url=tmp_url, reader_pool=ThreadPool(10)) as reader:
            with pytest.raises(StopIteration):
                next(reader)
def _synthetic_dataset_no_cache():
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(_ROWS_COUNT))
    dataset = SyntheticDataset(url=url, path=path, data=data)
    return dataset
def _dataset_generator():
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    ids = range(0, 99, 5)
    data = create_test_dataset(url, ids)
    return SyntheticDataset(url=url, path=path, data=data)
def _dataset_generator():
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    ids = [0, 3, 8, 10, 11, 20, 23]
    data = create_test_dataset(url, ids, num_files=1)
    return SyntheticDataset(url=url, path=path, data=data)
def _dataset_generator():
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(99), num_files=1)
    return SyntheticDataset(url=url, path=path, data=data)
def synthetic_dataset(tmpdir_factory):
    path = tmpdir_factory.mktemp('data').strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(ROWS_COUNT))
    return SyntheticDataset(url=url, path=path, data=data)
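# A hedged sketch of how a test might consume a fixture like synthetic_dataset above.
# make_reader and reader_pool_type are petastorm API; the test name and assertion body
# are illustrative only and assume create_test_dataset returns the written rows as dicts.
def _example_reading_test(synthetic_dataset):
    from petastorm import make_reader
    with make_reader(synthetic_dataset.url, reader_pool_type='dummy') as reader:
        expected_ids = {row['id'] for row in synthetic_dataset.data}
        for actual in reader:
            # Rows come back as namedtuples keyed by the TestSchema field names.
            assert actual.id in expected_ids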