def test_materialize_dataset_hadoop_config(tmpdir_factory):
    """Test that using materialize_dataset does not alter the hadoop_config"""

    path = tmpdir_factory.mktemp('data').strpath
    tmp_url = "file://" + path
    # Note: with pyspark < 2.4 this test cannot properly check that parquet.enable.summary-metadata is restored
    spark = SparkSession.builder.getOrCreate()
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()

    parquet_metadata_level = "COMMON_ONLY"
    parquet_row_group_check = 100

    # Set the parquet summary metadata level and the row group size check minimum
    hadoop_config.set('parquet.summary.metadata.level', parquet_metadata_level)
    hadoop_config.setInt('parquet.row-group.size.row.check.min',
                         parquet_row_group_check)
    assert hadoop_config.get('parquet.summary.metadata.level') == str(
        parquet_metadata_level)
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)

    create_test_dataset(tmp_url, range(10), spark=spark)

    assert not os.path.exists(os.path.join(path, "_metadata"))

    # Check that they are back to the original values after writing the dataset
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
    assert hadoop_config.get('parquet.summary.metadata.level') == str(
        parquet_metadata_level)
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)
    # Other options should return to being unset
    assert hadoop_config.get('parquet.block.size') is None
    assert hadoop_config.get('parquet.block.size.row.check.min') is None
    spark.stop()
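
A note on what these assertions exercise: create_test_dataset wraps its write in petastorm's materialize_dataset context manager, which snapshots the hadoop configuration on entry and restores it on exit. Below is a minimal sketch of that pattern, using a hypothetical single-field SampleSchema rather than the real TestSchema.

# Sketch of the materialize_dataset pattern exercised by create_test_dataset.
# SampleSchema and write_sample_dataset are illustrative stand-ins, not petastorm APIs.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

SampleSchema = Unischema('SampleSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
])

def write_sample_dataset(output_url, ids, rowgroup_size_mb=256):
    spark = SparkSession.builder.master('local[2]').getOrCreate()
    sc = spark.sparkContext
    # The context manager saves the hadoop configuration, configures the parquet
    # writer, and restores the original configuration when the block exits.
    with materialize_dataset(spark, output_url, SampleSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(ids) \
            .map(lambda i: {'id': i}) \
            .map(lambda d: dict_to_spark_row(SampleSchema, d))
        spark.createDataFrame(rows_rdd, SampleSchema.as_spark_schema()) \
            .write.mode('overwrite').parquet(output_url)
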
Example #2
def test_ngram_delta_small_threshold_tf():
    """Test to verify that a small threshold work in ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
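
When the delta threshold is permissive enough for ngrams to actually form, the same tf_tensors graph can be drained in a loop until the reader is exhausted. A hedged sketch, assuming a reader built exactly as above but with a larger delta_threshold:

# Sketch: consuming ngram tensors until the reader runs out of data.
# Assumes `reader`, `tf`, `tf_tensors` and OutOfRangeError are set up as in the test above.
ngram_tensors = tf_tensors(reader)  # dict: timestep offset -> namedtuple of tensors
with tf.Session() as sess:
    try:
        while True:
            ngram = sess.run(ngram_tensors)
            # Offsets 0 and 1 hold consecutive rows ordered by the timestamp field (id).
            print(ngram[0].id, ngram[1].id)
    except OutOfRangeError:
        pass  # all ngrams have been read
reader.stop()
reader.join()
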
Example #3
def test_materialize_dataset_hadoop_config(synthetic_dataset):
    """Test that using materialize_dataset does not alter the hadoop_config"""
    spark = SparkSession.builder.getOrCreate()
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()

    parquet_summary_metadata = False
    parquet_row_group_check = 100

    # Set the parquet summary metadata flag and the row group size check minimum
    hadoop_config.setBoolean('parquet.enable.summary-metadata',
                             parquet_summary_metadata)
    hadoop_config.setInt('parquet.row-group.size.row.check.min',
                         parquet_row_group_check)
    assert hadoop_config.get('parquet.enable.summary-metadata') == str(
        parquet_summary_metadata).lower()
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)
    destination = synthetic_dataset.path + '_moved'
    create_test_dataset('file://{}'.format(destination),
                        range(10),
                        spark=spark)

    # Check that they are back to the original values after writing the dataset
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
    assert hadoop_config.get('parquet.enable.summary-metadata') == str(
        parquet_summary_metadata).lower()
    assert hadoop_config.get('parquet.row-group.size.row.check.min') == str(
        parquet_row_group_check)
    # Other options should return to being unset
    assert hadoop_config.get('parquet.block.size') is None
    assert hadoop_config.get('parquet.block.size.row.check.min') is None
    spark.stop()
    rmtree(destination)
Example #4
def generate_dataset_for_legacy_test():
    """Generates a test dataset and stores it into petastorm/tests/data/legacy/x.x.x folder. The version number
    is acquired automatically from petastorm.__version__"""
    dataset_name = petastorm.__version__
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data',
                        'legacy', dataset_name)
    url = 'file://' + path

    create_test_dataset(url, range(100))
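
A backward-compatibility test could then read every stored legacy dataset back with the current code. A hedged sketch (the loop and row-count assertion are illustrative, and it assumes the newer make_reader entry point rather than the Reader class used in some examples here):

# Sketch: reading back each generated legacy dataset and checking the row count.
from petastorm import make_reader

legacy_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'legacy')
for dataset_name in os.listdir(legacy_root):
    url = 'file://' + os.path.join(legacy_root, dataset_name)
    with make_reader(url, reader_pool_type='dummy') as reader:
        assert len(list(reader)) == 100  # generate_dataset_for_legacy_test wrote range(100)
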
Example #5
def test_materialize_with_summary_metadata(tmpdir_factory):
    """Verify that the parquet summary file (_metadata) is written when requested"""
    path = tmpdir_factory.mktemp('data').strpath
    tmp_url = "file://" + path

    spark = SparkSession.builder.getOrCreate()
    create_test_dataset(tmp_url,
                        range(10),
                        spark=spark,
                        use_summary_metadata=True)

    assert os.path.exists(os.path.join(path, "_metadata"))
    spark.stop()
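
The _metadata file asserted on here is a standard parquet summary file, so it can be inspected directly with pyarrow if needed. A small sketch, reusing the path written in the test above and assuming pyarrow is available:

# Sketch: inspecting the parquet summary file written when use_summary_metadata=True.
import pyarrow.parquet as pq

summary = pq.read_metadata(os.path.join(path, '_metadata'))
print(summary.num_row_groups, summary.num_rows)
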
Example #6
    def setUpClass(cls):
        """Initializes the dataset once per test class. All tests in this class will use the same fake dataset."""
        # Write a fake dataset to this location
        cls._dataset_dir = mkdtemp('test_metadata_read')
        cls._dataset_url = 'file://{}'.format(cls._dataset_dir)
        cls._dataset_dicts = create_test_dataset(cls._dataset_url,
                                                 range(ROWS_COUNT))
Example #7
def dataset_num_files_1(tmpdir_factory):
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(99), num_files=1)
    dataset = SyntheticDataset(url=url, path=path, data=data)

    return dataset
Example #8
def dataset_0_3_8_10_11_20_23(tmpdir_factory):
    path = tmpdir_factory.mktemp("data").strpath
    url = 'file://' + path
    ids = [0, 3, 8, 10, 11, 20, 23]
    data = create_test_dataset(url, ids, num_files=1)
    dataset = SyntheticDataset(url=url, path=path, data=data)

    return dataset
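
Fixtures with a known, sparse id set like this one are convenient for exercising row filtering. A hedged usage sketch follows; the test name and selected ids are illustrative, and it assumes the make_reader entry point together with petastorm's in_lambda predicate:

# Sketch: filtering the fixture dataset by id with a reader predicate.
from petastorm import make_reader
from petastorm.predicates import in_lambda

def test_reads_only_selected_ids(dataset_0_3_8_10_11_20_23):
    wanted = {8, 10, 11}
    predicate = in_lambda(['id'], lambda id: id in wanted)
    with make_reader(dataset_0_3_8_10_11_20_23.url, predicate=predicate,
                     reader_pool_type='dummy') as reader:
        assert {row.id for row in reader} == wanted
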
Example #9
def make_test_metadata(path):
    """
    Use test_common to make a dataset for the TestSchema.

    :param path: path to store the test dataset
    :return: resulting dataset as a dictionary
    """
    assert path, 'Please supply a nonempty path to store test dataset.'
    return create_test_dataset('file://{}'.format(path), range(ROWS_COUNT))
Example #10
    def setUpClass(cls):
        """Initializes dataset once per test. All tests in this class will use the same fake dataset."""
        # Write a fake dataset to this location
        cls._dataset_dir = mkdtemp('end_to_end_petastorm')
        cls._dataset_url = 'file://{}'.format(cls._dataset_dir)
        ROWS_COUNT = 1000
        cls._dataset_dicts = create_test_dataset(cls._dataset_url, range(ROWS_COUNT))

        # Remove crc files due to https://issues.apache.org/jira/browse/HADOOP-7199
        for crc_file in glob.glob(cls._dataset_dir + '/.*.crc'):
            os.remove(crc_file)
Example #11
def test_ngram_delta_small_threshold():
    """Test to verify that a small threshold work in ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        with Reader(schema_fields=ngram,
                    dataset_url=tmp_url,
                    reader_pool=ThreadPool(10)) as reader:
            with pytest.raises(StopIteration):
                next(reader)
Example #12
    def _synthetic_dataset_no_cache():
        path = tmpdir_factory.mktemp("data").strpath
        url = 'file://' + path
        data = create_test_dataset(url, range(_ROWS_COUNT))
        dataset = SyntheticDataset(url=url, path=path, data=data)
        return dataset
Example #13
    def _dataset_generator():
        path = tmpdir_factory.mktemp("data").strpath
        url = 'file://' + path
        ids = range(0, 99, 5)
        data = create_test_dataset(url, ids)
        return SyntheticDataset(url=url, path=path, data=data)
Example #14
    def _dataset_generator():
        path = tmpdir_factory.mktemp("data").strpath
        url = 'file://' + path
        ids = [0, 3, 8, 10, 11, 20, 23]
        data = create_test_dataset(url, ids, num_files=1)
        return SyntheticDataset(url=url, path=path, data=data)
Example #15
    def _dataset_generator():
        path = tmpdir_factory.mktemp("data").strpath
        url = 'file://' + path
        data = create_test_dataset(url, range(99), num_files=1)
        return SyntheticDataset(url=url, path=path, data=data)
Example #16
def synthetic_dataset(tmpdir_factory):
    path = tmpdir_factory.mktemp('data').strpath
    url = 'file://' + path
    data = create_test_dataset(url, range(ROWS_COUNT))
    return SyntheticDataset(url=url, path=path, data=data)
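
A typical consumer of this fixture reads the dataset back and compares it against the row dicts returned by create_test_dataset. A minimal sketch (the test name is illustrative, and it assumes the newer make_reader entry point rather than the Reader class used in some of the examples above):

# Sketch: comparing reader output against the row dicts stored on the fixture.
from petastorm import make_reader

def test_round_trip_ids(synthetic_dataset):
    with make_reader(synthetic_dataset.url, reader_pool_type='dummy') as reader:
        actual_ids = {row.id for row in reader}
    expected_ids = {row['id'] for row in synthetic_dataset.data}
    assert actual_ids == expected_ids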