def test_predicate_on_dataset(tmpdir):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    blocklet_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(test_row_generator) \
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .save(path=dataset_url, format='carbon')

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def test_some_processing_functions(synthetic_dataset, reader_factory):
    """Try several ``tf.data.Dataset`` dataset operations on make_petastorm_dataset"""

    # reader1 will have a single row with id=1, reader2: a single row with id=2.
    # Using functools.partial(operator.eq, 1), which is equivalent to lambda x: x == 1, because
    # standard python pickle can not pickle such a lambda.
    with reader_factory(synthetic_dataset.url,
                        predicate=in_lambda(['id'], functools.partial(operator.eq, 1))) as reader1:
        with reader_factory(synthetic_dataset.url,
                            predicate=in_lambda(['id'], functools.partial(operator.eq, 2))) as reader2:
            dataset = make_petastorm_dataset(reader1) \
                .prefetch(10) \
                .concatenate(make_petastorm_dataset(reader2)) \
                .map(lambda x: x.id) \
                .batch(2)

            next_sample = dataset.make_one_shot_iterator().get_next()

            with tf.Session() as sess:
                # 'actual' is expected to be content of id column of a concatenated dataset
                actual = sess.run(next_sample)
                np.testing.assert_array_equal(actual, [1, 2])
def test_invalid_batch_carbon_reader_predicate_parameters(carbon_scalar_dataset):
    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  cache_type="memory-cache",
                                  predicate=in_lambda(['id2'], lambda id2: True)) as reader:
        with pytest.raises(RuntimeError):
            next(reader)

    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  predicate=in_lambda([], lambda x: False)) as reader:
        with pytest.raises(ValueError):
            next(reader)

    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  predicate=in_lambda(['not_exist_col'], lambda x: False)) as reader:
        with pytest.raises(ValueError):
            next(reader)
def read(self, table: DataFrameMetadata, columns: List[str] = None,
         predicate_func=None) -> Iterator[Batch]:
    """
    Reads the table and returns a batch iterator over the tuples that
    pass the predicate function.

    Arguments:
        table: table metadata object to read from
        columns (List[str]): a list of column names to be considered in predicate_func
        predicate_func: customized predicate function that returns bool

    Returns:
        Iterator of Batch read.
    """
    predicate = None
    if predicate_func and columns:
        predicate = in_lambda(columns, predicate_func)

    # ToDo: Handle the sharding logic. We might have to maintain a
    # context for deciding which shard to read
    petastorm_reader = PetastormReader(self._spark_url(table), predicate=predicate)
    for batch in petastorm_reader.read():
        yield batch
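# A minimal usage sketch for the read() method above. `engine` (an object exposing
# read()) and the function name are hypothetical stand-ins, not part of the snippet;
# only the read() signature and in_lambda come from the code above. Note that
# in_lambda passes each listed column's value to predicate_func positionally, so a
# single-column predicate takes a single argument.
def read_even_ids(engine, table):
    # Keep only tuples whose 'id' column is even.
    for batch in engine.read(table, columns=['id'],
                             predicate_func=lambda id: id % 2 == 0):
        print(batch)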
def test_custom_function(all_values):
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        test_predicate = in_lambda(
            ['volume_guids'],
            lambda volume_guids, val=value: val in volume_guids)
        included = test_predicate.do_include({'volume_guids': all_values})
        assert included == (value in all_values)
def test_real_reader(synthetic_dataset):
    readers = [make_reader(synthetic_dataset.url, predicate=in_lambda(['id'], lambda id: id % 2 == 0),
                           num_epochs=None, reader_pool_type='dummy'),
               make_reader(synthetic_dataset.url, predicate=in_lambda(['id'], lambda id: id % 2 == 1),
                           num_epochs=None, reader_pool_type='dummy')]
    results = [0, 0]
    num_of_reads = 300
    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        # Piggyback on this test to verify container interface of the WeightedSamplingReader
        for i, sample in enumerate(mixer):
            next_id = sample.id % 2
            results[next_id] += 1
            if i >= num_of_reads:
                break

    np.testing.assert_allclose(results, [num_of_reads * 0.5, num_of_reads * 0.5],
                               atol=num_of_reads / 10)
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the
    partition column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'id2': x + 1, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(test_row_generator) \
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3

    with make_reader(dataset_url, predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        # id2 == id + 1 in the generated data, so the row matching id2 == 5 has id == 4.
        assert next(reader).id == 4

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def test_custom_function(self):
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        test_predicate = in_lambda(
            ['volume_guids'],
            lambda volume_guids, val=value: val in volume_guids)
        included = test_predicate.do_include(
            {'volume_guids': PredicatesTest.all_values})
        self.assertEqual(included, value in PredicatesTest.all_values)
def test_predicate_on_single_column(carbon_synthetic_dataset):
    reader = make_carbon_reader(carbon_synthetic_dataset.url,
                                schema_fields=[TestSchema.id2],
                                predicate=in_lambda(['id2'], lambda id2: True),
                                reader_pool_type='thread')
    counter = 0
    for row in reader:
        counter += 1
        actual = dict(row._asdict())
        assert actual['id2'] < 2
    assert counter == len(carbon_synthetic_dataset.data)
def test_custom_function_with_state(all_values):
    counter = [0]

    def pred_func(volume_guids, cntr):
        cntr[0] += 1
        return volume_guids in all_values

    test_predicate = in_lambda(['volume_guids'], pred_func, counter)
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        included = test_predicate.do_include({'volume_guids': value})
        assert included == (value in all_values)
    assert counter[0] == 5
def test_custom_function_with_state(self):
    counter = [0]

    def pred_func(volume_guids, cntr):
        cntr[0] += 1
        return volume_guids in PredicatesTest.all_values

    test_predicate = in_lambda(['volume_guids'], pred_func, counter)
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        included = test_predicate.do_include({'volume_guids': value})
        self.assertEqual(included, value in PredicatesTest.all_values)
    self.assertEqual(counter[0], 5)
def test_transform_function_with_predicate_batched(scalar_dataset):
    def double_float64(sample):
        assert all(sample['id'] % 2 == 0)
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url,
                           transform_spec=TransformSpec(double_float64),
                           predicate=in_lambda(['id'], lambda id: id % 2 == 0)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            assert actual_id % 2 == 0
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)
def test_transform_function_with_predicate(synthetic_dataset, reader_factory):
    """Make sure we apply transform only after we apply the predicate"""
    with reader_factory(synthetic_dataset.url,
                        schema_fields=[TestSchema.id, TestSchema.id2],
                        predicate=in_lambda(['id2'], lambda id2: id2 == 1),
                        transform_spec=TransformSpec(removed_fields=['id2'])) as reader:
        rows = list(reader)
        assert 'id2' not in rows[0]._fields

        actual_ids = np.asarray(list(row.id for row in rows))
        assert actual_ids.size > 0
        # In the test data id2 = id % 2, which means we expect only odd ids to remain after
        # we apply lambda id2: id2 == 1 predicate.
        assert np.all(actual_ids % 2 == 1)
def test_transform_function_returns_a_new_dict_with_predicate(synthetic_dataset, reader_factory):
    def transform(sample):
        return {'id': sample['id'], 'id2': -1}

    with reader_factory(synthetic_dataset.url,
                        schema_fields=[TestSchema.id, TestSchema.id2],
                        predicate=in_lambda(['id2'], lambda id2: id2 == 1),
                        transform_spec=TransformSpec(func=transform)) as reader:
        rows = list(reader)

        actual_ids = np.asarray(list(row.id for row in rows))
        assert actual_ids.size > 0
        # In the test data id2 = id % 2, which means we expect only odd ids to remain after
        # we apply lambda id2: id2 == 1 predicate.
        assert np.all(actual_ids % 2 == 1)

        transformed_ids = np.asarray(list(row.id2 for row in rows))
        assert np.all(transformed_ids == -1)