def test_insert_explicit_nulls(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], 0)
    self.assertEqual(row_dict['not_nullable'], 1)

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], None)
    self.assertEqual(row_dict['not_nullable'], 1)

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    self.assertEqual(len(row_dict), 2)
    self.assertEqual(row_dict['nullable'], None)
    self.assertEqual(row_dict['not_nullable'], 1)

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with self.assertRaises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark, output_url, SimpleSchema, rowgroup_size_mb, filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(lambda x: {'id': x, 'foo': x}) \
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
def get_petastorm_column(df_column):
    column_type = df_column.type
    column_name = df_column.name
    column_is_nullable = df_column.is_nullable
    column_array_dimensions = df_column.array_dimensions

    # Reference:
    # https://github.com/uber/petastorm/blob/master/petastorm/
    # tests/test_common.py
    petastorm_column = None
    if column_type == ColumnType.INTEGER:
        petastorm_column = UnischemaField(column_name,
                                          np.int32,
                                          (),
                                          ScalarCodec(IntegerType()),
                                          column_is_nullable)
    elif column_type == ColumnType.FLOAT:
        petastorm_column = UnischemaField(column_name,
                                          np.float64,
                                          (),
                                          ScalarCodec(FloatType()),
                                          column_is_nullable)
    elif column_type == ColumnType.TEXT:
        petastorm_column = UnischemaField(column_name,
                                          np.str_,
                                          (),
                                          ScalarCodec(StringType()),
                                          column_is_nullable)
    elif column_type == ColumnType.NDARRAY:
        petastorm_column = UnischemaField(column_name,
                                          np.uint8,
                                          column_array_dimensions,
                                          NdarrayCodec(),
                                          column_is_nullable)
    else:
        LoggingManager().log("Invalid column type: " + str(column_type),
                             LoggingLevel.ERROR)

    return petastorm_column
def test_encode_scalar_float():
    codec = ScalarCodec(FloatType())
    expected = np.random.random(()).astype(np.float64)
    field = UnischemaField(name='field_float', numpy_dtype=np.float32, shape=(), codec=codec, nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, float)
    assert expected == encoded
def test_create_schema_view_using_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view([TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10,
                               uint8_scalar=20, int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10,
                               uint8_scalar=20, int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))
def test_create_schema_view_no_field_matches_regex():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['bogus'])
    assert not view.fields
def test_insert_explicit_nulls():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] == 0
    assert row_dict['not_nullable'] == 1

    # insert_explicit_nulls should leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with pytest.raises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)
def test_encode_scalar_string():
    codec = ScalarCodec(StringType())
    expected = 'surprise'
    field = UnischemaField(name='field_string', numpy_dtype=np.unicode_, shape=(), codec=codec, nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, str)
    assert expected == encoded
def test_predicate_on_dataset(tmpdir):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    blocklet_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    rows_count = 10
    with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(test_row_generator) \
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .save(path=dataset_url, format='carbon')

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def test_create_schema_view_using_regex_and_unischema_fields(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$', TestSchema.string_field])
    self.assertEqual(set(view.fields.keys()), {'int_field', 'string_field'})
def test_create_schema_view_using_invalid_type():
    """Exercises the ValueError code path of unischema.create_schema_view, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='must be either a string'):
        TestSchema.create_schema_view([42])
def test_create_schema_view_fails_validate():
    """Exercises the ValueError code path of unischema.create_schema_view, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
def test_create_schema_view_using_regex_and_unischema_fields_with_duplicates():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$', TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
def test_create_schema_view_fails_validate(self):
    """Exercises the ValueError code path of unischema.create_schema_view, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
    self.assertTrue('does not belong to the schema' in str(ex.exception))
def test_bad_unischema_field_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int', numpy_dtype=np.int32, shape=(1,), codec=codec, nullable=False)
    with pytest.raises(ValueError, match='must be an empty tuple'):
        codec.encode(field, np.int32(1))
def test_bad_encoded_data_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int', numpy_dtype=np.int32, shape=(), codec=codec, nullable=False)
    with pytest.raises(TypeError):
        codec.decode(field, codec.encode(field, np.asarray([10, 10])))
def test_encode_non_scalar_type_is_passed(non_scalar_value):
    codec = ScalarCodec(FloatType())
    field = UnischemaField(name='field_float', numpy_dtype=np.float32, shape=(), codec=codec, nullable=False)
    with pytest.raises(TypeError, match='Expected a scalar'):
        codec.encode(field, non_scalar_value)
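# The `non_scalar_value` argument above is supplied by a pytest fixture that is not shown in this
# snippet. A minimal sketch of what such a parametrized fixture could look like follows; the exact
# parameter values are an assumption, not the original fixture definition.
import numpy as np
import pytest


@pytest.fixture(params=[[10.0, 20.0], np.asarray([30.0, 40.0])])
def non_scalar_value(request):
    # Each parametrized run receives one non-scalar value (a list or an ndarray).
    return request.param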
def test_create_schema_view_using_invalid_type(self):
    """Exercises the ValueError code path of unischema.create_schema_view, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([42])
    self.assertTrue('must be either a string' in str(ex.exception))
def test_encode_scalar_bool():
    codec = ScalarCodec(BooleanType())
    # Use np.bool_ (the deprecated np.bool alias is removed in recent NumPy releases).
    field = UnischemaField(name='field_bool', numpy_dtype=np.bool_, shape=(), codec=codec, nullable=False)

    encoded = codec.encode(field, np.bool_(True))
    assert isinstance(encoded, bool)
    assert encoded

    encoded = codec.encode(field, np.bool_(False))
    assert not encoded
def test_scalar_codec_decimal():
    codec = ScalarCodec(DecimalType(4, 3))
    field = UnischemaField(name='field_decimal', numpy_dtype=Decimal, shape=(), codec=codec, nullable=False)

    value = Decimal('123.4567')
    assert codec.decode(field, codec.encode(field, value)) == value
def test_encode_scalar_int():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int', numpy_dtype=np.int32, shape=(), codec=codec, nullable=False)
    encoded = codec.encode(field, np.int32(42))
    assert isinstance(encoded, int)
    assert 42 == encoded
def test_numeric_types(spark_numpy_types):
    spark_type, numpy_type = spark_numpy_types

    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int', numpy_dtype=numpy_type, shape=(), codec=codec, nullable=False)

    min_val, max_val = np.iinfo(numpy_type).min, np.iinfo(numpy_type).max
    assert codec.decode(field, codec.encode(field, numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field, numpy_type(max_val))) == max_val
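# `spark_numpy_types` is likewise a pytest fixture that is not part of this snippet. A plausible
# definition, pairing each Spark integral type with the matching NumPy dtype, might look like the
# sketch below; the exact (spark_type, numpy_type) pairs are an assumption.
import numpy as np
import pytest
from pyspark.sql.types import IntegerType, LongType, ShortType


@pytest.fixture(params=[(ShortType, np.int16), (IntegerType, np.int32), (LongType, np.int64)])
def spark_numpy_types(request):
    # Yields (spark_type_class, numpy_dtype) tuples consumed by test_numeric_types.
    return request.param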
def test_fields(self):
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    self.assertEqual(len(TestSchema.fields), 2)
    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
def test_fields():
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert len(TestSchema.fields) == 2
    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema (np.str_ used instead of the removed np.str alias)
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32, (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(
        remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train, schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(
        remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
def test_as_spark_schema(self):
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    spark_schema = TestSchema.as_spark_schema()
    self.assertEqual(spark_schema.fields[0].name, 'int_field')
    self.assertEqual(spark_schema.fields[1].name, 'string_field')

    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
def from_arrow_schema(cls, parquet_dataset):
    """
    Convert an apache arrow schema into a unischema object. This is useful for datasets of only
    scalars which need no special encoding/decoding. If there is an unsupported type in the arrow
    schema, it will throw an exception.

    :param parquet_dataset: A :class:`pyarrow.parquet.ParquetDataset` whose arrow schema is inspected.
    :return: A :class:`Unischema` object.
    """
    meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
    arrow_schema = meta.schema.to_arrow_schema()

    unischema_fields = []
    # Partition columns are not part of the arrow schema; add them explicitly as string scalars.
    for partition_name in parquet_dataset.partitions.partition_names:
        unischema_fields.append(
            UnischemaField(partition_name, np.str_, (), ScalarCodec(StringType()), False))

    for column_name in arrow_schema.names:
        arrow_field = arrow_schema.field_by_name(column_name)
        field_type = arrow_field.type
        codec, np_type = _numpy_and_codec_from_arrow_type(field_type)
        unischema_fields.append(
            UnischemaField(column_name, np_type, (), codec, arrow_field.nullable))

    return Unischema('inferred_schema', unischema_fields)
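# A short usage sketch for the classmethod above, assuming it is exposed as
# Unischema.from_arrow_schema (as in petastorm). The dataset path is a hypothetical placeholder.
import pyarrow.parquet as pq
from petastorm.unischema import Unischema

dataset = pq.ParquetDataset('/tmp/example_parquet_dataset')  # hypothetical path
inferred_schema = Unischema.from_arrow_schema(dataset)
print(inferred_schema)  # prints the inferred UnischemaField entries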