Code example #1
    def test_insert_explicit_nulls(self):
        TestSchema = Unischema('TestSchema', [
            UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
            UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
        ])

        # We expect insert_explicit_nulls to leave the dictionary as is.
        row_dict = {'nullable': 0, 'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], 0)
        self.assertEqual(row_dict['not_nullable'], 1)

        # We expect insert_explicit_nulls to leave the dictionary as is.
        row_dict = {'nullable': None, 'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], None)
        self.assertEqual(row_dict['not_nullable'], 1)

        # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
        row_dict = {'not_nullable': 1}
        insert_explicit_nulls(TestSchema, row_dict)
        self.assertEqual(len(row_dict), 2)
        self.assertEqual(row_dict['nullable'], None)
        self.assertEqual(row_dict['not_nullable'], 1)

        # We are missing a not_nullable field here. Should raise a ValueError.
        row_dict = {'nullable': 0}
        with self.assertRaises(ValueError):
            insert_explicit_nulls(TestSchema, row_dict)
Code example #2
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark,
                             output_url,
                             SimpleSchema,
                             rowgroup_size_mb,
                             filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(lambda x: {'id': x, 'foo': x})\
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
Code example #3
    def get_petastorm_column(df_column):

        column_type = df_column.type
        column_name = df_column.name
        column_is_nullable = df_column.is_nullable
        column_array_dimensions = df_column.array_dimensions

        # Reference:
        # https://github.com/uber/petastorm/blob/master/petastorm/
        # tests/test_common.py

        petastorm_column = None
        if column_type == ColumnType.INTEGER:
            petastorm_column = UnischemaField(column_name, np.int32, (),
                                              ScalarCodec(IntegerType()),
                                              column_is_nullable)
        elif column_type == ColumnType.FLOAT:
            petastorm_column = UnischemaField(column_name, np.float64, (),
                                              ScalarCodec(FloatType()),
                                              column_is_nullable)
        elif column_type == ColumnType.TEXT:
            petastorm_column = UnischemaField(column_name, np.str_, (),
                                              ScalarCodec(StringType()),
                                              column_is_nullable)
        elif column_type == ColumnType.NDARRAY:
            petastorm_column = UnischemaField(column_name, np.uint8,
                                              column_array_dimensions,
                                              NdarrayCodec(),
                                              column_is_nullable)
        else:
            LoggingManager().log("Invalid column type: " + str(column_type),
                                 LoggingLevel.ERROR)

        return petastorm_column
Code example #4
def test_encode_scalar_float():
    codec = ScalarCodec(FloatType())
    expected = np.random.random(()).astype(np.float64)
    field = UnischemaField(name='field_float', numpy_dtype=np.float32, shape=(), codec=codec, nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, float)
    assert expected == encoded
Code example #5
def test_create_schema_view_using_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view([TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
Code example #6
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_,
                       (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8,
                       (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32,
                       (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal,
                       (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc',
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None,
                               int32_scalar=10,
                               uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))
Code example #7
def test_create_schema_view_no_field_matches_regex():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['bogus'])
    assert not view.fields
Code example #8
def test_insert_explicit_nulls():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('nullable', np.int32, (), ScalarCodec(StringType()), True),
        UnischemaField('not_nullable', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    # We expect insert_explicit_nulls to leave the dictionary as is.
    row_dict = {'nullable': 0, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] == 0
    assert row_dict['not_nullable'] == 1

    # We expect insert_explicit_nulls to leave the dictionary as is.
    row_dict = {'nullable': None, 'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a nullable field here. insert_explicit_nulls should add a None entry.
    row_dict = {'not_nullable': 1}
    insert_explicit_nulls(TestSchema, row_dict)
    assert len(row_dict) == 2
    assert row_dict['nullable'] is None
    assert row_dict['not_nullable'] == 1

    # We are missing a not_nullable field here. Should raise a ValueError.
    row_dict = {'nullable': 0}
    with pytest.raises(ValueError):
        insert_explicit_nulls(TestSchema, row_dict)
Code example #9
def test_encode_scalar_string():
    codec = ScalarCodec(StringType())
    expected = 'surprise'
    field = UnischemaField(name='field_string', numpy_dtype=np.unicode_, shape=(), codec=codec, nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, str)
    assert expected == encoded
Code example #10
def test_predicate_on_dataset(tmpdir):
  TestSchema = Unischema('TestSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
  ])

  def test_row_generator(x):
    """Returns a single entry in the generated dataset."""
    return {'id': x,
            'test_field': x * x}

  blocklet_size_mb = 256
  dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

  spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
  sc = spark.sparkContext

  rows_count = 10
  with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
    rows_rdd = sc.parallelize(range(rows_count)) \
      .map(test_row_generator) \
      .map(lambda x: dict_to_spark_row(TestSchema, x))

    spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
      .write \
      .save(path=dataset_url, format='carbon')

  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
    assert next(reader).id == 3
  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
    with pytest.raises(StopIteration):
      # Predicate should have selected none, so a StopIteration should be raised.
      next(reader)
Code example #11
    def test_create_schema_view_using_regex_and_unischema_fields(self):
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
            UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])
        view = TestSchema.create_schema_view(['int.*$', TestSchema.string_field])
        self.assertEqual(set(view.fields.keys()), {'int_field', 'string_field'})
Code example #12
def test_create_schema_view_using_invalid_type():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='must be either a string'):
        TestSchema.create_schema_view([42])
Code example #13
def test_create_schema_view_fails_validate():
    """ Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
Code example #14
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
Code example #15
def test_create_schema_view_using_regex_and_unischema_fields_with_duplicates():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])
    view = TestSchema.create_schema_view(['int.*$', TestSchema.int_field])
    assert set(view.fields.keys()) == {'int_field'}
Code example #16
    def test_create_schema_view_fails_validate(self):
        """Exercises the unischema.create_schema_view ValueError code path, and unischema.__str__."""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])
        with self.assertRaises(ValueError) as ex:
            TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
        self.assertIn('does not belong to the schema', str(ex.exception))
Code example #17
def test_bad_unischema_field_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(1, ),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='must be an empty tuple'):
        codec.encode(field, np.int32(1))
Code example #18
def test_bad_encoded_data_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError):
        codec.decode(field, codec.encode(field, np.asarray([10, 10])))
Code example #19
def test_encode_non_scalar_type_is_passed(non_scalar_value):
    codec = ScalarCodec(FloatType())
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError, match='Expected a scalar'):
        codec.encode(field, non_scalar_value)
Code example #20
    def test_create_schema_view_using_invalid_type(self):
        """Exercises the unischema.create_schema_view ValueError code path, and unischema.__str__."""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])
        with self.assertRaises(ValueError) as ex:
            TestSchema.create_schema_view([42])
        self.assertIn('must be either a string', str(ex.exception))
Code example #21
def test_encode_scalar_bool():
    codec = ScalarCodec(BooleanType())
    field = UnischemaField(name='field_bool', numpy_dtype=np.bool_, shape=(), codec=codec, nullable=False)

    encoded = codec.encode(field, np.bool_(True))
    assert isinstance(encoded, bool)
    assert encoded

    encoded = codec.encode(field, np.bool_(False))
    assert not encoded
Code example #22
def test_scalar_codec_decimal():
    codec = ScalarCodec(DecimalType(4, 3))
    field = UnischemaField(name='field_decimal',
                           numpy_dtype=Decimal,
                           shape=(),
                           codec=codec,
                           nullable=False)

    value = Decimal('123.4567')
    assert codec.decode(field, codec.encode(field, value)) == value
Code example #23
def test_encode_scalar_int():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, np.int32(42))
    assert isinstance(encoded, int)
    assert 42 == encoded
Code example #24
File: test_codec_scalar.py  Project: wxrui/petastorm
def test_numeric_types(spark_numpy_types):
    spark_type, numpy_type = spark_numpy_types

    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int', numpy_dtype=numpy_type, shape=(), codec=codec, nullable=False)

    min_val, max_val = np.iinfo(numpy_type).min, np.iinfo(numpy_type).max

    assert codec.decode(field, codec.encode(field, numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field, numpy_type(max_val))) == max_val
Code example #25
    def test_fields(self):
        """Try using 'fields' getter"""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])

        self.assertEqual(len(TestSchema.fields), 2)
        self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
        self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
Code example #26
def test_fields():
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert len(TestSchema.fields) == 2
    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Code example #27
def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32,
                       (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(
        remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train,
                                       schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(
        remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
Code example #28
    def test_as_spark_schema(self):
        """Try using 'as_spark_schema' function"""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
            UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        ])

        spark_schema = TestSchema.as_spark_schema()
        self.assertEqual(spark_schema.fields[0].name, 'int_field')
        self.assertEqual(spark_schema.fields[1].name, 'string_field')

        self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
        self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
Code example #29
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(), shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
Code example #30
    def from_arrow_schema(cls, parquet_dataset):
        """
        Convert an Apache Arrow schema into a :class:`Unischema` object. This is useful for datasets of only
        scalars, which need no special encoding/decoding. If there is an unsupported type in the Arrow schema,
        an exception is raised.

        :param parquet_dataset: :class:`pyarrow.parquet.ParquetDataset` whose Arrow schema and partition
            columns are used to infer the fields.
        :return: A :class:`Unischema` object.
        """
        meta = parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        unischema_fields = []

        for partition_name in parquet_dataset.partitions.partition_names:
            unischema_fields.append(
                UnischemaField(partition_name, np.str_, (),
                               ScalarCodec(StringType()), False))

        for column_name in arrow_schema.names:
            arrow_field = arrow_schema.field_by_name(column_name)
            field_type = arrow_field.type
            codec, np_type = _numpy_and_codec_from_arrow_type(field_type)

            unischema_fields.append(
                UnischemaField(column_name, np_type, (), codec,
                               arrow_field.nullable))
        return Unischema('inferred_schema', unischema_fields)
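For context, a brief usage sketch of the classmethod above (the dataset path is hypothetical and purely illustrative; Unischema.from_arrow_schema is the petastorm API shown here):

# Minimal usage sketch, assuming a Parquet dataset already exists at the (hypothetical) path below.
import pyarrow.parquet as pq
from petastorm.unischema import Unischema

dataset = pq.ParquetDataset('/tmp/example_dataset')     # hypothetical dataset location
inferred_schema = Unischema.from_arrow_schema(dataset)  # fields inferred from the Arrow schema and partitions
print(inferred_schema)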