Example #1
0
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example #2
0
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
Example #3
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
Example #4
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if (types.is_int64(pyarrowType) or types.is_uint64(pyarrowType)):
        return 'bigint'
    if (types.is_binary(pyarrowType)):
        return 'binary'
    if (types.is_boolean(pyarrowType)):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if (types.is_decimal(pyarrowType)):
        return 'decimal(16,2)'
    if (types.is_float64(pyarrowType)):
        'return double'
    if (types.is_float16(pyarrowType) or types.is_float32(pyarrowType)):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if (types.is_map(pyarrowType)):
        return 'map'
    if (types.is_struct(pyarrowType)):
        return 'struct'
    if (types.is_timestamp(pyarrowType)):
        return 'timestamp'
    if (types.is_union(pyarrowType)):
        return 'union'
    return str(pyarrowType)
Example #6
0
    def arrow_to_pandas(self, arrow_column):
        import pyarrow.types as types

        if self._df_for_struct and types.is_struct(arrow_column.type):
            import pandas as pd
            series = [super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(column)
                      .rename(field.name)
                      for column, field in zip(arrow_column.flatten(), arrow_column.type)]
            s = pd.concat(series, axis=1)
        else:
            s = super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(arrow_column)
        return s
Example #7
0
def _from_arrow_type(dt: pa.DataType) -> pt.DataType:
    if is_struct(dt):
        return pt.StructType([
            pt.StructField(
                # field.name, _from_arrow_type(field.type), nullable=field.nullable
                field.name,
                _from_arrow_type(field.type),
                nullable=True,
            ) for field in dt
        ])
    elif is_list(dt):
        if is_timestamp(dt.value_type):
            raise TypeError(  # pragma: no cover
                "Spark: unsupported type in conversion from Arrow: " + str(dt))
        return pt.ArrayType(_from_arrow_type(dt.value_type))
    return from_arrow_type(dt)
Example #8
0
def _traverse(typ, counter):
    if isinstance(typ, Schema) or types.is_struct(typ):
        for field in typ:
            path = (field.name,)
            yield path, next(counter)
            for sub, c in _traverse(field.type, counter):
                yield path + sub, c
    elif _is_map(typ):
        yield from _traverse(typ.value_type, counter)
    elif types.is_list(typ):
        # Skip one index for list type, since this can never be selected
        # directly
        next(counter)
        yield from _traverse(typ.value_type, counter)
    elif types.is_union(typ):
        # Union types not supported, just skip the indexes
        for dtype in typ:
            next(counter)
            for sub_c in _traverse(dtype, counter):
                pass
Example #9
0
File: orc.py Project: rok/arrow
def _traverse(typ, counter):
    if isinstance(typ, Schema) or types.is_struct(typ):
        for field in typ:
            path = (field.name,)
            yield path, next(counter)
            for sub, c in _traverse(field.type, counter):
                yield path + sub, c
    elif _is_map(typ):
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_list(typ):
        # Skip one index for list type, since this can never be selected
        # directly
        next(counter)
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_union(typ):
        # Union types not supported, just skip the indexes
        for dtype in typ:
            next(counter)
            for sub_c in _traverse(dtype, counter):
                pass
Example #10
0
def _is_map(typ):
    return (types.is_list(typ) and
            types.is_struct(typ.value_type) and
            typ.value_type.num_fields == 2 and
            typ.value_type[0].name == 'key' and
            typ.value_type[1].name == 'value')
Example #11
0
File: orc.py Project: rok/arrow
def _is_map(typ):
    return (types.is_list(typ) and
            types.is_struct(typ.value_type) and
            typ.value_type.num_children == 2 and
            typ.value_type[0].name == 'key' and
            typ.value_type[1].name == 'value')