Ejemplo n.º 1
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
Ejemplo n.º 2
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
Ejemplo n.º 3
0
    def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
        """Test round-tripping from rt.Categorical to pyarrow.Array/pyarrow.Table and back."""
        orig_cat_shape = rt_cat.shape
        if have_nulls:
            # riptable's filtering/masking uses a valid mask (where False means null/NA).
            indices = np.arange(len(rt_cat))
            valid_mask = indices % 3 != 1
            rt_cat = rt_cat.filter(valid_mask)
            assert rt_cat.shape == orig_cat_shape

            # isfiltered() doesn't work as expected for Dictionary/IntEnum-mode Categorical as of riptable 1.1.0.
            filtered_element_count = (rt.isnan(rt_cat._fa) if rt_cat.category_mode in (rt.rt_enum.CategoryMode.Dictionary, rt.rt_enum.CategoryMode.IntEnum) else rt_cat.isfiltered()).sum()
            assert filtered_element_count == (len(rt_cat) - valid_mask.sum())

        result_pa_arr = rt_cat.to_arrow()

        # Verify the pyarrow array has the correct length, number of categories, etc.
        assert len(rt_cat) == len(result_pa_arr)
        assert pat.is_dictionary(result_pa_arr.type)
        assert len(result_pa_arr.dictionary) >= len(next(iter(rt_cat.category_dict.values()))), \
            "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."

        if have_nulls:
            assert valid_mask.sum() > 0
            assert (len(rt_cat) - valid_mask.sum()) == result_pa_arr.null_count

        # TEMP: Certain cases are marked as XFAIL here due to issues in Categorical.
        #         * Cannot create a pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode Categorical.
        #         * Filtering a Dictionary- or IntEnum-mode Categorical causes unused categories to be dropped,
        #           which is not the same behavior as for other Categorical modes.
        #         * MultiKey Categoricals can't be created with an explicit list of category arrays + an index array,
        #           like what is supported for other Categorical modes.
        if rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or (have_nulls and rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary):
            pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

        result_cat = rt.Categorical.from_arrow(result_pa_arr, zero_copy_only=False, writable=output_writable)

        # relaxed_cat_check <==> rt_cat.ordered, because if the categories are ordered, we expect them to be
        # in the same position after being roundtripped, so they should be mapped to the same integer before/after.
        # multi-key cats always seem to be ordered, even if ordered=False is specified when creating them.
        # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our encoding in
        #       pyarrow doesn't currenly preserve unused entries from the name <-> code mapping. Once that's fixed
        #       we should be able to use the stronger equality check.
        assert_array_or_cat_equal(rt_cat, result_cat, relaxed_cat_check=rt_cat.ordered or rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary)
Ejemplo n.º 4
0
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
Ejemplo n.º 5
0
def is_complex(arrow_type: DataType, /) -> bool:
    """Check if data type is complex."""
    return is_dictionary(arrow_type) or is_nested(arrow_type)
def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())
Ejemplo n.º 7
0
def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(),
                      pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())
Ejemplo n.º 8
0
def is_complex(arrow_type):
    return is_dictionary(arrow_type) or is_nested(arrow_type)
Ejemplo n.º 9
0
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())