Example No. 1
    def astype(self, dtype):
        """
        Cast a Koalas object to a specified dtype ``dtype``.

        Parameters
        ----------
        dtype : data type
            Use a numpy.dtype or Python type to cast entire pandas object to
            the same type.

        Returns
        -------
        casted : same type as caller

        See Also
        --------
        to_datetime : Convert argument to datetime.

        Examples
        --------
        >>> ser = ks.Series([1, 2], dtype='int32')
        >>> ser
        0    1
        1    2
        Name: 0, dtype: int32

        >>> ser.astype('int64')
        0    1
        1    2
        Name: 0, dtype: int64

        >>> ser.rename("a").to_frame().set_index("a").index.astype('int64')
        Int64Index([1, 2], dtype='int64', name='a')
        """
        from databricks.koalas.typedef import as_spark_type
        spark_type = as_spark_type(dtype)
        if not spark_type:
            raise ValueError("Type {} not understood".format(dtype))
        return self._with_new_scol(self._scol.cast(spark_type))
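
For context, here is a minimal usage sketch of the method above (assuming a running Spark session and that `databricks.koalas` is importable as `ks`). It mirrors the doctest and also shows that plain Python types are accepted, since `as_spark_type` maps them to Spark types:

import databricks.koalas as ks

# Minimal usage sketch: cast an int32 Series up to int64, as in the doctest above.
ser = ks.Series([1, 2], dtype='int32')
print(ser.astype('int64').dtype)  # int64

# Python built-in types work as well, because as_spark_type understands them
# (float maps to Spark's DoubleType, i.e. pandas float64).
print(ser.astype(float).dtype)    # float64

# A dtype that as_spark_type cannot resolve raises ValueError("Type ... not understood").
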
Example No. 2
    def astype(self, dtype) -> 'Series':
        """
        Cast a Koalas object to a specified dtype ``dtype``.

        Parameters
        ----------
        dtype : data type
            Use a numpy.dtype or Python type to cast entire pandas object to
            the same type.

        Returns
        -------
        casted : same type as caller

        See Also
        --------
        to_datetime : Convert argument to datetime.

        Examples
        --------
        >>> ser = ks.Series([1, 2], dtype='int32')
        >>> ser
        0    1
        1    2
        Name: 0, dtype: int32

        >>> ser.astype('int64')
        0    1
        1    2
        Name: 0, dtype: int64
        """
        from databricks.koalas.typedef import as_spark_type
        spark_type = as_spark_type(dtype)
        if not spark_type:
            raise ValueError("Type {} not understood".format(dtype))
        return Series(self._scol.cast(spark_type),
                      anchor=self._kdf,
                      index=self._index_map)
Example No. 3
    def __init__(
        self,
        spark_frame: spark.DataFrame,
        index_map: Optional[Dict[str, Optional[Tuple]]],
        column_labels: Optional[List[Tuple]] = None,
        data_spark_columns: Optional[List[spark.Column]] = None,
        column_label_names: Optional[List[Optional[Tuple[str, ...]]]] = None,
    ) -> None:
        """
        Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
        index fields and names.

        :param spark_frame: Spark DataFrame to be managed.
        :param index_map: dictionary of string pairs
                           Each entry maps an index field name that exists among the Spark
                           fields to the index name.
        :param column_labels: list of tuples with the same length
                              The multi-level values in the tuples.
        :param data_spark_columns: list of Spark Column
                                   Spark Columns to appear as columns. If this is None, they are
                                   calculated from spark_frame.
        :param column_label_names: Names for each of the column index levels.

        See the examples below to understand what each parameter means.

        >>> column_labels = pd.MultiIndex.from_tuples(
        ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
        >>> row_index = pd.MultiIndex.from_tuples(
        ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
        ...     names=["row_index_a", "row_index_b"])
        >>> kdf = ks.DataFrame(
        ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
        >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
        >>> kdf  # doctest: +NORMALIZE_WHITESPACE
        column_labels_a                  a  b
        column_labels_b                  y  z
        row_index_a row_index_b (a, x)
        foo         bar         1       2  3
                                4       5  6
        zoo         bar         7       8  9

        >>> internal = kdf._internal

        >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
        +-----------------+-----------------+------+------+------+...
        |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
        +-----------------+-----------------+------+------+------+...
        |              foo|              bar|     1|     2|     3|...
        |              foo|              bar|     4|     5|     6|...
        |              zoo|              bar|     7|     8|     9|...
        +-----------------+-----------------+------+------+------+...

        >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
        OrderedDict([('__index_level_0__', ('row_index_a',)),
         ('__index_level_1__', ('row_index_b',)),
         ('(a, x)', ('a', 'x'))])

        >>> internal._column_labels
        [('a', 'y'), ('b', 'z')]

        >>> internal._data_spark_columns
        [Column<b'(a, y)'>, Column<b'(b, z)'>]

        >>> internal._column_label_names
        [('column_labels_a',), ('column_labels_b',)]
        """

        assert isinstance(spark_frame, spark.DataFrame)
        assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

        if index_map is None:
            assert not any(
                SPARK_INDEX_NAME_PATTERN.match(name)
                for name in spark_frame.columns
            ), ("Index columns should not appear in columns of the Spark DataFrame. Avoid "
                "index column names [%s]." % SPARK_INDEX_NAME_PATTERN)

            if data_spark_columns is not None:
                spark_frame = spark_frame.select(data_spark_columns)

            # Create default index.
            spark_frame = InternalFrame.attach_default_index(spark_frame)
            index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

            if data_spark_columns is not None:
                data_spark_columns = [
                    scol_for(spark_frame, col) for col in spark_frame.columns
                    if col != SPARK_DEFAULT_INDEX_NAME
                ]

        if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
            spark_frame = spark_frame.withColumn(
                NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id())

        assert isinstance(index_map, OrderedDict), index_map
        assert all(
            isinstance(index_field, str) and (
                index_name is None or (isinstance(index_name, tuple) and all(
                    name is None or as_spark_type(type(name)) is not None
                    for name in index_name)))
            for index_field, index_name in index_map.items()), index_map
        assert data_spark_columns is None or all(
            isinstance(scol, spark.Column) for scol in data_spark_columns)

        self._sdf = spark_frame  # type: spark.DataFrame
        self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

        if data_spark_columns is None:
            index_columns = set(index_column
                                for index_column in self._index_map)
            self._data_spark_columns = [
                scol_for(spark_frame, col) for col in spark_frame.columns
                if col not in index_columns and col not in HIDDEN_COLUMNS
            ]
        else:
            self._data_spark_columns = data_spark_columns

        if column_labels is None:
            self._column_labels = [
                (col, )
                for col in spark_frame.select(self._data_spark_columns).columns
            ]  # type: List[Tuple]
        else:
            assert len(column_labels) == len(self._data_spark_columns), (
                len(column_labels),
                len(self._data_spark_columns),
            )
            if len(column_labels) == 1:
                column_label = column_labels[0]
                assert column_label is None or (isinstance(
                    column_label, tuple) and len(column_label) > 0 and all(
                        label is None or as_spark_type(type(label)) is not None
                        for label in column_label)), column_label
            else:
                assert all(
                    isinstance(column_label, tuple) and len(column_label) > 0
                    and all(
                        label is None or as_spark_type(type(label)) is not None
                        for label in column_label)
                    for column_label in column_labels), column_labels
                assert len(set(len(label)
                               for label in column_labels)) <= 1, column_labels
            self._column_labels = column_labels

        if column_label_names is None:
            self._column_label_names = [None] * column_labels_level(
                self._column_labels)  # type: List[Optional[Tuple[str, ...]]]
        else:
            if len(self._column_labels) > 0:
                assert len(column_label_names) == column_labels_level(
                    self._column_labels), (
                        len(column_label_names),
                        column_labels_level(self._column_labels),
                    )
            else:
                assert len(column_label_names) > 0, len(column_label_names)
            assert all(column_label_name is None or (
                isinstance(column_label_name, tuple) and all(
                    name is None or as_spark_type(type(name)) is not None
                    for name in column_label_name)) for column_label_name in
                       column_label_names), column_label_names
            self._column_label_names = column_label_names
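
As a smaller companion to the docstring above, the following sketch (assuming a running Spark session) builds a plain Koalas DataFrame and inspects the same private attributes. The InternalFrame is constructed for us by the DataFrame, and a default `__index_level_0__` index column is attached as described in the constructor:

import databricks.koalas as ks

# Minimal inspection sketch; _internal exposes the InternalFrame managed by the DataFrame.
kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
internal = kdf._internal

internal._sdf.show()                 # managed Spark DataFrame, including the attached index
                                     # and natural-order columns
print(internal._index_map)           # e.g. OrderedDict([('__index_level_0__', None)])
print(internal._column_labels)       # [('a',), ('b',)]
print(internal._column_label_names)  # [None]
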
Example No. 4
    def astype(self, dtype):
        # Resolve ``dtype`` to a Spark type and cast the underlying Spark column.
        from databricks.koalas.typedef import as_spark_type
        spark_type = as_spark_type(dtype)
        if not spark_type:
            raise ValueError("Type {} not understood".format(dtype))
        return Series(self._scol.cast(spark_type), self._kdf, self._index_info)
Example No. 5
    def test_as_spark_type(self):
        type_mapper = {
            # binary
            np.character: BinaryType(),
            np.bytes_: BinaryType(),
            np.string_: BinaryType(),
            bytes: BinaryType(),
            # integer
            np.int8: ByteType(),
            np.byte: ByteType(),
            np.int16: ShortType(),
            np.int32: IntegerType(),
            np.int64: LongType(),
            np.int: LongType(),
            int: LongType(),
            # floating
            np.float32: FloatType(),
            np.float: DoubleType(),
            np.float64: DoubleType(),
            float: DoubleType(),
            # string
            np.str: StringType(),
            np.unicode_: StringType(),
            str: StringType(),
            # bool
            np.bool: BooleanType(),
            bool: BooleanType(),
            # datetime
            np.datetime64: TimestampType(),
            datetime.datetime: TimestampType(),
            # DateType
            datetime.date: DateType(),
            # DecimalType
            decimal.Decimal: DecimalType(38, 18),
            # ArrayType
            np.ndarray: ArrayType(StringType()),
            List[bytes]: ArrayType(BinaryType()),
            List[np.character]: ArrayType(BinaryType()),
            List[np.bytes_]: ArrayType(BinaryType()),
            List[np.string_]: ArrayType(BinaryType()),
            List[bool]: ArrayType(BooleanType()),
            List[np.bool]: ArrayType(BooleanType()),
            List[datetime.date]: ArrayType(DateType()),
            List[np.int8]: ArrayType(ByteType()),
            List[np.byte]: ArrayType(ByteType()),
            List[decimal.Decimal]: ArrayType(DecimalType(38, 18)),
            List[float]: ArrayType(DoubleType()),
            List[np.float]: ArrayType(DoubleType()),
            List[np.float64]: ArrayType(DoubleType()),
            List[np.float32]: ArrayType(FloatType()),
            List[np.int32]: ArrayType(IntegerType()),
            List[int]: ArrayType(LongType()),
            List[np.int]: ArrayType(LongType()),
            List[np.int64]: ArrayType(LongType()),
            List[np.int16]: ArrayType(ShortType()),
            List[str]: ArrayType(StringType()),
            List[np.unicode_]: ArrayType(StringType()),
            List[datetime.datetime]: ArrayType(TimestampType()),
            List[np.datetime64]: ArrayType(TimestampType()),
        }

        for numpy_or_python_type, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
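
A few representative calls, grounded in the mapping this test asserts (assuming the same `databricks.koalas` version), show how numpy and Python types resolve to Spark SQL types:

import numpy as np
from typing import List
from databricks.koalas.typedef import as_spark_type

# Scalar numpy/Python types map to the corresponding Spark types.
print(as_spark_type(np.int32))   # IntegerType
print(as_spark_type(float))      # DoubleType
print(as_spark_type(str))        # StringType

# typing.List[...] maps to an ArrayType of the element's Spark type.
print(as_spark_type(List[int]))  # ArrayType(LongType,true)
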
Example No. 6
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(koalas_dtype(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            koalas_dtype(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            koalas_dtype(np.dtype("object"))