Example #1
    def _apply_as_series_or_frame(self, func):
        """
        Wraps a function that handles a Spark column in order
        to support it in both Koalas Series and DataFrame.

        Note that the given `func` name should be the same as the API's method name.
        """
        from databricks.koalas import DataFrame
        from databricks.koalas.series import _col
        from databricks.koalas.groupby import SeriesGroupBy

        kdf = self.kdf
        sdf = self.kdf._sdf

        # Here we need to include grouped key as an index, and shift previous index.
        #   [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
        new_index_scols = []
        new_index_map = []
        for groupkey in self._groupkeys:
            new_index_scols.append(
                # NOTE THAT this code intentionally uses `F.col` instead of `scol` in
                # the given series. This is because, in case of a series, we convert it
                # into a DataFrame. So, if the given `groupkeys` is a series, they end
                # up being a different series.
                F.col(name_like_string(groupkey.name)).alias(
                    SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
                                  groupkey._internal.column_index[0]))

        for new_index_scol, index_map in zip(kdf._internal.index_scols,
                                             kdf._internal.index_map):
            new_index_scols.append(
                new_index_scol.alias(
                    SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            _, name = index_map
            new_index_map.append(
                (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

        applied = []
        for column in kdf.columns:
            applied.append(kdf[column]._with_new_scol(func(
                kdf[column]._scol)).rename(kdf[column].name))

        # Seems like pandas filters out when grouped key is NA.
        cond = self._groupkeys[0]._scol.isNotNull()
        for c in self._groupkeys[1:]:
            cond = cond & c._scol.isNotNull()
        sdf = sdf.select(new_index_scols + [c._scol
                                            for c in applied]).filter(cond)

        internal = _InternalFrame(
            sdf=sdf,
            data_columns=[c._internal.data_columns[0] for c in applied],
            index_map=new_index_map)

        ret = DataFrame(internal)
        if isinstance(self._groupby, SeriesGroupBy):
            return _col(ret)
        else:
            return ret
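
For context, a grouped rolling/expanding API would typically pass a Spark-column
function into this wrapper. A minimal, hypothetical caller is sketched below
(`self._window` is an assumed attribute holding a pyspark.sql.Window spec; it is
not part of the snippet above, and `F` is pyspark.sql.functions as used there):

    def count(self):
        # Hypothetical sketch: the inner function's name matches the API name,
        # as required by the docstring of _apply_as_series_or_frame.
        def count(scol):
            return F.count(scol).over(self._window)

        return self._apply_as_series_or_frame(count)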
Example #2
    def test_from_pandas(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_map,
                       OrderedDict({SPARK_DEFAULT_INDEX_NAME: None}))
        self.assert_eq(internal.column_labels, [("a", ), ("b", )])
        self.assert_eq(internal.data_spark_column_names, ["a", "b"])
        self.assertTrue(
            internal.spark_column_for(("a", ))._jc.equals(sdf["a"]._jc))
        self.assertTrue(
            internal.spark_column_for(("b", ))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index
        pdf.set_index("a", append=True, inplace=True)

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_map,
            OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                         (SPARK_INDEX_NAME_FORMAT(1), ("a", ))]),
        )
        self.assert_eq(internal.column_labels, [("b", )])
        self.assert_eq(internal.data_spark_column_names, ["b"])
        self.assertTrue(
            internal.spark_column_for(("b", ))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_map,
            OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                         (SPARK_INDEX_NAME_FORMAT(1), ("a", ))]),
        )
        self.assert_eq(internal.column_labels, [("x", "b")])
        self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
        self.assertTrue(
            internal.spark_column_for(
                ("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)
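
The assertions above rely on the Spark index-column naming helpers from
databricks.koalas.internal. A minimal sketch of what they produce (illustrative;
treat the exact definitions as an assumption):

    # Sketch of the naming helpers referenced by the assertions above.
    SPARK_INDEX_NAME_FORMAT = "__index_level_{}__".format
    SPARK_DEFAULT_INDEX_NAME = SPARK_INDEX_NAME_FORMAT(0)  # "__index_level_0__"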
Example #3
    def intersection(self, other) -> "MultiIndex":
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : MultiIndex

        Examples
        --------
        >>> midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> midx2 = ks.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        if isinstance(other, Series) or not is_list_like(other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        elif isinstance(other, DataFrame):
            raise ValueError("Index data must be 1-dimensional")
        elif isinstance(other, MultiIndex):
            spark_frame_other = other.to_frame().to_spark()
            keep_name = self.names == other.names
        elif isinstance(other, Index):
            # Always returns an empty MultiIndex if `other` is Index.
            return self.to_frame().head(0).index  # type: ignore
        elif not all(isinstance(item, tuple) for item in other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        else:
            other = MultiIndex.from_tuples(list(other))
            spark_frame_other = other.to_frame().to_spark()
            keep_name = True

        default_name = [
            SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)
        ]
        spark_frame_self = self.to_frame(name=default_name).to_spark()
        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
        if keep_name:
            index_names = self._internal.index_names
        else:
            index_names = None
        internal = InternalFrame(  # TODO: dtypes?
            spark_frame=spark_frame_intersected,
            index_spark_columns=[
                scol_for(spark_frame_intersected, col) for col in default_name
            ],
            index_names=index_names,
        )
        return cast(MultiIndex, DataFrame(internal).index)
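
As a hedged usage sketch, the list-of-tuples branch above can be exercised
directly; plain tuples are converted through MultiIndex.from_tuples before the
Spark-level intersect (expected output inferred from the doctest above):

    midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    midx1.intersection([("c", "z"), ("d", "w")]).sort_values()
    # MultiIndex([('c', 'z')],
    #            )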
Example #4
    def test_from_pandas(self):
        pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.sdf

        self.assert_eq(internal.index_map, [(SPARK_INDEX_NAME_FORMAT(0), None)])
        self.assert_eq(internal.column_index, [('a', ), ('b', )])
        self.assert_eq(internal.data_columns, ['a', 'b'])
        self.assertTrue(internal.scol_for(('a',))._jc.equals(sdf['a']._jc))
        self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc))

        self.assert_eq(internal.pandas_df, pdf)

        # multi-index
        pdf.set_index('a', append=True, inplace=True)

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.sdf

        self.assert_eq(internal.index_map, [(SPARK_INDEX_NAME_FORMAT(0), None), ('a', ('a',))])
        self.assert_eq(internal.column_index, [('b', )])
        self.assert_eq(internal.data_columns, ['b'])
        self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc))

        self.assert_eq(internal.pandas_df, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([('x', 'b')])

        internal = _InternalFrame.from_pandas(pdf)
        sdf = internal.sdf

        self.assert_eq(internal.index_map, [(SPARK_INDEX_NAME_FORMAT(0), None), ('a', ('a',))])
        self.assert_eq(internal.column_index, [('x', 'b')])
        self.assert_eq(internal.data_columns, ['(x, b)'])
        self.assertTrue(internal.scol_for(('x', 'b'))._jc.equals(sdf['(x, b)']._jc))

        self.assert_eq(internal.pandas_df, pdf)
Example #5
def combine_frames(this, *args, how="full", preserve_order_column=False):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to
    distinguish the column names from both DataFrames.

    It internally performs a join operation, which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from databricks.koalas.config import get_option
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.internal import (
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from databricks.koalas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(
            this, args[0]), "We don't need to combine. All series is in this."
        that = args[0]._kdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this,
            args[0]), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal, side):
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select([
                scol_for(sdf, col).alias(rename(col))
                for col in sdf.columns if col not in HIDDEN_COLUMNS
            ] + list(HIDDEN_COLUMNS))
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col))
                    for col in internal.index_spark_column_names
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col))
                    for col in internal.data_spark_column_names
                ],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(this_internal.index_spark_column_names,
                this_internal.index_names))
        that_index_map = list(
            zip(that_internal.index_spark_column_names,
                that_internal.index_names))
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        for i, ((this_column, this_name),
                (that_column,
                 that_name)) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(),
                           this_scol).otherwise(that_scol).alias(column_name))
            else:
                raise ValueError(
                    "Index names must be exactly matched currently.")

        assert len(
            join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(merged_index_scols + [
            scol_for(this_sdf, this_internal.spark_column_name_for(label))
            for label in this_internal.column_labels
        ] + [
            scol_for(that_sdf, that_internal.spark_column_name_for(label))
            for label in that_internal.column_labels
        ] + order_column)

        index_columns = set(index_column_names)
        new_data_columns = [
            col for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]
        level = max(this_internal.column_labels_level,
                    that_internal.column_labels_level)

        def fill_label(label):
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label))
            for label in this_internal.column_labels
        ] + [
            tuple(["that"] + fill_label(label))
            for label in that_internal.column_labels
        ]
        column_label_names = ([None] *
                              (1 + level - this_internal.column_labels_level)
                              ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=[
                    scol_for(joined_df, col) for col in index_column_names
                ],
                index_names=this_internal.index_names,
                column_labels=column_labels,
                data_spark_columns=[
                    scol_for(joined_df, col) for col in new_data_columns
                ],
                column_label_names=column_label_names,
            ))
    else:
        raise ValueError(
            "Cannot combine the series or dataframe because it comes from a different dataframe. "
            "In order to allow this operation, enable 'compute.ops_on_diff_frames' option."
        )
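
As a usage sketch, the option gate at the end is what users hit when mixing two
different Koalas DataFrames; something like the following (hypothetical snippet,
using the koalas option_context helper) is expected to go through this
combination path:

    import databricks.koalas as ks

    kdf1 = ks.DataFrame({"a": [1, 2, 3]})
    kdf2 = ks.DataFrame({"b": [4, 5, 6]})

    # Without the option, the ValueError above is raised; with it, the frames
    # are joined on their (matching) default indexes.
    with ks.option_context("compute.ops_on_diff_frames", True):
        combined = kdf1["a"] + kdf2["b"]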
Example #6
    def test_from_pandas(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [("a",), ("b",)])
        self.assert_eq(internal.data_spark_column_names, ["a", "b"])
        self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # non-string column name
        pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf1)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [(0,), (1,)])
        self.assert_eq(internal.data_spark_column_names, ["0", "1"])
        self.assertTrue(internal.spark_column_for((0,))._jc.equals(sdf["0"]._jc))
        self.assertTrue(internal.spark_column_for((1,))._jc.equals(sdf["1"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf1)

        # multi-index
        pdf.set_index("a", append=True, inplace=True)

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("b",)])
        self.assert_eq(internal.data_spark_column_names, ["b"])
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("x", "b")])
        self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
        self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)
Example #7
    def attach_id_column(self, id_type: str,
                         column: Union[str, Tuple[str, ...]]) -> "DataFrame":
        """
        Attach a column to be used as identifier of rows similar to the default index.

        See also `Default Index type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        id_type : string
            The id type.

            - 'sequence' : a sequence that increases one by one.

              .. note:: this uses Spark's Window without specifying a partition specification.
                  This moves all data into a single partition on a single machine and
                  could cause serious performance degradation.
                  Avoid this method with very large datasets.

            - 'distributed-sequence' : a sequence that increases one by one,
              by group-by and group-map approach in a distributed manner.
            - 'distributed' : a monotonically increasing sequence simply by using PySpark’s
              monotonically_increasing_id function in a fully distributed manner.

        column : string or tuple of string
            The column name.

        Returns
        -------
        DataFrame
            The DataFrame with the attached column.

        Examples
        --------
        >>> df = ks.DataFrame({"x": ['a', 'b', 'c']})
        >>> df.koalas.attach_id_column(id_type="sequence", column="id")
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.koalas.attach_id_column(id_type="distributed-sequence", column="id").sort_index()
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.koalas.attach_id_column(id_type="distributed", column="id")
        ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
           x   id
        0  a  ...
        1  b  ...
        2  c  ...

        For multi-index columns:

        >>> df = ks.DataFrame({("x", "y"): ['a', 'b', 'c']})
        >>> df.koalas.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
           x id-x
           y id-y
        0  a    0
        1  b    1
        2  c    2
        """
        from databricks.koalas.frame import DataFrame

        if id_type == "sequence":
            attach_func = InternalFrame.attach_sequence_column
        elif id_type == "distributed-sequence":
            attach_func = InternalFrame.attach_distributed_sequence_column
        elif id_type == "distributed":
            attach_func = InternalFrame.attach_distributed_column
        else:
            raise ValueError(
                "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
            )

        if isinstance(column, str):
            column = (column, )
        else:
            assert isinstance(column, tuple), type(column)

        internal = self._kdf._internal

        if len(column) != internal.column_labels_level:
            raise ValueError(
                "The given column `{}` must be the same length as the existing columns."
                .format(column))
        elif column in internal.column_labels:
            raise ValueError("The given column `{}` already exists.".format(
                name_like_string(column)))

        # Make sure the underlying Spark column names are the form of
        # `name_like_string(column_label)`.
        sdf = internal.spark_frame.select([
            scol.alias(SPARK_INDEX_NAME_FORMAT(i))
            for i, scol in enumerate(internal.index_spark_columns)
        ] + [
            scol.alias(name_like_string(label)) for scol, label in zip(
                internal.data_spark_columns, internal.column_labels)
        ])
        sdf = attach_func(sdf, name_like_string(column))

        return DataFrame(
            InternalFrame(
                spark_frame=sdf,
                index_map=OrderedDict([
                    (SPARK_INDEX_NAME_FORMAT(i), name)
                    for i, name in enumerate(internal.index_names)
                ]),
                column_labels=internal.column_labels + [column],
                data_spark_columns=([
                    scol_for(sdf, name_like_string(label))
                    for label in internal.column_labels
                ] + [scol_for(sdf, name_like_string(column))]),
                column_label_names=internal.column_label_names,
            ).resolved_copy)
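
For reference, a rough sketch of what the 'distributed' attach function does
(assumption: it appends PySpark's monotonically_increasing_id; the real helper
is a static method on InternalFrame):

    from pyspark.sql import DataFrame as SparkDataFrame, functions as F

    def attach_distributed_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
        # Monotonically increasing, but not consecutive, ids computed per partition.
        return sdf.withColumn(column_name, F.monotonically_increasing_id())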
Example #8
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        For Series

        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**

        With `dropna` set to `False`, we can also see NaN index values.

        >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64

        For Index

        >>> from databricks.koalas.indexes import Index
        >>> idx = Index([3, 1, 2, 3, 4, np.nan])
        >>> idx
        Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

        >>> idx.value_counts().sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        Name: count, dtype: int64

        **sort**

        With `sort` set to `False`, the result is not sorted by count.

        >>> idx.value_counts(sort=False).sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        Name: count, dtype: int64

        **normalize**

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> idx.value_counts(normalize=True).sort_index()
        1.0    0.2
        2.0    0.2
        3.0    0.4
        4.0    0.2
        Name: count, dtype: float64

        **dropna**

        With `dropna` set to `False`, we can also see NaN index values.

        >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        NaN    1
        Name: count, dtype: int64

        For MultiIndex.

        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                       ['speed', 'weight', 'length']],
        ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
        >>> s.index  # doctest: +SKIP
        MultiIndex([(  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'length'),
                    ('falcon', 'weight'),
                    ('falcon', 'length'),
                    ('falcon', 'length')],
                   )

        >>> s.index.value_counts().sort_index()
        (cow, length)       1
        (cow, weight)       2
        (falcon, length)    2
        (falcon, weight)    1
        (lama, weight)      3
        Name: count, dtype: int64

        >>> s.index.value_counts(normalize=True).sort_index()
        (cow, length)       0.111111
        (cow, weight)       0.222222
        (falcon, length)    0.222222
        (falcon, weight)    0.111111
        (lama, weight)      0.333333
        Name: count, dtype: float64

        If the Index has a name, the name is kept.

        >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
        >>> idx.value_counts().sort_index()
        0    3
        1    2
        2    1
        3    1
        Name: koalas, dtype: int64
        """
        from databricks.koalas.series import Series, _col
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._internal._sdf.dropna()
        else:
            sdf_dropna = self._internal._sdf
        index_name = SPARK_INDEX_NAME_FORMAT(0)
        sdf = sdf_dropna.groupby(self._scol.alias(index_name)).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

        column_index = self._internal.column_index
        if (column_index[0] is None) or (None in column_index[0]):
            internal = _InternalFrame(sdf=sdf,
                                      index_map=[(index_name, None)],
                                      column_scols=[scol_for(sdf, 'count')])
        else:
            internal = _InternalFrame(
                sdf=sdf,
                index_map=[(index_name, None)],
                column_index=column_index,
                column_scols=[scol_for(sdf, 'count')],
                column_index_names=self._internal.column_index_names)

        return _col(DataFrame(internal))
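
For clarity, a plain-PySpark sketch of the aggregation performed above with
normalize=True (the column name "x" and the Spark DataFrame `spark_df` are
assumptions made for the example):

    from pyspark.sql import functions as F

    counts = spark_df.dropna().groupby(F.col("x").alias("__index_level_0__")).count()
    counts = counts.orderBy(F.col("count").desc())       # sort=True, ascending=False
    total = spark_df.dropna().count()
    normalized = counts.withColumn("count", F.col("count") / F.lit(total))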