Example #1
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
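For context, a minimal usage sketch of the method above (assumed setup, mirroring the doctests in Example #3 below; not part of the library source):

    # Hedged sketch, not from the Koalas source; data matches the docstring example.
    import numpy as np
    import databricks.koalas as ks

    kdf = ks.DataFrame({'x': [0, 0, 1, 1, 1, np.nan]})
    kdf.x.value_counts()                # counts per unique value, NaN dropped by default
    kdf.x.value_counts(normalize=True)  # relative frequencies instead of raw counts
    kdf.x.value_counts(dropna=False)    # keep the NaN bucket as its own row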
Example #2
    def __getitem__(self, key):
        from pyspark.sql.functions import lit
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Series):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                pass
            elif len(self._kdf._index_columns) == 0:
                raiseNotImplemented(
                    "Cannot use slice for Spark if no index provided.")
            elif len(self._kdf._index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                cond = []
                if start is not None:
                    cond.append(
                        index_column._scol >= lit(start).cast(index_data_type))
                if stop is not None:
                    cond.append(
                        index_column._scol <= lit(stop).cast(index_data_type))

                if len(cond) > 0:
                    sdf = sdf.where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                sdf = sdf.where(lit(False))
            elif len(self._kdf._index_columns) == 1:
                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                if len(rows_sel) == 1:
                    sdf = sdf.where(index_column._scol == lit(
                        rows_sel[0]).cast(index_data_type))
                else:
                    sdf = sdf.where(
                        index_column._scol.isin(
                            [lit(r).cast(index_data_type) for r in rows_sel]))
            else:
                raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        if cols_sel is None:
            columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
        else:
            columns = [_make_col(c) for c in cols_sel]
        try:
            kdf = DataFrame(
                sdf.select(self._kdf._metadata.index_columns + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))
        kdf._metadata = self._kdf._metadata.copy(
            data_columns=kdf._metadata.data_columns[-len(columns):])
        if cols_sel is not None and isinstance(cols_sel, spark.Column):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
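A hedged usage sketch of the `.loc` indexer implemented above (the data and constructor arguments are assumptions, not taken from the source; exact behavior depends on the Koalas version):

    # Hedged sketch, not from the Koalas source; assumed data and index.
    import databricks.koalas as ks

    kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[10, 20, 30])
    kdf.loc[10:20]            # index slice -> index_column >= 10 AND index_column <= 20
    kdf.loc[[10, 30], ['a']]  # list of index labels (isin) plus an explicit column list
    kdf.loc[kdf.a > 1]        # boolean Series condition becomes a Spark where() filter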
Example #3
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by frequencies.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**
        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64
        """
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
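A hedged follow-up to the doctests above (same assumed data): reading the implementation, when `dropna=False` the denominator used by `normalize=True` is the full row count, so the NaN bucket also receives a relative frequency.

    # Hedged sketch, not from the Koalas source; data matches the docstring example.
    import numpy as np
    import databricks.koalas as ks

    kdf = ks.DataFrame({'x': [0, 0, 1, 1, 1, np.nan]})
    # Counts are 3, 2 and 1 out of 6 rows, i.e. 0.5, ~0.33 and ~0.17 for 1.0, 0.0 and NaN.
    kdf.x.value_counts(normalize=True, dropna=False)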
Example #4
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.indexes import Index
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".iloc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Index):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                pass
            elif (rows_sel.start is not None) or (rows_sel.step is not None):
                raiseNotImplemented("Cannot use start or step with Spark.")
            elif not isinstance(rows_sel.stop, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".
                    format(rows_sel.stop, type(rows_sel.stop)))
            elif rows_sel.stop >= 0:
                sdf = sdf.limit(rows_sel.stop)
            else:
                sdf = sdf.limit(sdf.count() + rows_sel.stop)
        else:
            raiseNotImplemented(
                ".iloc requires numeric slice or conditional boolean Index, "
                "got {}".format(rows_sel))

        # normalize cols_sel into a list of Spark Columns to select
        if isinstance(cols_sel, Series):
            columns = [_make_col(cols_sel)]
        elif isinstance(cols_sel, int):
            columns = [_make_col(self._kdf.columns[cols_sel])]
        elif cols_sel is None or cols_sel == slice(None):
            columns = [_make_col(col) for col in self._kdf.columns]
        elif isinstance(cols_sel, slice):
            if all(s is None or isinstance(s, int)
                   for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
                columns = [
                    _make_col(col) for col in self._kdf.columns[cols_sel]
                ]
            else:
                not_none = cols_sel.start if cols_sel.start is not None \
                    else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
                raise TypeError(
                    'cannot do slice indexing with these indexers {} of {}'.
                    format(not_none, type(not_none)))
        elif is_list_like(cols_sel):
            if all(isinstance(s, int) for s in cols_sel):
                columns = [
                    _make_col(col) for col in self._kdf.columns[cols_sel]
                ]
            else:
                raise TypeError('cannot perform reduce with flexible type')
        else:
            raise ValueError(
                "Location based indexing can only have [integer, integer slice, "
                "listlike of integers, boolean array] types, got {}".format(
                    cols_sel))

        try:
            kdf = DataFrame(
                sdf.select(self._kdf._metadata.index_columns + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))
        kdf._metadata = self._kdf._metadata.copy(
            data_columns=kdf._metadata.data_columns[-len(columns):])
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
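A hedged usage sketch of the `.iloc` indexer above (assumed data; per the code, row selection supports only `slice(None)`, a stop-only slice, or a boolean Index):

    # Hedged sketch, not from the Koalas source; assumed data.
    import databricks.koalas as ks

    kdf = ks.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]})
    kdf.iloc[:2]        # row slice with only a stop -> sdf.limit(2)
    kdf.iloc[:2, [0]]   # integer positions select columns ('a') by position
    kdf.iloc[:2, 1]     # a single integer column selector returns a Series via _col()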
Example #5
    def __getitem__(self, key):
        from pyspark.sql.functions import lit
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Series):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                pass
            elif len(self._kdf._index_columns) == 0:
                raiseNotImplemented("Cannot use slice for Spark if no index provided.")
            elif len(self._kdf._index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                cond = []
                if start is not None:
                    cond.append(index_column._scol >= lit(start).cast(index_data_type))
                if stop is not None:
                    cond.append(index_column._scol <= lit(stop).cast(index_data_type))

                if len(cond) > 0:
                    sdf = sdf.where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                sdf = sdf.where(lit(False))
            elif len(self._kdf._index_columns) == 1:
                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                if len(rows_sel) == 1:
                    sdf = sdf.where(
                        index_column._scol == lit(rows_sel[0]).cast(index_data_type))
                else:
                    sdf = sdf.where(index_column._scol.isin(
                        [lit(r).cast(index_data_type) for r in rows_sel]))
            else:
                raiseNotImplemented("Cannot select with MultiIndex with Spark.")
        if cols_sel is None:
            columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
        else:
            columns = [_make_col(c) for c in cols_sel]
        try:
            kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))
        kdf._metadata = self._kdf._metadata.copy(
            data_columns=kdf._metadata.data_columns[-len(columns):])
        if cols_sel is not None and isinstance(cols_sel, spark.Column):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf