Example #1
    def drop(self, labels):
        """
        Make new Index with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like

        Returns
        -------
        dropped : Index

        Examples
        --------
        >>> index = ks.Index([1, 2, 3])
        >>> index
        Int64Index([1, 2, 3], dtype='int64')

        >>> index.drop([1])
        Int64Index([2, 3], dtype='int64')
        """
        if not isinstance(labels, (tuple, list)):
            labels = [labels]
        sdf = self._internal.sdf[~self._internal.index_scols[0].isin(labels)]
        return Index(
            DataFrame(
                _InternalFrame(sdf=sdf,
                               index_map=self._kdf._internal.index_map)))
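A minimal sketch (plain PySpark, with hypothetical column names) of the filter used above: dropping index labels amounts to keeping only the rows whose index column is not in `labels`.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["idx", "val"])

labels = [1]
# Keep rows whose index value is NOT among the labels to drop.
sdf.filter(~F.col("idx").isin(labels)).show()   # rows with idx 2 and 3 remain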
Example #2
    def _apply_as_series_or_frame(self, func):
        """
        Wraps a function that handles a Spark column so that it can be used
        with both Koalas Series and DataFrame.

        Note that the given `func` name should be the same as the API's method name.
        """
        from databricks.koalas import DataFrame
        from databricks.koalas.series import _col
        from databricks.koalas.groupby import SeriesGroupBy

        kdf = self.kdf
        sdf = self.kdf._sdf

        # Here we need to include grouped key as an index, and shift previous index.
        #   [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
        new_index_scols = []
        new_index_map = []
        for groupkey in self._groupkeys:
            new_index_scols.append(
                # NOTE THAT this code intentionally uses `F.col` instead of the `scol` of
                # the given series. This is because, in the case of a series, we convert
                # it into a DataFrame, so if the given `groupkeys` are series, they end
                # up being different series.
                F.col(name_like_string(groupkey.name)
                      ).alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
                                  groupkey._internal.column_index[0]))

        for new_index_scol, index_map in zip(kdf._internal.index_scols,
                                             kdf._internal.index_map):
            new_index_scols.append(
                new_index_scol.alias(
                    SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            _, name = index_map
            new_index_map.append(
                (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

        applied = []
        for column in kdf.columns:
            applied.append(kdf[column]._with_new_scol(func(
                kdf[column]._scol)).rename(kdf[column].name))

        # pandas seems to filter out rows whose grouped key is NA.
        cond = self._groupkeys[0]._scol.isNotNull()
        for c in self._groupkeys:
            cond = cond | c._scol.isNotNull()
        sdf = sdf.select(new_index_scols + [c._scol
                                            for c in applied]).filter(cond)

        internal = _InternalFrame(
            sdf=sdf,
            data_columns=[c._internal.data_columns[0] for c in applied],
            index_map=new_index_map)

        ret = DataFrame(internal)
        if isinstance(self._groupby, SeriesGroupBy):
            return _col(ret)
        else:
            return ret
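The wrapping idea above can be illustrated without Spark at all. A minimal, self-contained sketch in plain Python (all names hypothetical): a per-column function is applied either to a single column or to every column of a frame-like dict, mirroring the Series/DataFrame duality.

def apply_as_series_or_frame(func, data):
    """Apply a per-column `func` to a single column (list) or to a dict
    of named columns, mirroring the Series/DataFrame duality."""
    if isinstance(data, dict):              # DataFrame-like input
        return {name: func(col) for name, col in data.items()}
    return func(data)                       # Series-like input

double = lambda col: [v * 2 for v in col]
print(apply_as_series_or_frame(double, [1, 2, 3]))              # [2, 4, 6]
print(apply_as_series_or_frame(double, {"a": [1], "b": [3]}))   # {'a': [2], 'b': [6]}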
Example #3
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the
                # numeric types. Converting the NaNs is used in a few places; it should
                # be in utils.
                # Handle floating point types specially because Spark's count treats NaN
                # as a valid value, whereas pandas' count doesn't include NaN.
                if isinstance(spark_type, DoubleType) or isinstance(
                        spark_type, FloatType):
                    stat_exprs.append(
                        sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=data_columns,
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return DataFrame(internal)
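The NaN handling described in the comment above can be demonstrated in plain PySpark (a hedged sketch, not Koalas internals): Spark's count() treats NaN as a valid value, so float columns are first passed through F.nanvl to turn NaN into null, which count() then skips, matching pandas.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], ["x"])

df.select(
    F.count("x").alias("spark_count"),                                     # 2: NaN is counted, null is not
    F.count(F.nanvl(F.col("x"), F.lit(None))).alias("pandas_like_count"),  # 1: NaN becomes null and is skipped
).show()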
Example #4
    def drop_duplicates(self):
        """
        Return Index with duplicate values removed.

        Returns
        -------
        deduplicated : Index

        See Also
        --------
        Series.drop_duplicates : Equivalent method on Series.
        DataFrame.drop_duplicates : Equivalent method on DataFrame.

        Examples
        --------
        Generate an Index with duplicate values.

        >>> idx = ks.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

        >>> idx.drop_duplicates() # doctest: +SKIP
        Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
        """
        sdf = self._internal.sdf.select(
            self._internal.index_scols).drop_duplicates()
        internal = _InternalFrame(sdf=sdf,
                                  index_map=self._kdf._internal.index_map)
        result = DataFrame(internal).index
        return result
Example #5
    def size(self):
        """
        Compute group sizes.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
        ...                    'B': [1, 1, 2, 3, 3, 3]},
        ...                   columns=['A', 'B'])
        >>> df
           A  B
        0  1  1
        1  2  1
        2  2  2
        3  3  3
        4  3  3
        5  3  3

        >>> df.groupby('A').size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        A
        1  1
        2  2
        3  3
        Name: count, dtype: int64

        >>> df.groupby(['A', 'B']).size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        A  B
        1  1    1
        2  1    1
           2    1
        3  3    3
        Name: count, dtype: int64
        """
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf
        sdf = sdf.groupby(*groupkey_cols).count()
        if (len(self._agg_columns) > 0) and (self._have_agg_columns):
            name = self._agg_columns[0].name
            sdf = sdf.withColumnRenamed('count', name)
        else:
            name = 'count'
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=[name],
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return _col(DataFrame(internal))
Example #6
    def dropna(self):
        """
        Return Index or MultiIndex without NA/NaN values.

        Examples
        --------

        >>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
        ...                   index=['cobra', 'viper', None],
        ...                   columns=['max_speed', 'shield'])
        >>> df
               max_speed  shield
        cobra          1       2
        viper          4       5
        NaN            7       8

        >>> df.index.dropna()
        Index(['cobra', 'viper'], dtype='object')

        MultiIndex is also supported.

        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                       [None, 'weight', 'length']],
        ...                      [[0, 1, 1, 1, 1, 1, 2, 2, 2],
        ...                       [0, 1, 1, 0, 1, 2, 1, 1, 2]])
        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
        ...               index=midx)
        >>> s
        lama    NaN        45.0
        cow     weight    200.0
                weight      1.2
                NaN        30.0
                weight    250.0
                length      1.5
        falcon  weight    320.0
                weight      1.0
                length      NaN
        Name: 0, dtype: float64

        >>> s.index.dropna()  # doctest: +SKIP
        MultiIndex([(   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'length'),
                    ('falcon', 'weight'),
                    ('falcon', 'weight'),
                    ('falcon', 'length')],
                   )
        """
        kdf = self._kdf.copy()
        sdf = kdf._internal.sdf.select(self._internal.index_scols).dropna()
        internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map)
        kdf = DataFrame(internal)
        return Index(kdf) if type(self) == Index else MultiIndex(kdf)
Example #7
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond, limit = self._select_rows(rows_sel)
        column_index, columns, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)
                if limit is not None:
                    if limit >= 0:
                        sdf = sdf.limit(limit)
                    else:
                        sdf = sdf.limit(sdf.count() + limit)

                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf
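The negative-limit branch above can be sketched in plain PySpark (hypothetical data): when a negative stop such as .iloc[:-n] is requested, keeping all but the last n rows is expressed as limit(count + stop).

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.range(5)                    # rows 0..4

limit = -2                              # e.g. "all but the last two rows"
# Row order is only well-defined here because range() produces ordered data.
sdf.limit(sdf.count() + limit).show()   # keeps 3 rows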
Example #8
    def drop(self, labels, level=None):
        """
        Make new MultiIndex with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like
            Must be a list of tuples
        level : int or level name, default None

        Returns
        -------
        dropped : MultiIndex

        Examples
        --------
        >>> index = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
        >>> index # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['a']) # doctest: +SKIP
        MultiIndex([('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        sdf = self._internal.sdf
        index_scols = self._internal.index_scols
        if level is None:
            scol = index_scols[0]
        else:
            scol = index_scols[level] if isinstance(level, int) else sdf[level]
        sdf = sdf[~scol.isin(labels)]
        return MultiIndex(
            DataFrame(
                _InternalFrame(sdf=sdf,
                               index_map=self._kdf._internal.index_map)))
Example #9
    def unique(self, level=None):
        """
        Return unique values in the index.
        Be aware that the order of unique values might differ from pandas.Index.unique.

        :param level: int or str, optional, default is None
        :return: Index without duplicates

        Examples
        --------
        >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique()
        Int64Index([1, 3], dtype='int64')

        >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=['d', 'e', 'e']).index.unique()
        Index(['e', 'd'], dtype='object')
        """
        if level is not None:
            self._validate_index_level(level)
        sdf = self._kdf._sdf.select(self._scol.alias(self._internal.index_columns[0])).distinct()
        return DataFrame(_InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)).index
Example #10
    def _apply(self, func, return_schema):
        index_columns = self._kdf._internal.index_columns
        index_names = self._kdf._internal.index_names
        data_columns = self._kdf._internal.data_columns

        def rename_output(pdf):
            # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
            #   within each pdf properly. We might have to deduplicate it.
            import pandas as pd

            if len(index_columns) > 0:
                append = False
                for index_field in index_columns:
                    drop = index_field not in data_columns
                    pdf = pdf.set_index(index_field, drop=drop, append=append)
                    append = True
                pdf = pdf[data_columns]

            if len(index_names) > 0:
                if isinstance(pdf.index, pd.MultiIndex):
                    pdf.index.names = index_names
                else:
                    pdf.index.name = index_names[0]

            pdf = func(pdf)
            # For now, just positionally map the column names to given schema's.
            pdf = pdf.rename(
                columns=dict(zip(pdf.columns, return_schema.fieldNames())))
            return pdf

        grouped_map_func = pandas_udf(return_schema,
                                      PandasUDFType.GROUPED_MAP)(rename_output)

        sdf = self._kdf._sdf
        input_groupkeys = [s._scol for s in self._groupkeys]
        sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=return_schema.fieldNames(),
                                  index_map=[])  # index is lost.
        return DataFrame(internal)
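A hedged sketch of the GROUPED_MAP mechanism the method above builds on, using plain PySpark (the column names and the demean function are illustrative only): each group arrives in the UDF as a pandas DataFrame and must be returned with the declared schema.

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a", 1.0), ("a", 3.0), ("b", 5.0)], ["key", "val"])

@pandas_udf("key string, val double", PandasUDFType.GROUPED_MAP)
def demean(pdf):
    # Each group is a pandas DataFrame; return one matching the declared schema.
    return pdf.assign(val=pdf["val"] - pdf["val"].mean())

sdf.groupby("key").apply(demean).show()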
Example #11
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : dict
             a dict mapping from column name (string) to
             aggregate functions (string or list of strings).

        Returns
        -------
        Series or DataFrame

            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return Series or DataFrame.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]},
        ...                   columns=['A', 'B', 'C'])

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})
        >>> aggregated  # doctest: +NORMALIZE_WHITESPACE
             B
           min  max
        A
        1    1    2
        2    3    4

        """
        if not isinstance(func_or_funcs, dict) or \
                not all(isinstance(key, str) and
                        (isinstance(value, str) or
                         isinstance(value, list) and all(isinstance(v, str) for v in value))
                        for key, value in func_or_funcs.items()):
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string or list of strings).")

        sdf = self._kdf._sdf
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        multi_aggs = any(isinstance(v, list) for v in func_or_funcs.values())
        reordered = []
        data_columns = []
        column_index = []
        for key, value in func_or_funcs.items():
            for aggfunc in [value] if isinstance(value, str) else value:
                data_col = "('{0}', '{1}')".format(
                    key, aggfunc) if multi_aggs else key
                data_columns.append(data_col)
                column_index.append((key, aggfunc))
                if aggfunc == "nunique":
                    reordered.append(
                        F.expr('count(DISTINCT `{0}`) as `{1}`'.format(
                            key, data_col)))
                else:
                    reordered.append(
                        F.expr('{1}(`{0}`) as `{2}`'.format(
                            key, aggfunc, data_col)))
        sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
        internal = _InternalFrame(
            sdf=sdf,
            data_columns=data_columns,
            column_index=column_index if multi_aggs else None,
            index_map=[('__index_level_{}__'.format(i), s.name)
                       for i, s in enumerate(groupkeys)])
        kdf = DataFrame(internal)
        if not self._as_index:
            kdf = kdf.reset_index()
        return kdf
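The expression rewriting above can be illustrated in plain PySpark (hypothetical data): pandas' "nunique" has no Spark aggregate of the same name, so it is expanded to count(DISTINCT ...), while other aggregates map by name.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 1), (1, 2), (2, 2), (2, 2)], ["A", "B"])

sdf.groupby("A").agg(
    F.expr("count(DISTINCT `B`) as `B_nunique`"),   # what "nunique" expands to
    F.expr("min(`B`) as `B_min`"),                  # what "min" expands to
).show()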
Example #12
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : dict
             a dict mapping from column name (string) to aggregate functions (string).

        Returns
        -------
        Series or DataFrame

            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return Series or DataFrame.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]},
        ...                   columns=['A', 'B', 'C'])

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        """
        if not isinstance(func_or_funcs, dict) or \
                not all(isinstance(key, str) and isinstance(value, str)
                        for key, value in func_or_funcs.items()):
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string).")

        sdf = self._kdf._sdf
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        reordered = []
        for key, value in func_or_funcs.items():
            if value == "nunique":
                reordered.append(
                    F.expr('count(DISTINCT {0}) as {0}'.format(key)))
            else:
                reordered.append(F.expr('{1}({0}) as {0}'.format(key, value)))
        sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
        internal = _InternalFrame(
            sdf=sdf,
            data_columns=[key for key, _ in func_or_funcs.items()],
            index_map=[('__index_level_{}__'.format(i), s.name)
                       for i, s in enumerate(groupkeys)])
        return DataFrame(internal)
Example #13
    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        For Series

        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64

        For Index

        >>> from databricks.koalas.indexes import Index
        >>> idx = Index([3, 1, 2, 3, 4, np.nan])
        >>> idx
        Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

        >>> idx.value_counts().sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        Name: count, dtype: int64

        **sort**

        With `sort` set to `False`, the result would not be sorted by count.

        >>> idx.value_counts(sort=False).sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        Name: count, dtype: int64

        **normalize**

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> idx.value_counts(normalize=True).sort_index()
        1.0    0.2
        2.0    0.2
        3.0    0.4
        4.0    0.2
        Name: count, dtype: float64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        NaN    1
        Name: count, dtype: int64

        For MultiIndex.

        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                       ['speed', 'weight', 'length']],
        ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
        >>> s.index  # doctest: +SKIP
        MultiIndex([(  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'length'),
                    ('falcon', 'weight'),
                    ('falcon', 'length'),
                    ('falcon', 'length')],
                   )

        >>> s.index.value_counts().sort_index()
        (cow, length)       1
        (cow, weight)       2
        (falcon, length)    2
        (falcon, weight)    1
        (lama, weight)      3
        Name: count, dtype: int64

        >>> s.index.value_counts(normalize=True).sort_index()
        (cow, length)       0.111111
        (cow, weight)       0.222222
        (falcon, length)    0.222222
        (falcon, weight)    0.111111
        (lama, weight)      0.333333
        Name: count, dtype: float64

        If the Index has a name, the name is kept.

        >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
        >>> idx.value_counts().sort_index()
        0    3
        1    2
        2    1
        3    1
        Name: koalas, dtype: int64
        """
        from databricks.koalas.series import _col

        if bins is not None:
            raise NotImplementedError("value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._internal._sdf.select(self.spark_column).dropna()
        else:
            sdf_dropna = self._internal._sdf.select(self.spark_column)
        index_name = SPARK_DEFAULT_INDEX_NAME
        column_name = self._internal.data_spark_column_names[0]
        sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col("count"))
            else:
                sdf = sdf.orderBy(F.col("count").desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn("count", F.col("count") / F.lit(sum))

        column_labels = self._internal.column_labels
        if (column_labels[0] is None) or (None in column_labels[0]):
            internal = _InternalFrame(
                spark_frame=sdf,
                index_map=OrderedDict({index_name: None}),
                data_spark_columns=[scol_for(sdf, "count")],
            )
        else:
            internal = _InternalFrame(
                spark_frame=sdf,
                index_map=OrderedDict({index_name: None}),
                column_labels=column_labels,
                data_spark_columns=[scol_for(sdf, "count")],
                column_label_names=self._internal.column_label_names,
            )

        return _col(DataFrame(internal))
Example #14
    def _apply(self, func, return_schema, retain_index):
        should_infer_schema = return_schema is None
        input_groupnames = [s.name for s in self._groupkeys]

        if should_infer_schema:
            # Here we execute with the first 1000 records to get the return type.
            # If there are fewer than 1000 records, the pandas API is used directly as a shortcut.
            limit = 1000
            pdf = self._kdf.head(limit + 1).to_pandas()
            pdf = pdf.groupby(input_groupnames).apply(func)
            kdf = DataFrame(pdf)
            return_schema = kdf._sdf.schema
            if len(pdf) <= limit:
                return kdf

        index_columns = self._kdf._internal.index_columns
        index_names = self._kdf._internal.index_names
        data_columns = self._kdf._internal.data_columns

        def rename_output(pdf):
            # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
            #   within each pdf properly. We might have to deduplicate it.
            import pandas as pd

            if len(index_columns) > 0:
                append = False
                for index_field in index_columns:
                    drop = index_field not in data_columns
                    pdf = pdf.set_index(index_field, drop=drop, append=append)
                    append = True
                pdf = pdf[data_columns]

            if len(index_names) > 0:
                if isinstance(pdf.index, pd.MultiIndex):
                    pdf.index.names = index_names
                else:
                    pdf.index.name = index_names[0]

            pdf = func(pdf)

            if retain_index:
                # If the schema should be inferred, we don't restore the index. pandas seems
                # to restore the index in some cases.
                # When Spark output type is specified, without executing it, we don't know
                # if we should restore the index or not. For instance, see the example in
                # https://github.com/databricks/koalas/issues/628.

                # TODO: deduplicate this logic with _InternalFrame.from_pandas
                columns = pdf.columns

                index = pdf.index

                index_map = []
                if isinstance(index, pd.MultiIndex):
                    if index.names is None:
                        index_map = [('__index_level_{}__'.format(i), None)
                                     for i in range(len(index.levels))]
                    else:
                        index_map = [('__index_level_{}__'.format(i)
                                      if name is None else name, name)
                                     for i, name in enumerate(index.names)]
                else:
                    index_map = [(index.name if index.name is not None else
                                  '__index_level_0__', index.name)]

                new_index_columns = [
                    index_column for index_column, _ in index_map
                ]
                new_data_columns = [str(col) for col in columns]

                reset_index = pdf.reset_index()
                reset_index.columns = new_index_columns + new_data_columns
                for name, col in reset_index.iteritems():
                    dt = col.dtype
                    if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                        continue
                    reset_index[name] = col.replace({np.nan: None})
                pdf = reset_index

            # Just positionally map the column names to given schema's.
            pdf = pdf.rename(
                columns=dict(zip(pdf.columns, return_schema.fieldNames())))

            return pdf

        grouped_map_func = pandas_udf(return_schema,
                                      PandasUDFType.GROUPED_MAP)(rename_output)

        sdf = self._kdf._sdf
        input_groupkeys = [s._scol for s in self._groupkeys]
        sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)

        if should_infer_schema:
            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.copy(sdf=sdf)
        else:
            # Otherwise, it loses index.
            internal = _InternalFrame(sdf=sdf,
                                      data_columns=return_schema.fieldNames(),
                                      index_map=[])
        return DataFrame(internal)
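The schema-inference shortcut above, sketched in plain pandas (all names are illustrative): run func on a small head() sample to learn the output dtypes; if the whole dataset already fit in the sample, that pandas result is returned directly with no second pass over the full data.

import pandas as pd

def infer_then_apply(pdf, func, limit=1000):
    sample = func(pdf.head(limit + 1))
    inferred_dtypes = sample.dtypes          # stands in for the Spark return schema
    if len(pdf) <= limit:                    # small data: the sample already is the answer
        return sample, inferred_dtypes
    return func(pdf), inferred_dtypes        # otherwise apply to the full data

pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
result, dtypes = infer_then_apply(pdf, lambda df: df * 2)
print(dtypes)                                # a: int64, b: float64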
Example #15
    def symmetric_difference(self, other, result_name=None, sort=None):
        """
        Compute the symmetric difference of two Index objects.

        Parameters
        ----------
        other : Index or array-like
        result_name : str
        sort : True or None, default None
            Whether to sort the resulting index.
            * True : Attempt to sort the result.
            * None : Do not sort the result.

        Returns
        -------
        symmetric_difference : Index

        Notes
        -----
        ``symmetric_difference`` contains elements that appear in either
        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
        dropped.

        Examples
        --------
        >>> s1 = ks.Series([1, 2, 3, 4], index=[1, 2, 3, 4])
        >>> s2 = ks.Series([1, 2, 3, 4], index=[2, 3, 4, 5])

        >>> s1.index.symmetric_difference(s2.index)
        Int64Index([5, 1], dtype='int64')

        You can set the name of the resulting Index.

        >>> s1.index.symmetric_difference(s2.index, result_name='koalas')
        Int64Index([5, 1], dtype='int64', name='koalas')

        You can set `sort` to `True` if you want to sort the resulting index.

        >>> s1.index.symmetric_difference(s2.index, sort=True)
        Int64Index([1, 5], dtype='int64')

        You can also use the ``^`` operator:

        >>> s1.index ^ s2.index
        Int64Index([5, 1], dtype='int64')
        """
        if type(self) != type(other):
            raise NotImplementedError(
                "Doesn't support symmetric_difference between Index & MultiIndex for now"
            )

        sdf_self = self._kdf._sdf.select(self._internal.index_scols)
        sdf_other = other._kdf._sdf.select(other._internal.index_scols)

        sdf_symdiff = sdf_self.union(sdf_other) \
                              .subtract(sdf_self.intersect(sdf_other))

        if sort:
            sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols)

        internal = _InternalFrame(sdf=sdf_symdiff,
                                  index_map=self._internal.index_map)
        result = Index(DataFrame(internal))

        if result_name:
            result.name = result_name

        return result
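A hedged sketch (plain PySpark, hypothetical data) of the set identity the implementation uses: the symmetric difference equals the union of the two inputs minus their intersection.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
a = spark.createDataFrame([(1,), (2,), (3,), (4,)], ["idx"])
b = spark.createDataFrame([(2,), (3,), (4,), (5,)], ["idx"])

# union minus intersection == elements that appear in exactly one of the inputs
a.union(b).subtract(a.intersect(b)).show()   # rows 1 and 5, in arbitrary order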
Example #16
    def symmetric_difference(self, other, result_name=None, sort=None):
        """
        Compute the symmetric difference of two MultiIndex objects.

        Parameters
        ----------
        other : Index or array-like
        result_name : list
        sort : True or None, default None
            Whether to sort the resulting index.
            * True : Attempt to sort the result.
            * None : Do not sort the result.

        Returns
        -------
        symmetric_difference : MultiIndex

        Notes
        -----
        ``symmetric_difference`` contains elements that appear in either
        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
        dropped.

        Examples
        --------
        >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> midx2 = pd.MultiIndex([['koalas', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> s1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...                index=midx1)
        >>> s2 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...              index=midx2)

        >>> s1.index.symmetric_difference(s2.index)  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can set the names of the resulting Index.

        >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b'])  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   names=['a', 'b'])

        You can set `sort` to `True` if you want to sort the resulting index.

        >>> s1.index.symmetric_difference(s2.index, sort=True)  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can also use the ``^`` operator:

        >>> s1.index ^ s2.index  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )
        """
        if type(self) != type(other):
            raise NotImplementedError(
                "Doesn't support symmetric_difference between Index & MultiIndex for now"
            )

        sdf_self = self._kdf._sdf.select(self._internal.index_scols)
        sdf_other = other._kdf._sdf.select(other._internal.index_scols)

        sdf_symdiff = sdf_self.union(sdf_other) \
                              .subtract(sdf_self.intersect(sdf_other))

        if sort:
            sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols)

        internal = _InternalFrame(sdf=sdf_symdiff,
                                  index_map=self._internal.index_map)
        result = MultiIndex(DataFrame(internal))

        if result_name:
            result.names = result_name

        return result
Example #17
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.indexes import Index
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".iloc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._kser)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Index):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel == slice(None):
                # A full slice(None) selects everything, so there is nothing to do
                pass
            elif (rows_sel.start is not None) or (rows_sel.step is not None):
                raiseNotImplemented("Cannot use start or step with Spark.")
            elif not isinstance(rows_sel.stop, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".
                    format(rows_sel.stop, type(rows_sel.stop)))
            elif rows_sel.stop >= 0:
                sdf = sdf.limit(rows_sel.stop)
            else:
                sdf = sdf.limit(sdf.count() + rows_sel.stop)
        else:
            raiseNotImplemented(
                ".iloc requires numeric slice or conditional boolean Index, "
                "got {}".format(rows_sel))

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            columns = [cols_sel._scol]
        elif isinstance(cols_sel, int):
            columns = [self._kdf._internal.column_scols[cols_sel]]
        elif cols_sel is None or cols_sel == slice(None):
            columns = self._kdf._internal.column_scols
        elif isinstance(cols_sel, slice):
            if all(s is None or isinstance(s, int)
                   for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
                columns = self._kdf._internal.column_scols[cols_sel]
            else:
                not_none = cols_sel.start if cols_sel.start is not None \
                    else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
                raise TypeError(
                    'cannot do slice indexing with these indexers {} of {}'.
                    format(not_none, type(not_none)))
        elif is_list_like(cols_sel):
            if all(isinstance(s, int) for s in cols_sel):
                columns = [
                    self._kdf._internal.scol_for(col)
                    for col in self._kdf.columns[cols_sel]
                ]
            else:
                raise TypeError('cannot perform reduce with flexible type')
        else:
            raise ValueError(
                "Location based indexing can only have [integer, integer slice, "
                "listlike of integers, boolean array] types, got {}".format(
                    cols_sel))

        try:
            sdf = sdf.select(self._kdf._internal.index_scols + columns)
            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._kdf._internal.index_map)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))

        column_index = self._kdf._internal.column_index
        if cols_sel is not None:
            if isinstance(cols_sel, (Series, int)):
                column_index = None
            else:
                column_index = \
                    pd.MultiIndex.from_tuples(self._kdf._internal.column_index)[cols_sel].tolist()

        kdf = DataFrame(kdf._internal.copy(column_index=column_index))
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
Example #18
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond = self._select_rows(rows_sel)

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise LocIndexer._raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        returns_series = cols_sel is not None and isinstance(cols_sel, spark.Column)
        if cols_sel is None:
            column_index = self._internal.column_index
            columns = self._internal.column_scols
        elif isinstance(cols_sel, (str, tuple)):
            if isinstance(cols_sel, str):
                cols_sel = (cols_sel,)
            column_index, columns, returns_series = \
                self._get_from_multiindex_column(cols_sel)
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
            column_index = None
        elif all(isinstance(key, Series) for key in cols_sel):
            columns = [_make_col(key) for key in cols_sel]
            column_index = [key._internal.column_index[0] for key in cols_sel]
        elif all(isinstance(key, spark.Column) for key in cols_sel):
            columns = cols_sel
            column_index = None
        elif (any(isinstance(key, str) for key in cols_sel)
              and any(isinstance(key, tuple) for key in cols_sel)):
            raise TypeError('Expected tuple, got str')
        else:
            if all(isinstance(key, tuple) for key in cols_sel):
                level = self._internal.column_index_level
                if any(len(key) != level for key in cols_sel):
                    raise ValueError('All the key level should be the same as column index level.')

            column_to_index = list(zip(self._internal.data_columns,
                                       self._internal.column_index))
            columns = []
            column_index = []
            for key in cols_sel:
                found = False
                for column, idx in column_to_index:
                    if idx == key or idx[0] == key:
                        columns.append(_make_col(column))
                        column_index.append(idx)
                        found = True
                if not found:
                    raise KeyError("['{}'] not in index".format(key))

        if cond is None and returns_series:
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)

                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf
Example #19
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            data_spark_columns = self._internal.data_spark_columns
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError("Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                    list(self._kdf_or_kser.columns)
                ]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel)

            if cond is None and limit is None and returns_series:
                return self._kdf_or_kser._kser_for(column_labels[0])

        if remaining_index is not None:
            index_scols = self._internal.index_spark_columns[-remaining_index:]
            index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:])
        else:
            index_scols = self._internal.index_spark_columns
            index_map = self._internal.index_map

        if len(column_labels) > 0:
            column_labels = column_labels.copy()
            column_labels_level = max(
                len(label) if label is not None else 1 for label in column_labels
            )
            none_column = 0
            for i, label in enumerate(column_labels):
                if label is None:
                    label = (str(none_column),)
                    none_column += 1
                if len(label) < column_labels_level:
                    label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
                column_labels[i] = label

            if self._internal.column_label_names is None:
                column_label_names = None
            else:
                # Manage column index names
                column_label_names = self._internal.column_label_names[-column_labels_level:]
        else:
            column_label_names = None

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            data_columns = sdf.select(data_spark_columns).columns
            sdf = sdf.select(index_scols + data_spark_columns)
        except AnalysisException:
            raise KeyError(
                "[{}] don't exist in columns".format(
                    [col._jc.toString() for col in data_spark_columns]
                )
            )

        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=index_map,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]), anchor=kdf
            )
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
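The head(2) check near the end above can be mimicked in plain pandas (a sketch with illustrative names): fetching at most two matching rows is enough to distinguish no match (KeyError), exactly one match (return a scalar), and several matches (keep the larger object), without counting everything.

import pandas as pd

pdf = pd.DataFrame({"val": [10, 20, 30]}, index=["a", "b", "b"])

def lookup(label):
    matched = pdf.loc[pdf.index == label, "val"].head(2)
    if len(matched) == 0:
        raise KeyError(label)
    elif len(matched) == 1:
        return matched.iloc[0]              # single match: return a scalar
    return pdf.loc[pdf.index == label]      # several matches: keep the frame

print(lookup("a"))   # 10
print(lookup("b"))   # the two 'b' rows as a DataFrame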
Example #20
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._kser)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Series):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            assert len(self._kdf._internal.index_columns) > 0
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # A full slice(None) selects everything, so there is nothing to do
                pass
            elif len(self._kdf._internal.index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self._kdf.index.to_series()
                index_data_type = index_column.spark_type
                cond = []
                if start is not None:
                    cond.append(index_column._scol >= F.lit(start).cast(
                        index_data_type))
                if stop is not None:
                    cond.append(
                        index_column._scol <= F.lit(stop).cast(index_data_type)
                    )

                if len(cond) > 0:
                    sdf = sdf.where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                sdf = sdf.where(F.lit(False))
            elif len(self._kdf._internal.index_columns) == 1:
                index_column = self._kdf.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    sdf = sdf.where(index_column._scol == F.lit(
                        rows_sel[0]).cast(index_data_type))
                else:
                    sdf = sdf.where(
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]))
            else:
                raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")

        # make cols_sel a 1-tuple of string if a single string
        column_index = self._kdf._internal.column_index
        if isinstance(cols_sel, str):
            kdf = DataFrame(self._kdf._internal.copy(sdf=sdf))
            return kdf._get_from_multiindex_column((cols_sel, ))
        elif isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        if cols_sel is None:
            columns = self._kdf._internal.column_scols
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
            column_index = None
        elif all(isinstance(key, Series) for key in cols_sel):
            columns = [_make_col(key) for key in cols_sel]
            column_index = [key._internal.column_index[0] for key in cols_sel]
        elif all(isinstance(key, spark.Column) for key in cols_sel):
            columns = cols_sel
            column_index = None
        elif (any(isinstance(key, str) for key in cols_sel)
              and any(isinstance(key, tuple) for key in cols_sel)):
            raise TypeError('Expected tuple, got str')
        else:
            if all(isinstance(key, tuple) for key in cols_sel):
                level = self._kdf._internal.column_index_level
                if any(len(key) != level for key in cols_sel):
                    raise ValueError(
                        'All the key level should be the same as column index level.'
                    )

            column_to_index = list(
                zip(self._kdf._internal.data_columns,
                    self._kdf._internal.column_index))

            columns = []
            column_index = []
            for key in cols_sel:
                found = False
                for column, idx in column_to_index:
                    if idx == key or idx[0] == key:
                        columns.append(_make_col(column))
                        column_index.append(idx)
                        found = True
                if not found:
                    raise KeyError("['{}'] not in index".format(key))

        try:
            sdf = sdf.select(self._kdf._internal.index_scols + columns)
            index_columns = self._kdf._internal.index_columns
            data_columns = [
                column for column in sdf.columns if column not in index_columns
            ]
            column_scols = [scol_for(sdf, col) for col in data_columns]
            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._kdf._internal.index_map,
                                      column_index=column_index,
                                      column_scols=column_scols)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))

        if cols_sel is not None and isinstance(cols_sel, spark.Column):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
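The column-selection branches above accept a single label string, a Series reference, tuples for a multi-level column index, or the full slice. A minimal usage sketch, assuming a running SparkSession, ``import databricks.koalas as ks``, and made-up data:

import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, columns=['A', 'B'])

kdf.loc[:, 'A']         # single label -> string branch, returns a Series
kdf.loc[:, ['A', 'B']]  # list of labels -> matched against column_to_index, returns a DataFrame
kdf.loc[:, :]           # full slice -> cols_sel becomes None, all columns are kept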
Beispiel #21
0
    def transform(self, func):
        """
        Apply function column-by-column to the GroupBy object.

        The function passed to `transform` must take a Series as its first
        argument and return a Series. The given function is executed for
        each series in each grouped data.

        While `transform` is a very flexible method, its downside is that
        using it can be quite a bit slower than using more specific methods
        like `agg`. Koalas offers a wide range of methods that will
        be much faster than using `transform` for their specific purposes, so try to
        use them before reaching for `transform`.

        .. note:: unlike pandas, it is required for ``func`` to specify its return type hint.

        .. note:: the series within ``func`` is actually a pandas series. Therefore,
            any pandas API within this function is allowed.

        Parameters
        ----------
        func : callable
            A callable that takes a Series as its first argument, and
            returns a Series.

        Returns
        -------
        applied : DataFrame

        See Also
        --------
        aggregate : Apply aggregate function to the GroupBy object.
        Series.apply : Apply a function to a Series.

        Examples
        --------

        >>> df = ks.DataFrame({'A': [0, 0, 1],
        ...                    'B': [1, 2, 3],
        ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])

        >>> g = df.groupby('A')

        Notice that ``g`` has two groups, ``0`` and ``1``.
        Calling `transform` in various ways, we can get different grouping results.
        Below, the function passed to `transform` takes a Series as
        its argument and returns a Series. `transform` applies the function to each series
        within each grouped data and combines them into a new DataFrame:

        >>> def convert_to_string(x) -> ks.Series[str]:
        ...    return x.apply("a string {}".format)
        >>> g.transform(convert_to_string)  # doctest: +NORMALIZE_WHITESPACE
                    B           C
        0  a string 1  a string 4
        1  a string 2  a string 6
        2  a string 3  a string 5

        >>> def plus_max(x) -> ks.Series[np.int]:
        ...    return x + x.max()
        >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
           B   C
        0  3  10
        1  4  12
        2  6  10
        """
        # TODO: the code here is similar to GroupBy.apply; it needs to be deduplicated.
        if not isinstance(func, Callable):
            raise TypeError("%s object is not callable" % type(func))

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        if return_sig is None:
            raise ValueError(
                "Given function must have return type hint; however, not found."
            )

        return_type = _infer_return_type(func).tpe
        input_groupnames = [s.name for s in self._groupkeys]
        data_columns = self._kdf._internal.data_columns
        return_schema = StructType([
            StructField(c, return_type) for c in data_columns
            if c not in input_groupnames
        ])

        index_columns = self._kdf._internal.index_columns
        index_names = self._kdf._internal.index_names
        data_columns = self._kdf._internal.data_columns

        def rename_output(pdf):
            # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
            #   within each pdf properly. we might have to deduplicate it.
            import pandas as pd

            if len(index_columns) > 0:
                append = False
                for index_field in index_columns:
                    drop = index_field not in data_columns
                    pdf = pdf.set_index(index_field, drop=drop, append=append)
                    append = True
                pdf = pdf[data_columns]

            if len(index_names) > 0:
                if isinstance(pdf.index, pd.MultiIndex):
                    pdf.index.names = index_names
                else:
                    pdf.index.name = index_names[0]

            # pandas GroupBy.transform drops grouping columns.
            pdf = pdf.drop(columns=input_groupnames)
            pdf = pdf.transform(func)
            # Remaps to the original name, positionally.
            pdf = pdf.rename(
                columns=dict(zip(pdf.columns, return_schema.fieldNames())))
            return pdf

        grouped_map_func = pandas_udf(return_schema,
                                      PandasUDFType.GROUPED_MAP)(rename_output)

        sdf = self._kdf._sdf
        input_groupkeys = [s._scol for s in self._groupkeys]
        sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=return_schema.fieldNames(),
                                  index_map=[])  # index is lost.
        return DataFrame(internal)
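The heavy lifting above is a grouped-map pandas UDF: each group arrives in ``rename_output`` as a pandas DataFrame, is re-indexed, transformed, and renamed to the positional schema. A standalone PySpark sketch of that pattern, assuming Spark 2.4+ with PyArrow installed; the data, column names, and the ``plus_group_max`` function are illustrative only:

from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, LongType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(0, 1, 4), (0, 2, 6), (1, 3, 5)], ["A", "B", "C"])

schema = StructType([StructField("B", LongType()), StructField("C", LongType())])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def plus_group_max(pdf):
    # pdf holds one group as a pandas DataFrame; drop the grouping column
    # and transform the remaining columns, much like rename_output does above.
    pdf = pdf.drop(columns=["A"])
    return pdf + pdf.max()

sdf.groupby("A").apply(plus_group_max).show()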
Beispiel #22
0
    def value_counts(self, sort=None, ascending=None, dropna=True):
        """
        Compute the counts of unique values within each group.

        Parameters
        ----------
        sort : boolean, default None
            Sort by frequencies.
        ascending : boolean, default None
            Sort in ascending order.
        dropna : boolean, default True
            Don't include counts of NaN.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
        ...                    'B': [1, 1, 2, 3, 3, 3]},
        ...                   columns=['A', 'B'])
        >>> df
           A  B
        0  1  1
        1  2  1
        2  2  2
        3  3  3
        4  3  3
        5  3  3

        >>> df.groupby('A')['B'].value_counts().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        A  B
        1  1    1
        2  1    1
           2    1
        3  3    3
        Name: B, dtype: int64
        """
        groupkeys = self._groupkeys + self._agg_columns
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf
        agg_column = self._agg_columns[0].name
        sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed(
            'count', agg_column)

        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col(agg_column).asc())
            else:
                sdf = sdf.orderBy(F.col(agg_column).desc())

        internal = _InternalFrame(sdf=sdf,
                                  data_columns=[agg_column],
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return _col(DataFrame(internal))
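The aggregation above reduces to a plain Spark groupBy followed by ``count`` and a rename. A minimal PySpark sketch under that reading, assuming a running SparkSession; the data and the ``__index_level_*__`` aliases mirror the code above but are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(
    [(1, 1), (2, 1), (2, 2), (3, 3), (3, 3), (3, 3)], ["A", "B"])

counts = (sdf.groupby(sdf["A"].alias("__index_level_0__"),
                      sdf["B"].alias("__index_level_1__"))
             .count()
             .withColumnRenamed("count", "B"))

counts.orderBy("__index_level_0__", "__index_level_1__").show()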
Beispiel #23
0
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, tuple):
                if len(key) > 1:
                    raise SparkPandasIndexingError('Too many indexers')
                key = key[0]

            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf['__temp_col__'] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

            cond, limit = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_index = self._internal.column_index
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf['__temp_col__'] = rows_sel
                return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit = self._select_rows(rows_sel)
            column_index, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(self._internal.copy(
                    scol=column_scols[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(self._internal.index_scols + column_scols)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Manage column index names
                level = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-level:]

            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in column_scols]))

        if returns_series:
            return Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            return kdf
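When the row selector is a boolean Series anchored to the same DataFrame, the ``__temp_col__`` detour above is skipped and ``.loc`` simply filters on the condition. A small usage sketch, assuming ``import databricks.koalas as ks`` and made-up data:

import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, columns=['A', 'B'])

kdf.loc[kdf['A'] > 1]        # boolean Series -> filtered DataFrame
kdf.loc[kdf['A'] > 1, 'B']   # filter rows and select one column -> Series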
Beispiel #24
0
    def sort_values(self, ascending=True):
        """
        Return a sorted copy of the index.

        .. note:: pandas does not support this method when the index contains NaN values;
                  it raises an unexpected TypeError. Koalas supports it by treating NaN
                  as the smallest value.

        Parameters
        ----------
        ascending : bool, default True
            Should the index values be sorted in an ascending order.

        Returns
        -------
        sorted_index : ks.Index or ks.MultiIndex
            Sorted copy of the index.

        See Also
        --------
        Series.sort_values : Sort values of a Series.
        DataFrame.sort_values : Sort values in a DataFrame.

        Examples
        --------
        >>> idx = ks.Index([10, 100, 1, 1000])
        >>> idx
        Int64Index([10, 100, 1, 1000], dtype='int64')

        Sort values in ascending order (default behavior).

        >>> idx.sort_values()
        Int64Index([1, 10, 100, 1000], dtype='int64')

        Sort values in descending order.

        >>> idx.sort_values(ascending=False)
        Int64Index([1000, 100, 10, 1], dtype='int64')

        Support for MultiIndex.

        >>> kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)])
        >>> kidx  # doctest: +SKIP
        MultiIndex([('a', 'x', 1),
                    ('c', 'y', 2),
                    ('b', 'z', 3)],
                   )

        >>> kidx.sort_values()  # doctest: +SKIP
        MultiIndex([('a', 'x', 1),
                    ('b', 'z', 3),
                    ('c', 'y', 2)],
                   )

        >>> kidx.sort_values(ascending=False)  # doctest: +SKIP
        MultiIndex([('c', 'y', 2),
                    ('b', 'z', 3),
                    ('a', 'x', 1)],
                   )
        """
        sdf = self._internal.sdf
        sdf = sdf.orderBy(self._internal.index_scols, ascending=ascending)

        internal = _InternalFrame(sdf=sdf.select(self._internal.index_scols),
                                  index_map=self._kdf._internal.index_map)

        result = DataFrame(internal).index

        return result
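Under the hood this is just an ``orderBy`` over the index columns of the underlying Spark DataFrame. A standalone PySpark sketch, assuming a running SparkSession; the single ``__index_level_0__`` column and its values are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(10,), (100,), (1,), (1000,)], ["__index_level_0__"])

# ascending sort, as sort_values() does by default
sdf.orderBy(["__index_level_0__"], ascending=True).show()

# descending sort, as sort_values(ascending=False) does
sdf.orderBy(["__index_level_0__"], ascending=False).show()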
Beispiel #25
0
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        sdf = self._internal.sdf
        cond, limit = self._select_rows(rows_sel)
        if cond is not None:
            sdf = sdf.where(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)

        # normalize cols_sel into column expressions and a matching column index
        if isinstance(cols_sel, Series) and cols_sel._equals(self._kdf_or_kser):
            columns = cols_sel._internal.column_scols
            column_index = cols_sel._internal.column_index
        elif isinstance(cols_sel, int):
            columns = [self._internal.column_scols[cols_sel]]
            column_index = [self._internal.column_index[cols_sel]]
        elif cols_sel is None or cols_sel == slice(None):
            columns = self._internal.column_scols
            column_index = self._internal.column_index
        elif isinstance(cols_sel, slice):
            if all(s is None or isinstance(s, int)
                   for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
                columns = self._internal.column_scols[cols_sel]
                column_index = self._internal.column_index[cols_sel]
            else:
                not_none = cols_sel.start if cols_sel.start is not None \
                    else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
                raise TypeError('cannot do slice indexing with these indexers {} of {}'
                                .format(not_none, type(not_none)))
        elif is_list_like(cols_sel):
            if all(isinstance(s, bool) for s in cols_sel):
                cols_sel = [i for i, s in enumerate(cols_sel) if s]
            if all(isinstance(s, int) for s in cols_sel):
                columns = [self._internal.column_scols[s] for s in cols_sel]
                column_index = [self._internal.column_index[s] for s in cols_sel]
            else:
                raise TypeError('cannot perform reduce with flexible type')
        else:
            raise ValueError("Location based indexing can only have [integer, integer slice, "
                             "listlike of integers, boolean array] types, got {}".format(cols_sel))

        try:
            sdf = sdf.select(self._internal.index_scols + columns)
            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=self._internal.column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))

        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
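A minimal usage sketch of the positional column selectors handled above, assuming ``import databricks.koalas as ks`` and made-up data:

import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, columns=['A', 'B'])

kdf.iloc[:, 0]        # single integer -> returned through _col as a Series
kdf.iloc[:, [0, 1]]   # list of integers -> DataFrame
kdf.iloc[:, :1]       # integer slice over columns -> DataFrame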
Beispiel #26
0
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(
                    self._internal.copy(scol=column_scols[0],
                                        column_labels=[column_labels[0]]),
                    anchor=self._kdf_or_kser,
                )

        if remaining_index is not None:
            index_scols = self._internal.index_scols[-remaining_index:]
            index_map = self._internal.index_map[-remaining_index:]
        else:
            index_scols = self._internal.index_scols
            index_map = self._internal.index_map

        if self._internal.column_label_names is None:
            column_label_names = None
        else:
            # Manage column index names
            level = column_labels_level(column_labels)
            column_label_names = self._internal.column_label_names[-level:]

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(index_scols + column_scols)
        except AnalysisException:
            raise KeyError("[{}] don't exist in columns".format(
                [col._jc.toString() for col in column_scols]))

        internal = _InternalFrame(
            sdf=sdf,
            index_map=index_map,
            column_labels=column_labels,
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
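When every index level is pinned by the key, ``remaining_index`` reaches zero and the result collapses to a scalar, mirroring pandas; with more than one matching row the Series is returned as-is. A small sketch of that behavior, assuming ``import databricks.koalas as ks``:

import databricks.koalas as ks

kser = ks.Series([1, 2, 3], index=['a', 'b', 'c'])

kser.loc['b']         # all index levels pinned -> plain scalar 2
kser.loc[['a', 'c']]  # list of labels -> still a Series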
Beispiel #27
0
    def apply(self, func):
        """
        Apply function `func` group-wise and combine the results together.

        The function passed to `apply` must take a DataFrame as its first
        argument and return a DataFrame. `apply` will
        then take care of combining the results back together into a single
        dataframe. `apply` is therefore a highly flexible
        grouping method.

        While `apply` is a very flexible method, its downside is that
        using it can be quite a bit slower than using more specific methods
        like `agg` or `transform`. Koalas offers a wide range of methods that will
        be much faster than using `apply` for their specific purposes, so try to
        use them before reaching for `apply`.

        .. note:: unlike pandas, it is required for ``func`` to specify its return type hint.

        .. note:: the output column names are `c0, c1, c2 ... cn`. These names
            are positionally mapped to the returned DataFrame in ``func``. See examples below.

        .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,
            any pandas API within this function is allowed.

        Parameters
        ----------
        func : callable
            A callable that takes a DataFrame as its first argument, and
            returns a dataframe.

        Returns
        -------
        applied : DataFrame

        See Also
        --------
        aggregate : Apply aggregate function to the GroupBy object.
        Series.apply : Apply a function to a Series.

        Examples
        --------
        >>> df = ks.DataFrame({'A': 'a a b'.split(),
        ...                    'B': [1, 2, 3],
        ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])
        >>> g = df.groupby('A')

        Notice that ``g`` has two groups, ``a`` and ``b``.
        Calling `apply` in various ways, we can get different grouping results:

        Below, the function passed to `apply` takes a DataFrame as
        its argument and returns a DataFrame. `apply` combines the results for
        each group into a new DataFrame:

        >>> def pandas_div_sum(x) -> ks.DataFrame[float, float]:
        ...    return x[['B', 'C']] / x[['B', 'C']].sum()
        >>> g.apply(pandas_div_sum)  # doctest: +NORMALIZE_WHITESPACE
                 c0   c1
        0  1.000000  1.0
        1  0.333333  0.4
        2  0.666667  0.6

        >>> def plus_max(x) -> ks.DataFrame[str, np.int, np.int]:
        ...    return x + x.max()
        >>> g.apply(plus_max)  # doctest: +NORMALIZE_WHITESPACE
           c0  c1  c2
        0  bb   6  10
        1  aa   3  10
        2  aa   4  12
        """
        if not isinstance(func, Callable):
            raise TypeError("%s object is not callable" % type(func))

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        if return_sig is None:
            raise ValueError(
                "Given function must have return type hint; however, not found."
            )

        return_schema = _infer_return_type(func).tpe

        index_columns = self._kdf._internal.index_columns
        index_names = self._kdf._internal.index_names
        data_columns = self._kdf._internal.data_columns

        def rename_output(pdf):
            # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
            #   within each pdf properly. we might have to deduplicate it.
            import pandas as pd

            if len(index_columns) > 0:
                append = False
                for index_field in index_columns:
                    drop = index_field not in data_columns
                    pdf = pdf.set_index(index_field, drop=drop, append=append)
                    append = True
                pdf = pdf[data_columns]

            if len(index_names) > 0:
                if isinstance(pdf.index, pd.MultiIndex):
                    pdf.index.names = index_names
                else:
                    pdf.index.name = index_names[0]

            pdf = func(pdf)
            # For now, just positionally map the column names to given schema's.
            pdf = pdf.rename(
                columns=dict(zip(pdf.columns, return_schema.fieldNames())))
            return pdf

        grouped_map_func = pandas_udf(return_schema,
                                      PandasUDFType.GROUPED_MAP)(rename_output)

        sdf = self._kdf._sdf
        input_groupkeys = [s._scol for s in self._groupkeys]
        sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=return_schema.fieldNames(),
                                  index_map=[])  # index is lost.
        return DataFrame(internal)
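The return-type-hint requirement above is enforced with ``inspect.getfullargspec`` before anything is sent to Spark. A minimal sketch of that check in isolation, assuming koalas is installed; ``pandas_div_sum`` is the same illustrative function as in the docstring:

import inspect
import databricks.koalas as ks

def pandas_div_sum(x) -> ks.DataFrame[float, float]:
    return x[['B', 'C']] / x[['B', 'C']].sum()

spec = inspect.getfullargspec(pandas_div_sum)
return_sig = spec.annotations.get("return", None)
if return_sig is None:
    raise ValueError(
        "Given function must have return type hint; however, not found.")

# the hint that _infer_return_type maps to a Spark schema with columns c0, c1, ...
print(return_sig)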