Example #1
 def _init_from_spark(self, sdf, metadata=None, *args):
     self._sdf = sdf
     if metadata is None:
         self._metadata = Metadata(
             column_fields=self._sdf.schema.fieldNames())
     else:
         self._metadata = metadata
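
A minimal usage sketch for the constructor above (assumed imports and Spark session; not part of the original snippet). With metadata=None, every field of the Spark schema becomes a column field and no index information is attached:

from pyspark.sql import SparkSession
import databricks.koalas as ks

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
kdf = ks.DataFrame(sdf)  # dispatches to _init_from_spark via the registry
print(kdf.columns)       # all Spark schema fields are treated as columns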
Example #2
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            total = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
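
A short usage sketch (assumed import alias and a running Spark session):

import numpy as np
import databricks.koalas as ks

s = ks.DataFrame({'x': [0, 0, 1, 1, 1, np.nan]}).x
s.value_counts()                # NaNs dropped by default, sorted descending
s.value_counts(normalize=True)  # relative frequencies instead of raw counts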
Example #3
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                         for i, s in enumerate(groupkeys)]
        sdf = self._kdf._sdf

        column_fields = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Special handle floating point types because Spark's count treats nan as a valid
                # value, whereas Pandas count doesn't include nan.
                if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType):
                    stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    column_fields.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    column_fields.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        metadata = Metadata(column_fields=column_fields,
                            index_info=[('__index_level_{}__'.format(i), s.name)
                                        for i, s in enumerate(groupkeys)])
        return DataFrame(sdf, metadata)
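
The floating-point special case above can be shown in plain PySpark; this is a hedged sketch with assumed names (sdf, 'key', 'value'), not code from the project. Spark's count() treats NaN as a valid value, so NaN is first turned into NULL with nanvl(), which count() then skips, matching pandas:

from pyspark.sql import functions as F

sdf_agg = sdf.groupby('key').agg(
    F.count(F.nanvl(F.col('value'), F.lit(None))).alias('value'))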
Example #4
 def from_pandas(self, pdf):
     if isinstance(pdf, pd.Series):
         return _col(self.from_pandas(pd.DataFrame(pdf)))
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields
     df = self.createDataFrame(reset_index)
     df._metadata = metadata
     return df
Example #5
 def _init_from_pandas(self, pdf, *args):
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields
     schema = StructType([StructField(name, infer_pd_series_spark_type(col),
                                      nullable=bool(col.isnull().any()))
                          for name, col in reset_index.iteritems()])
     for name, col in reset_index.iteritems():
         dt = col.dtype
         if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
             continue
         reset_index[name] = col.replace({np.nan: None})
     self._init_from_spark(default_session().createDataFrame(reset_index, schema=schema),
                           metadata)
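
A small illustrative sketch (assumed, standalone) of why the NaN-to-None replacement above is needed: pandas stores missing values in object columns as float NaN, which Spark's schema verification rejects for non-float fields, whereas None becomes a proper SQL NULL:

import numpy as np
import pandas as pd

col = pd.Series(['a', np.nan, 'c'])
col.replace({np.nan: None}).tolist()  # ['a', None, 'c'] -- NULL-friendly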
Example #6
def from_pandas(pdf):
    """Create DataFrame from pandas DataFrame.

    This is similar to `SparkSession.createDataFrame()` with a pandas DataFrame, but this
    also preserves the index of the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return _col(from_pandas(pd.DataFrame(pdf)))
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    df = default_session().createDataFrame(reset_index)
    df._metadata = metadata
    return df
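
Usage sketch (assumed import alias): unlike a plain SparkSession.createDataFrame() call, the pandas index survives the conversion:

import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
kdf = ks.from_pandas(pdf)  # index labels 'x' and 'y' are preserved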
Example #7
    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
        if bins is not None:
            raise NotImplementedError("value_counts currently does not support bins")

        if dropna:
            df_dropna = self._pandas_anchor._spark_filter(self.notna())
        else:
            df_dropna = self._pandas_anchor
        df = df_dropna._spark_groupby(self).count()
        if sort:
            if ascending:
                df = df._spark_orderBy(F._spark_col('count'))
            else:
                df = df._spark_orderBy(F._spark_col('count')._spark_desc())

        if normalize:
            total = df_dropna._spark_count()
            df = df._spark_withColumn('count', F._spark_col('count') / F._spark_lit(total))

        index_name = 'index' if self.name != 'index' else 'level_0'
        df.columns = [index_name, self.name]
        df._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
        return _col(df)
Example #8
 def to_dataframe(self):
     sdf = self._kdf._sdf.select([field for field, _ in self._index_info] + [self._scol])
     metadata = Metadata(column_fields=[sdf.schema[-1].name], index_info=self._index_info)
     return DataFrame(sdf, metadata)
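
A hedged usage sketch, assuming this method lives on the Koalas Series class:

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]})
one_col = kdf.a.to_dataframe()  # one-column DataFrame, index columns kept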
Example #9
 def _init_from_pandas(self, pdf, *args):
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields
     self._init_from_spark(default_session().createDataFrame(reset_index),
                           metadata)
Example #10
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to a pandas DataFrame logically. This holds a Spark
    DataFrame internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """
    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 dtype=None,
                 copy=False):
        pdf = pd.DataFrame(data=data,
                           index=index,
                           columns=columns,
                           dtype=dtype,
                           copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        self._init_from_spark(default_session().createDataFrame(reset_index),
                              metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(
                column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [
            self._sdf.__getitem__(field)
            for field in self._metadata.index_fields
        ]

    def _reduce_for_stat_function(self, sfun):
        sdf = self._sdf.select(
            [sfun(self._sdf[col]).alias(col) for col in self.columns])
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series

    def corr(self, method='pearson'):
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'spearman'}
            * pearson : standard correlation coefficient
            * spearman : Spearman rank correlation

        Returns
        -------
        y : pandas.DataFrame

        See Also
        --------
        Series.corr

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr('pearson')
                  dogs      cats
        dogs  1.000000 -0.851064
        cats -0.851064  1.000000

        >>> df.corr('spearman')
                  dogs      cats
        dogs  1.000000 -0.948683
        cats -0.948683  1.000000

        Notes
        -----
        There are behavior differences between Koalas and pandas.

        * the `method` argument only accepts 'pearson', 'spearman'
        * the data should not contain NaNs; Koalas will return an error if it does.
        * Koalas doesn't support the following argument(s).

          * `min_periods` argument is not supported
        """
        return corr(self, method)

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    @derived_from(pd.DataFrame)
    def to_html(self,
                buf=None,
                columns=None,
                col_space=None,
                header=True,
                index=True,
                na_rep='NaN',
                formatters=None,
                float_format=None,
                sparsify=None,
                index_names=True,
                justify=None,
                max_rows=None,
                max_cols=None,
                show_dimensions=False,
                decimal='.',
                bold_rows=True,
                classes=None,
                escape=True,
                notebook=False,
                border=None,
                table_id=None,
                render_links=False):
        return self.toPandas().to_html(buf=buf,
                                       columns=columns,
                                       col_space=col_space,
                                       header=header,
                                       index=index,
                                       na_rep=na_rep,
                                       formatters=formatters,
                                       float_format=float_format,
                                       sparsify=sparsify,
                                       index_names=index_names,
                                       justify=justify,
                                       max_rows=max_rows,
                                       max_cols=max_cols,
                                       show_dimensions=show_dimensions,
                                       decimal=decimal,
                                       bold_rows=bold_rows,
                                       classes=classes,
                                       escape=escape,
                                       notebook=notebook,
                                       border=border,
                                       table_id=table_id,
                                       render_links=render_links)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError(
                'Currently supported only when the DataFrame has a single index.'
            )
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [
                column for column in self._metadata.column_fields
                if column not in keys
            ]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column)
                                                      for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns,
                                       index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information in
        the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None.
        For a standard index, the index name will be used (if set), otherwise a default 'index' or
        'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by default
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index to the
                     default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError(
                'Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [
                (column, name if name is not None else rename(i))
                for i, (column, name) in enumerate(self._metadata.index_info)
            ]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError(
                            'Too many levels: Index has only {} level, not {}'.
                            format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level {} not found'.format(l))
                        else:
                            raise KeyError(
                                'Level {} must be same as name ({})'.format(
                                    l, self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field, index_name if index_name
                                      is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name
                   for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf

    @derived_from(pd.DataFrame)
    def isnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    @derived_from(pd.DataFrame)
    def notnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    @derived_from(spark.DataFrame)
    def toPandas(self):
        sdf = self._sdf.select(
            ['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({
                field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                for field in sdf.schema
            })
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or callable(v)
                    or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            else:
                sdf = sdf.withColumn(name, c)

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields + [
                name for name, _ in pairs
                if name not in self._metadata.column_fields
            ]))
        return DataFrame(sdf, metadata)

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self,
               axis=0,
               how='any',
               thresh=None,
               subset=None,
               inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [
                    column for column in columns
                    if column not in self._metadata.column_fields
                ]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            cnt = reduce(lambda x, y: x + y, [
                F.when(self[column].notna()._scol, 1).otherwise(0)
                for column in columns
            ], F.lit(0))
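            # cnt is, per row, the number of non-null cells among `columns`:
            # how='any' keeps rows with no nulls (cnt == len(columns)), and
            # how='all' keeps rows with at least one non-null (cnt > 0).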
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())

        else:
            raise NotImplementedError(
                "dropna currently only works for axis=0 or axis='index'")

    def head(self, n=5):
        """
        Return the first `n` rows.

        This function returns the first `n` rows for the object based
        on position. It is useful for quickly testing if your object
        has the right type of data in it.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        obj_head : same type as caller
            The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the first 5 lines

        >>> df.head()
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey

        Viewing the first `n` lines (three in this case)

        >>> df.head(3)
              animal
        0  alligator
        1        bee
        2     falcon
        """

        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        """The column labels of the DataFrame."""
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields + [
            self[old_name]._scol.alias(new_name)
            for (old_name, new_name) in zip(old_names, names)
        ])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)

    @property
    def dtypes(self):
        """Return the dtypes in the DataFrame.

        This returns a Series with the data type of each column. The result's index is the original
        DataFrame's columns. Columns with mixed types are stored with the object dtype.

        :return: :class:`pd.Series` The data type of each column.

        Examples
        --------
        >>> df = ks.DataFrame({'a': list('abc'),
        ...                    'b': list(range(1, 4)),
        ...                    'c': np.arange(3, 6).astype('i1'),
        ...                    'd': np.arange(4.0, 7.0, dtype='float64'),
        ...                    'e': [True, False, True],
        ...                    'f': pd.date_range('20130101', periods=3)})
        >>> df.dtypes
        a            object
        b             int64
        c             int64
        d           float64
        e              bool
        f    datetime64[ns]
        dtype: object
        """
        return pd.Series(
            [self[col].dtype for col in self._metadata.column_fields],
            index=self._metadata.column_fields)

    def count(self):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
        on `pandas.options.mode.use_inf_as_na`) are considered NA.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.
            If `level` is specified returns a `DataFrame`.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
          Person   Age  Single
        0   John  24.0   False
        1   Myla   NaN    True
        2  Lewis  21.0    True
        3   John  33.0    True
        4   Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        """
        return self._sdf.count()

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column not in labels
                ])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column != labels
                ])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self, by):
        return DataFrame(self._sdf.sort(by), self._metadata.copy())

    def groupby(self, by):
        from databricks.koalas.groups import PandasLikeGroupBy
        gp = self._sdf.groupby(by)
        return PandasLikeGroupBy(self, gp, None)

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword '
                                 'argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self,
                              self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self,
                          self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)

    def __repr__(self):
        return repr(self.toPandas())

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series
        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"
        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(
                **{k: value[c]
                   for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})

        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith(
                "_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self,
                      self._metadata.index_info)

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
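
A brief usage sketch for assign() as defined in this class (assumed import alias; callables receive the frame itself):

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]})
kdf2 = kdf.assign(b=kdf.a + 1,          # Series derived from the same frame
                  c=lambda d: d.a * 2)  # callable evaluated against kdf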
Example #11
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**
        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64
        """
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            total = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
Example #12
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to a pandas DataFrame logically. This holds a Spark
    DataFrame internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """
    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 dtype=None,
                 copy=False):
        pdf = pd.DataFrame(data=data,
                           index=index,
                           columns=columns,
                           dtype=dtype,
                           copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        schema = StructType([
            StructField(name,
                        infer_pd_series_spark_type(col),
                        nullable=bool(col.isnull().any()))
            for name, col in reset_index.iteritems()
        ])
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        self._init_from_spark(
            default_session().createDataFrame(reset_index, schema=schema),
            metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(
                column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [
            self._sdf.__getitem__(field)
            for field in self._metadata.index_fields
        ]

    def _reduce_for_stat_function(self, sfun):
        """
        Applies sfun to each column and returns a pd.Series where the number of rows equals
        the number of columns.

        :param sfun: either a 1-arg function that takes a Column and returns a Column, or
            a 2-arg function that takes a Column and its DataType and returns a Column.
        """
        from inspect import signature
        exprs = []
        num_args = len(signature(sfun).parameters)
        for col in self.columns:
            col_sdf = self._sdf[col]
            col_type = self._sdf.schema[col].dataType
            if isinstance(col_type,
                          BooleanType) and sfun.__name__ not in ('min', 'max'):
                # Stat functions cannot be used with boolean values by default
                # Thus, cast to integer (true to 1 and false to 0)
                # Exclude the min and max methods though since those work with booleans
                col_sdf = col_sdf.cast('integer')
            if num_args == 1:
                # Only pass in the column if sfun accepts only one arg
                col_sdf = sfun(col_sdf)
            else:  # must be 2
                assert num_args == 2
                # Pass in both the column and its data type if sfun accepts two args
                col_sdf = sfun(col_sdf, col_type)
            exprs.append(col_sdf.alias(col))

        sdf = self._sdf.select(*exprs)
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series
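
    # Illustrative sfun shapes (assumed, not from the source): a 1-arg sfun
    # such as F.mean maps a Column to a Column, while a 2-arg sfun also
    # receives the Spark DataType, e.g.
    #     def _count(col, dtype):
    #         if isinstance(dtype, (FloatType, DoubleType)):
    #             return F.count(F.nanvl(col, F.lit(None)))
    #         return F.count(col)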

    def corr(self, method='pearson'):
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'spearman'}
            * pearson : standard correlation coefficient
            * spearman : Spearman rank correlation

        Returns
        -------
        y : pandas.DataFrame

        See Also
        --------
        Series.corr

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr('pearson')
                  dogs      cats
        dogs  1.000000 -0.851064
        cats -0.851064  1.000000

        >>> df.corr('spearman')
                  dogs      cats
        dogs  1.000000 -0.948683
        cats -0.948683  1.000000

        Notes
        -----
        There are behavior differences between Koalas and pandas.

        * the `method` argument only accepts 'pearson', 'spearman'
        * the data should not contain NaNs; Koalas will return an error if it does.
        * Koalas doesn't support the following argument(s).

          * `min_periods` argument is not supported
        """
        return corr(self, method)

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    def to_html(self,
                buf=None,
                columns=None,
                col_space=None,
                header=True,
                index=True,
                na_rep='NaN',
                formatters=None,
                float_format=None,
                sparsify=None,
                index_names=True,
                justify=None,
                max_rows=None,
                max_cols=None,
                show_dimensions=False,
                decimal='.',
                bold_rows=True,
                classes=None,
                escape=True,
                notebook=False,
                border=None,
                table_id=None,
                render_links=False):
        """
        Render a DataFrame as an HTML table.

        .. note:: This method should only be used if the resulting Pandas object is expected
                  to be small, as all the data is loaded into the driver's memory. If the input
                  is large, set the max_rows parameter.

        Parameters
        ----------
        buf : StringIO-like, optional
            Buffer to write to.
        columns : sequence, optional, default None
            The subset of columns to write. Writes all columns by default.
        col_space : int, optional
            The minimum width of each column.
        header : bool, optional
            Write out the column names. If a list of strings is given, it
            is assumed to be aliases for the column names
        index : bool, optional, default True
            Whether to print index (row) labels.
        na_rep : str, optional, default 'NaN'
            String representation of NAN to use.
        formatters : list or dict of one-param. functions, optional
            Formatter functions to apply to columns' elements by position or
            name.
            The result of each function must be a unicode string.
            List must be of length equal to the number of columns.
        float_format : one-parameter function, optional, default None
            Formatter function to apply to columns' elements if they are
            floats. The result of this function must be a unicode string.
        sparsify : bool, optional, default True
            Set to False for a DataFrame with a hierarchical index to print
            every multiindex key at each row.
        index_names : bool, optional, default True
            Prints the names of the indexes.
        justify : str, default None
            How to justify the column labels. If None uses the option from
            the print configuration (controlled by set_option), 'right' out
            of the box. Valid values are

            * left
            * right
            * center
            * justify
            * justify-all
            * start
            * end
            * inherit
            * match-parent
            * initial
            * unset.
        max_rows : int, optional
            Maximum number of rows to display in the console.
        max_cols : int, optional
            Maximum number of columns to display in the console.
        show_dimensions : bool, default False
            Display DataFrame dimensions (number of rows by number of columns).
        decimal : str, default '.'
            Character recognized as decimal separator, e.g. ',' in Europe.
        bold_rows : bool, default True
            Make the row labels bold in the output.
        classes : str or list or tuple, default None
            CSS class(es) to apply to the resulting html table.
        escape : bool, default True
            Convert the characters <, >, and & to HTML-safe sequences.
        notebook : {True, False}, default False
            Whether the generated HTML is for IPython Notebook.
        border : int
            A ``border=border`` attribute is included in the opening
            `<table>` tag. Default ``pd.options.html.border``.
        table_id : str, optional
            A css id is included in the opening `<table>` tag if specified.
        render_links : bool, default False
            Convert URLs to HTML links (only works with Pandas 0.24+).

        Returns
        -------
        str (or unicode, depending on data and options)
            String representation of the dataframe.

        See Also
        --------
        to_string : Convert DataFrame to a string.
        """
        # Make sure locals() call is at the top of the function so we don't capture local variables.
        args = locals()
        if max_rows is not None:
            kdf = self.head(max_rows)
        else:
            kdf = self

        return validate_arguments_and_invoke_function(kdf.to_pandas(),
                                                      self.to_html,
                                                      pd.DataFrame.to_html,
                                                      args)
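
    # Usage sketch (assumed): kdf.to_html(max_rows=1) truncates with
    # head(max_rows) so only that many rows are collected to the driver,
    # then forwards the matching arguments to pandas.DataFrame.to_html.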

    def to_string(self,
                  buf=None,
                  columns=None,
                  col_space=None,
                  header=True,
                  index=True,
                  na_rep='NaN',
                  formatters=None,
                  float_format=None,
                  sparsify=None,
                  index_names=True,
                  justify=None,
                  max_rows=None,
                  max_cols=None,
                  show_dimensions=False,
                  decimal='.',
                  line_width=None):
        """
        Render a DataFrame to a console-friendly tabular output.

        .. note:: This method should only be used if the resulting Pandas object is expected
                  to be small, as all the data is loaded into the driver's memory. If the input
                  is large, set the max_rows parameter.

        Parameters
        ----------
        buf : StringIO-like, optional
            Buffer to write to.
        columns : sequence, optional, default None
            The subset of columns to write. Writes all columns by default.
        col_space : int, optional
            The minimum width of each column.
        header : bool, optional
            Write out the column names. If a list of strings is given, it
            is assumed to be aliases for the column names
        index : bool, optional, default True
            Whether to print index (row) labels.
        na_rep : str, optional, default 'NaN'
            String representation of NAN to use.
        formatters : list or dict of one-param. functions, optional
            Formatter functions to apply to columns' elements by position or
            name.
            The result of each function must be a unicode string.
            List must be of length equal to the number of columns.
        float_format : one-parameter function, optional, default None
            Formatter function to apply to columns' elements if they are
            floats. The result of this function must be a unicode string.
        sparsify : bool, optional, default True
            Set to False for a DataFrame with a hierarchical index to print
            every multiindex key at each row.
        index_names : bool, optional, default True
            Prints the names of the indexes.
        justify : str, default None
            How to justify the column labels. If None uses the option from
            the print configuration (controlled by set_option), 'right' out
            of the box. Valid values are

            * left
            * right
            * center
            * justify
            * justify-all
            * start
            * end
            * inherit
            * match-parent
            * initial
            * unset.
        max_rows : int, optional
            Maximum number of rows to display in the console.
        max_cols : int, optional
            Maximum number of columns to display in the console.
        show_dimensions : bool, default False
            Display DataFrame dimensions (number of rows by number of columns).
        decimal : str, default '.'
            Character recognized as decimal separator, e.g. ',' in Europe.
        line_width : int, optional
            Width to wrap a line in characters.

        Returns
        -------
        str (or unicode, depending on data and options)
            String representation of the dataframe.

        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6

        >>> print(df.to_string(max_rows=2))
           col1  col2
        0     1     4
        1     2     5
        """
        # Make sure locals() call is at the top of the function so we don't capture local variables.
        args = locals()
        if max_rows is not None:
            kdf = self.head(max_rows)
        else:
            kdf = self

        return validate_arguments_and_invoke_function(kdf.to_pandas(),
                                                      self.to_string,
                                                      pd.DataFrame.to_string,
                                                      args)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError(
                'Currently supported only when the DataFrame has a single index.'
            )
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [
                column for column in self._metadata.column_fields
                if column not in keys
            ]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column)
                                                      for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns,
                                       index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information in
        the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None.
        For a standard index, the index name will be used (if set), otherwise a default 'index' or
        'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by default
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index to the
                     default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError(
                'Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [
                (column, name if name is not None else rename(i))
                for i, (column, name) in enumerate(self._metadata.index_info)
            ]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError(
                            'Too many levels: Index has only {} level, not {}'.
                            format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level {} not found'.format(l))
                        else:
                            raise KeyError(
                                'Level {} must be same as name ({})'.format(
                                    l, self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field, index_name if index_name
                                      is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name
                   for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf

    def isnull(self):
        """
        Detects missing values for items in the current Dataframe.

        Return a boolean same-sized Dataframe indicating if the values are NA.
        NA values, such as None or numpy.NaN, get mapped to True values.
        Everything else gets mapped to False values.

        See Also
        --------
        DataFrame.notnull

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
        >>> df.isnull()
               0      1
        0  False  False
        1  False   True
        2  False   True
        3  False  False

        >>> df = ks.DataFrame([[None, 'bee', None], ['dog', None, 'fly']])
        >>> df.isnull()
               0      1      2
        0   True  False   True
        1  False   True  False
        """
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    def notnull(self):
        """
        Detects non-missing values for items in the current Dataframe.

        This function takes a dataframe and indicates whether its
        values are valid (not missing, which is ``NaN`` in numeric
        datatypes, ``None`` or ``NaN`` in objects and ``NaT`` in datetimelike).

        See Also
        --------
        DataFrame.isnull

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
        >>> df.notnull()
              0      1
        0  True   True
        1  True  False
        2  True  False
        3  True   True

        >>> df = ks.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
        >>> df.notnull()
              0      1     2
        0  True   True  True
        1  True  False  True
        """
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    def to_koalas(self):
        """
        Converts the existing DataFrame into a Koalas DataFrame.

        This method is monkey-patched into Spark's DataFrame and can be used
        to convert a Spark DataFrame into a Koalas DataFrame. If running on
        an existing Koalas DataFrame, the method returns itself.

        If a Koalas DataFrame is converted to a Spark DataFrame and then back
        to Koalas, it will lose the index information and the original index
        will be turned into a normal column.

        See Also
        --------
        DataFrame.to_spark

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df
           col1  col2
        0     1     3
        1     2     4

        >>> spark_df = df.to_spark()
        >>> spark_df
        DataFrame[__index_level_0__: bigint, col1: bigint, col2: bigint]

        >>> kdf = spark_df.to_koalas()
        >>> kdf
           __index_level_0__  col1  col2
        0                  0     1     3
        1                  1     2     4
        """
        if isinstance(self, DataFrame):
            return self
        else:
            return DataFrame(self)

    def to_spark(self):
        """
        Return the current DataFrame as a Spark DataFrame.

        See Also
        --------
        DataFrame.to_koalas
        """
        return self._sdf

    def to_pandas(self):
        """
        Return a Pandas DataFrame.

        .. note:: This method should only be used if the resulting Pandas DataFrame is expected
            to be small, as all the data is loaded into the driver's memory.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.to_pandas()
           dogs  cats
        0   0.2   0.3
        1   0.0   0.6
        2   0.6   0.0
        3   0.2   0.1
        """
        sdf = self._sdf.select(
            ['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({
                field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                for field in sdf.schema
            })
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    # Alias to maintain backward compatibility with Spark
    toPandas = to_pandas

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or callable(v)
                    or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            elif isinstance(c, Column):
                sdf = sdf.withColumn(name, c)
            else:
                sdf = sdf.withColumn(name, F.lit(c))

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields + [
                name for name, _ in pairs
                if name not in self._metadata.column_fields
            ]))
        return DataFrame(sdf, metadata)
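
    # Usage sketch for the method above (assumes the package imports as `ks`,
    # as in the doctests, and that Series arithmetic behaves as in pandas):
    #
    #     kdf = ks.DataFrame({'temp_c': [17.0, 25.0]})
    #     kdf2 = kdf.assign(temp_f=lambda df: df.temp_c * 9 / 5 + 32,  # callable
    #                       temp_k=kdf.temp_c + 273.15,                # Series
    #                       source='sensor-1')                         # scalar -> F.lit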

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self,
               axis=0,
               how='any',
               thresh=None,
               subset=None,
               inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [
                    column for column in columns
                    if column not in self._metadata.column_fields
                ]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            cnt = reduce(lambda x, y: x + y, [
                F.when(self[column].notna()._scol, 1).otherwise(0)
                for column in columns
            ], F.lit(0))
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())

        else:
            raise NotImplementedError(
                "dropna currently only works for axis=0 or axis='index'")

    def fillna(self, value=None, axis=None, inplace=False):
        """Fill NA/NaN values.

        :param value: scalar, dict, or Series
                    Value to use to fill holes. Alternatively, a dict/Series of values
                    specifying which value to use for each column.
                    DataFrame is not supported.
        :param axis: {0 or `index`}
                    1 and `columns` are not supported.
        :param inplace: boolean, default False
                    Fill in place (do not create a new object)
        :return: :class:`DataFrame`

        Examples
        --------
        >>> df = ks.DataFrame({
        ...     'A': [None, 3, None, None],
        ...     'B': [2, 4, None, 3],
        ...     'C': [None, None, None, 1],
        ...     'D': [0, 1, 5, 4]
        ...     })
        >>> df
             A    B    C  D
        0  NaN  2.0  NaN  0
        1  3.0  4.0  NaN  1
        2  NaN  NaN  NaN  5
        3  NaN  3.0  1.0  4

        Replace all NaN elements with 0s.

        >>> df.fillna(0)
             A    B    C  D
        0  0.0  2.0  0.0  0
        1  3.0  4.0  0.0  1
        2  0.0  0.0  0.0  5
        3  0.0  3.0  1.0  4

        Replace all NaN elements in columns 'A', 'B', 'C', and 'D' with 0, 1,
        2, and 3 respectively.

        >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        >>> df.fillna(value=values)
             A    B    C  D
        0  0.0  2.0  2.0  0
        1  3.0  4.0  2.0  1
        2  0.0  1.0  2.0  5
        3  0.0  3.0  1.0  4
        """
        if axis is None:
            axis = 0
        if not (axis == 0 or axis == "index"):
            raise NotImplementedError(
                "fillna currently only works for axis=0 or axis='index'")

        if value is None:
            raise ValueError('Currently must specify value')
        if not isinstance(value, (float, int, str, bool, dict, pd.Series)):
            raise TypeError("Unsupported type %s" % type(value))
        if isinstance(value, pd.Series):
            value = value.to_dict()
        if isinstance(value, dict):
            for v in value.values():
                if not isinstance(v, (float, int, str, bool)):
                    raise TypeError("Unsupported type %s" % type(v))

        sdf = self._sdf.fillna(value)
        if inplace:
            self._sdf = sdf
        else:
            return DataFrame(sdf, self._metadata.copy())

    def head(self, n=5):
        """
        Return the first `n` rows.

        This function returns the first `n` rows for the object based
        on position. It is useful for quickly testing if your object
        has the right type of data in it.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        obj_head : same type as caller
            The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the first 5 lines

        >>> df.head()
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey

        Viewing the first `n` lines (three in this case)

        >>> df.head(3)
              animal
        0  alligator
        1        bee
        2     falcon
        """

        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        """The column labels of the DataFrame."""
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields + [
            self[old_name]._scol.alias(new_name)
            for (old_name, new_name) in zip(old_names, names)
        ])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)
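
    # Renaming through the setter is a single Spark projection: each old column
    # is re-selected as `old.alias(new)` and the metadata is swapped. E.g.:
    #
    #     kdf = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    #     kdf.columns = ['x', 'y']
    #     list(kdf.columns)          # ['x', 'y']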

    @property
    def dtypes(self):
        """Return the dtypes in the DataFrame.

        This returns a Series with the data type of each column. The result's index is the original
        DataFrame's columns. Columns with mixed types are stored with the object dtype.

        :return: :class:`pd.Series` The data type of each column.

        Examples
        --------
        >>> df = ks.DataFrame({'a': list('abc'),
        ...                    'b': list(range(1, 4)),
        ...                    'c': np.arange(3, 6).astype('i1'),
        ...                    'd': np.arange(4.0, 7.0, dtype='float64'),
        ...                    'e': [True, False, True],
        ...                    'f': pd.date_range('20130101', periods=3)})
        >>> df.dtypes
        a            object
        b             int64
        c              int8
        d           float64
        e              bool
        f    datetime64[ns]
        dtype: object
        """
        return pd.Series(
            [self[col].dtype for col in self._metadata.column_fields],
            index=self._metadata.column_fields)

    def count(self):
        """
        Count non-NA cells for each column.

        The values `None`, `NaN` are considered NA.

        Returns
        -------
        pandas.Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = ks.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
          Person   Age  Single
        0   John  24.0   False
        1   Myla   NaN    True
        2  Lewis  21.0    True
        3   John  33.0    True
        4   Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64
        """
        return self._reduce_for_stat_function(_Frame._count_expr)

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column not in labels
                ])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column != labels
                ])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self,
                    by,
                    ascending=True,
                    inplace=False,
                    na_position='last'):
        """
        Sort by the values along either axis.

        Parameters
        ----------
        by : str or list of str
        ascending : bool or list of bool, default True
             Sort ascending vs. descending. Specify a list for multiple sort
             orders. If this is a list of bools, it must match the length of
             `by`.
        inplace : bool, default False
             if True, perform operation in-place
        na_position : {'first', 'last'}, default 'last'
             `first` puts NaNs at the beginning, `last` puts NaNs at the end

        Returns
        -------
        sorted_obj : DataFrame

        Examples
        --------
        >>> df = ks.DataFrame({
        ...     'col1': ['A', 'A', 'B', None, 'D', 'C'],
        ...     'col2': [2, 1, 9, 8, 7, 4],
        ...     'col3': [0, 1, 9, 4, 2, 3],
        ... })
        >>> df
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        3  None     8     4
        4     D     7     2
        5     C     4     3

        Sort by col1

        >>> df.sort_values(by=['col1'])
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4


        Sort by multiple columns

        >>> df.sort_values(by=['col1', 'col2'])
           col1  col2  col3
        1     A     1     1
        0     A     2     0
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4

        Sort Descending

        >>> df.sort_values(by='col1', ascending=False)
           col1  col2  col3
        4     D     7     2
        5     C     4     3
        2     B     9     9
        0     A     2     0
        1     A     1     1
        3  None     8     4
        """
        if isinstance(by, string_types):
            by = [by]
        if isinstance(ascending, bool):
            ascending = [ascending] * len(by)
        if len(ascending) != len(by):
            raise ValueError(
                'Length of ascending ({}) != length of by ({})'.format(
                    len(ascending), len(by)))
        if na_position not in ('first', 'last'):
            raise ValueError("invalid na_position: '{}'".format(na_position))

        # Mapper: Get a spark column function for (ascending, na_position) combination
        # Note that 'asc_nulls_first' and friends were added as of Spark 2.4, see SPARK-23847.
        mapper = {
            (True, 'first'): lambda x: Column(getattr(x._jc, "asc_nulls_first")()),
            (True, 'last'): lambda x: Column(getattr(x._jc, "asc_nulls_last")()),
            (False, 'first'): lambda x: Column(getattr(x._jc, "desc_nulls_first")()),
            (False, 'last'): lambda x: Column(getattr(x._jc, "desc_nulls_last")()),
        }
        by = [
            mapper[(asc, na_position)](self[colname]._scol)
            for colname, asc in zip(by, ascending)
        ]
        kdf = DataFrame(self._sdf.sort(*by), self._metadata.copy())
        if inplace:
            self._sdf = kdf._sdf
            self._metadata = kdf._metadata
        else:
            return kdf
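
    # The `_jc` indirection above is only needed on Spark < 2.4; from 2.4 on,
    # the nulls-ordering helpers have Python wrappers (SPARK-23847), e.g.:
    #
    #     from pyspark.sql import functions as F
    #     sdf.sort(F.col('col1').asc_nulls_last())   # ascending, NaNs last
    #
    # (a sketch against a plain Spark DataFrame `sdf`, not part of this class)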

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword '
                                 'argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)
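
    # Sketch of both calling forms (hypothetical helpers, not original code):
    #
    #     def add_total(df, value):            # plain form: frame goes first
    #         return df.assign(total=df.a + value)
    #
    #     def add_total_kw(value, df=None):    # tuple form: frame goes to 'df'
    #         return df.assign(total=df.a + value)
    #
    #     kdf = ks.DataFrame({'a': [3, 4]})
    #     kdf.pipe(add_total, 10)
    #     kdf.pipe((add_total_kw, 'df'), 10)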

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self,
                              self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self,
                          self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)
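
    # The dispatch above covers three common pandas idioms (usage sketch; the
    # last line assumes Series comparisons yield a boolean column):
    #
    #     kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    #     kdf['a']            # str key    -> Series
    #     kdf[['a', 'b']]     # list key   -> projection via .loc
    #     kdf[kdf['a'] > 1]   # Series key -> cast to boolean, used as a filter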

    def __repr__(self):
        return repr(self.toPandas())

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series
        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"
        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(
                **{k: value[c]
                   for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})

        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith(
                "_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self,
                      self._metadata.index_info)
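
    # One consequence of this fallback: attribute access resolves to the same
    # Series as item access, as long as the column name is not shadowed:
    #
    #     kdf = ks.DataFrame({'price': [10, 20]})
    #     kdf.price           # via __getattr__
    #     kdf['price']        # via __getitem__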

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
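
A quick reference for the axis normalization above (a sketch, assuming the
DataFrame class just defined is in scope):

    DataFrame._validate_axis('index')    # -> 0
    DataFrame._validate_axis('columns')  # -> 1
    DataFrame._validate_axis(None)       # -> 0
    DataFrame._validate_axis(2)          # raises ValueError: No axis named 2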
Example No. 13
    def to_dataframe(self):
        sdf = self._kdf._sdf.select([field for field, _ in self._index_map] + [self._scol])
        metadata = Metadata(data_columns=[sdf.schema[-1].name], index_map=self._index_map)
        return DataFrame(sdf, metadata)
Example No. 14
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : dict
             a dict mapping from column name (string) to aggregate functions (string).

        Returns
        -------
        Series or DataFrame

            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return Series or DataFrame.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        Examples
        --------

        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]})
        >>> df = df[['A', 'B', 'C']]

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        """
        if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, string_types) and isinstance(value, string_types)
                    for key, value in func_or_funcs.items()):
            raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                             "functions (string).")

        sdf = self._kdf._sdf
        groupkeys = self._groupkeys
        groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                         for i, s in enumerate(groupkeys)]
        reordered = [F.expr('{1}({0}) as {0}'.format(key, value))
                     for key, value in func_or_funcs.items()]
        sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
        metadata = Metadata(column_fields=[key for key, _ in func_or_funcs.items()],
                            index_info=[('__index_level_{}__'.format(i), s.name)
                                        for i, s in enumerate(groupkeys)])
        return DataFrame(sdf, metadata)
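
The '{1}({0}) as {0}' template above renders each dict entry as a SQL
aggregate expression before handing it to F.expr. In isolation (a minimal
sketch, independent of the class):

    from pyspark.sql import functions as F

    func_or_funcs = {'B': 'min', 'C': 'sum'}
    exprs = ['{1}({0}) as {0}'.format(key, value)
             for key, value in func_or_funcs.items()]
    # exprs == ['min(B) as B', 'sum(C) as C']
    agg_cols = [F.expr(e) for e in exprs]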
Example No. 15
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to a Pandas DataFrame logically. This holds a Spark
    DataFrame internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """
    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 dtype=None,
                 copy=False):
        pdf = pd.DataFrame(data=data,
                           index=index,
                           columns=columns,
                           dtype=dtype,
                           copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        self._init_from_spark(default_session().createDataFrame(reset_index),
                              metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(
                column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [
            self._sdf.__getitem__(field)
            for field in self._metadata.index_fields
        ]

    def _reduce_for_stat_function(self, sfun):
        sdf = self._sdf.select(
            [sfun(self._sdf[col]).alias(col) for col in self.columns])
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    @derived_from(pd.DataFrame)
    def to_html(self,
                buf=None,
                columns=None,
                col_space=None,
                header=True,
                index=True,
                na_rep='NaN',
                formatters=None,
                float_format=None,
                sparsify=None,
                index_names=True,
                justify=None,
                max_rows=None,
                max_cols=None,
                show_dimensions=False,
                decimal='.',
                bold_rows=True,
                classes=None,
                escape=True,
                notebook=False,
                border=None,
                table_id=None,
                render_links=False):
        return self.toPandas().to_html(buf=buf,
                                       columns=columns,
                                       col_space=col_space,
                                       header=header,
                                       index=index,
                                       na_rep=na_rep,
                                       formatters=formatters,
                                       float_format=float_format,
                                       sparsify=sparsify,
                                       index_names=index_names,
                                       justify=justify,
                                       max_rows=max_rows,
                                       max_cols=max_cols,
                                       show_dimensions=show_dimensions,
                                       decimal=decimal,
                                       bold_rows=bold_rows,
                                       classes=classes,
                                       escape=escape,
                                       notebook=notebook,
                                       border=border,
                                       table_id=table_id,
                                       render_links=render_links)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError(
                'Currently supported only when the DataFrame has a single index.'
            )
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [
                column for column in self._metadata.column_fields
                if column not in keys
            ]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column)
                                                      for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns,
                                       index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf
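
    # set_index never moves data; it only rewrites the metadata. Usage sketch:
    #
    #     kdf = ks.DataFrame({'month': [1, 4], 'sale': [55, 40]})
    #     kdf.set_index('month')               # indexed by 'month', column dropped
    #     kdf.set_index('month', drop=False)   # indexed, but column kept as well
    #     kdf.set_index(['month', 'sale'])     # several keys -> several levels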

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information in
        the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None.
        For a standard index, the index name will be used (if set), otherwise a default 'index' or
        'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by default
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index to the
                     default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError(
                'Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [
                (column, name if name is not None else rename(i))
                for i, (column, name) in enumerate(self._metadata.index_info)
            ]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError(
                            'Too many levels: Index has only {} level, not {}'.
                            format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level {} not found'.format(l))
                        else:
                            raise KeyError(
                                'Level {} must be same as name ({})'.format(
                                    l, self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field, index_name if index_name
                                      is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name
                   for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf
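
    # And the inverse direction, again a metadata rewrite plus a rename (sketch):
    #
    #     kdf = ks.DataFrame({'val': [1, 2], 'x': [3, 4]}).set_index('val')
    #     kdf.reset_index()              # 'val' moves back into the columns
    #     kdf.reset_index(drop=True)     # the index is discarded instead
    #     kdf.reset_index(level='val')   # remove only the named level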

    @derived_from(pd.DataFrame)
    def isnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    @derived_from(pd.DataFrame)
    def notnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    @derived_from(spark.DataFrame)
    def toPandas(self):
        sdf = self._sdf.select(
            ['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({
                field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                for field in sdf.schema
            })
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or callable(v)
                    or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            elif isinstance(c, spark.Column):
                sdf = sdf.withColumn(name, c)
            else:
                # Scalars pass the type check above, so wrap them in a literal.
                sdf = sdf.withColumn(name, F.lit(c))

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields + [
                name for name, _ in pairs
                if name not in self._metadata.column_fields
            ]))
        return DataFrame(sdf, metadata)

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self,
               axis=0,
               how='any',
               thresh=None,
               subset=None,
               inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [
                    column for column in columns
                    if column not in self._metadata.column_fields
                ]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            cnt = reduce(lambda x, y: x + y, [
                F.when(self[column].notna()._scol, 1).otherwise(0)
                for column in columns
            ], F.lit(0))
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())

        else:
            raise NotImplementedError(
                "dropna currently only works for axis=0 or axis='index'")

    def head(self, n=5):
        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields + [
            self[old_name]._scol.alias(new_name)
            for (old_name, new_name) in zip(old_names, names)
        ])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)

    @derived_from(pd.DataFrame, ua_args=['axis', 'level', 'numeric_only'])
    def count(self):
        return self._sdf.count()

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column not in labels
                ])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(column_fields=[
                    column for column in self._metadata.column_fields
                    if column != labels
                ])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self, by):
        return DataFrame(self._sdf.sort(by), self._metadata.copy())

    def groupby(self, by):
        from databricks.koalas.groups import PandasLikeGroupBy
        gp = self._sdf.groupby(by)
        return PandasLikeGroupBy(self, gp, None)

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword '
                                 'argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)

    @property
    def shape(self):
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self,
                              self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self,
                          self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series
        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"
        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(
                **{k: value[c]
                   for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})

        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith(
                "_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self,
                      self._metadata.index_info)

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
Example No. 16
    def _metadata(self):
        if not hasattr(self, '_pandas_metadata') or self._pandas_metadata is None:
            self._pandas_metadata = Metadata(
                column_fields=self.schema.fieldNames())
        return self._pandas_metadata
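
Example No. 16 is the monkey-patched side of the bridge: a lazily cached
property that gives a plain Spark DataFrame a default Metadata on first use.
The attachment itself is not shown in these examples; presumably it looks
something like this sketch (names taken from the snippet above; the property()
wiring is an assumption):

    from pyspark.sql import DataFrame as SparkDataFrame

    def _metadata(self):
        # Compute once on first access, then reuse the cached value.
        if not hasattr(self, '_pandas_metadata') or self._pandas_metadata is None:
            self._pandas_metadata = Metadata(
                column_fields=self.schema.fieldNames())
        return self._pandas_metadata

    SparkDataFrame._metadata = property(_metadata)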