Example #1
    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self,
                              self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self,
                          self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)
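As a rough usage sketch (assuming `import databricks.koalas as ks` and a running Spark session; not part of the original example), the key types handled above correspond to the familiar pandas-style calls:

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

kdf['a']            # str key: a single column returned as a Series
kdf[['a', 'b']]     # list key: delegated to .loc[:, key]
kdf[kdf['a'] > 1]   # Series key: treated as a boolean row filter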
Example #2
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond, limit = self._select_rows(rows_sel)
        column_index, columns, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)
                if limit is not None:
                    if limit >= 0:
                        sdf = sdf.limit(limit)
                    else:
                        sdf = sdf.limit(sdf.count() + limit)

                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf
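A minimal sketch of `.loc` calls that exercise the row and column selection paths above (assumed usage, same Spark/ks assumptions as before):

import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}, index=['a', 'b', 'c'])

kdf.loc['a':'b', 'x']            # row slice plus a single column label
kdf.loc[['a', 'c'], ['x', 'y']]  # list of row labels plus a list of column labels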
Example #3
    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self, self._metadata.index_info)
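For orientation, a short sketch of the attribute-style column access this `__getattr__` enables (assumed usage, not from the source):

import databricks.koalas as ks

kdf = ks.DataFrame({'dogs': [0.2, 0.0], 'cats': [0.3, 0.6]})
kdf.dogs   # unknown attribute falls through to __getattr__ and comes back as a column Series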
Example #4
    def to_series(self, name: str = None) -> Series:
        """
        Create a Series with both index and values equal to the index keys,
        useful with map for returning an indexer based on an index.

        Parameters
        ----------
        name : string, optional
            name of resulting Series. If None, defaults to name of original
            index

        Returns
        -------
        Series : dtype will be based on the type of the Index values.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'],
        ...                   index=list('abcd'))
        >>> df['dogs'].index.to_series()
        a    a
        b    b
        c    c
        d    d
        Name: __index_level_0__, dtype: object
        """
        kdf = self._kdf
        scol = self._scol
        return Series(scol if name is None else scol.alias(name),
                      anchor=kdf,
                      index=kdf._metadata.index_map)
Example #5
    def _transform_batch(self, func, return_schema):
        from databricks.koalas.series import Series
        from databricks import koalas as ks

        if not isinstance(func, types.FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_schema is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ks.get_option("compute.shortcut_limit")
            pser = self._kser.head(limit)._to_internal_pandas()
            transformed = pser.transform(func)
            kser = Series(transformed)
            spark_return_type = kser.spark.data_type
        else:
            spark_return_type = return_schema

        pudf = pandas_udf(func,
                          returnType=spark_return_type,
                          functionType=PandasUDFType.SCALAR)
        return self._kser._with_new_scol(scol=pudf(self._kser.spark.column))
Example #6
def from_pandas(
        pobj: Union['pd.DataFrame',
                    'pd.Series']) -> Union['Series', 'DataFrame']:
    """Create a Koalas DataFrame or Series from a pandas DataFrame or Series.

    This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame,
    but this also works with pandas Series and picks the index.

    Parameters
    ----------
    pobj : pandas.DataFrame or pandas.Series
        pandas DataFrame or Series to read.

    Returns
    -------
    Series or DataFrame
        If a pandas Series is passed in, this function returns a Koalas Series.
        If a pandas DataFrame is passed in, this function returns a Koalas DataFrame.
    """
    if isinstance(pobj, pd.Series):
        return Series(pobj)
    elif isinstance(pobj, pd.DataFrame):
        return DataFrame(pobj)
    else:
        raise ValueError("Unknown data type: {}".format(type(pobj)))
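A hedged usage sketch of `from_pandas` (assuming a running Spark session):

import pandas as pd
import databricks.koalas as ks

kser = ks.from_pandas(pd.Series([1, 2, 3], index=['a', 'b', 'c']))   # -> Koalas Series, index preserved
kdf = ks.from_pandas(pd.DataFrame({'x': [1, 2]}, index=[10, 20]))    # -> Koalas DataFrame, index preserved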
Example #7
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError('Currently supported only when the DataFrame has a single index.')
        return Series(self._index_columns[0], self, [])
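A small sketch of how this property is reached (assumed usage, not from the source):

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2]}, index=['x', 'y'])
kdf.index   # single-level index: returns the row labels
# In this early implementation, a DataFrame with more than one index level raises KeyError.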
Example #8
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series, _col

        if (not isinstance(key, tuple)) or (len(key) != 2):
            raise SparkPandasNotImplementedError(
                description="Only accepts pairs of candidates",
                pandas_function=".loc[..., ...] = ...",
                spark_target_function="withColumn, select")

        rows_sel, cols_sel = key

        if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
            if isinstance(rows_sel, list):
                if isinstance(cols_sel, str):
                    cols_sel = [cols_sel]
                kdf = self._kdf.copy()
                for col_sel in cols_sel:
                    # Uses `kdf` to allow operations on different DataFrames.
                    # TODO: avoid temp column name or declare `__` prefix is
                    #  reserved for Koalas' internal columns.
                    kdf["__indexing_temp_col__"] = value
                    new_col = kdf["__indexing_temp_col__"]._scol
                    kdf[col_sel] = Series(
                        kdf[col_sel]._internal.copy(scol=F.when(
                            kdf._internal.index_scols[0].isin(rows_sel),
                            new_col).otherwise(kdf[col_sel]._scol)),
                        anchor=kdf)
                    kdf = kdf.drop(labels=['__indexing_temp_col__'])

                self._kdf._internal = kdf._internal.copy()
            else:
                raise SparkPandasNotImplementedError(
                    description=
                    """Can only assign value to the whole dataframe, the row index
                    has to be `slice(None)` or `:`""",
                    pandas_function=".loc[..., ...] = ...",
                    spark_target_function="withColumn, select")

        if not isinstance(cols_sel, (str, list)):
            raise ValueError(
                """only column names or list of column names can be assigned"""
            )

        if isinstance(value, DataFrame):
            if len(value.columns) == 1:
                self._kdf[cols_sel] = _col(value)
            else:
                raise ValueError(
                    "Only a dataframe with one column can be assigned")
        else:
            if isinstance(cols_sel, str):
                cols_sel = [cols_sel]
            if (not isinstance(rows_sel, list)) and (isinstance(
                    cols_sel, list)):
                for col_sel in cols_sel:
                    self._kdf[col_sel] = value
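A hedged sketch of the `.loc` assignments this `__setitem__` accepts (assumed usage, not from the source):

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 2])

kdf.loc[:, 'b'] = 0          # full-slice row selector: plain column assignment
kdf.loc[[0, 2], 'a'] = 99    # list of row labels: routed through the temporary-column path above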
Example #9
    def _transform_batch(self, func, return_type: Optional[Union[SeriesType, ScalarType]]):
        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.series import Series, first_series
        from databricks import koalas as ks

        if not isinstance(func, types.FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_type is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ks.get_option("compute.shortcut_limit")
            pser = self._kser.head(limit + 1)._to_internal_pandas()
            transformed = pser.transform(func)
            kser = Series(transformed)  # type: Series
            spark_return_type = force_decimal_precision_scale(
                as_nullable_spark_type(kser.spark.data_type)
            )
            dtype = kser.dtype
        else:
            spark_return_type = return_type.spark_type
            dtype = return_type.dtype

        kdf = self._kser.to_frame()
        columns = kdf._internal.spark_column_names

        def pandas_concat(series):
            # The input can only be a DataFrame for struct from Spark 3.0.
            # This works around to make the input as a frame. See SPARK-27240
            pdf = pd.concat(series, axis=1)
            pdf.columns = columns
            return pdf

        def apply_func(pdf):
            return func(first_series(pdf)).to_frame()

        return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
        output_func = GroupBy._make_pandas_df_builder_func(
            kdf, apply_func, return_schema, retain_index=False
        )

        pudf = pandas_udf(
            lambda *series: first_series(output_func(pandas_concat(series))),
            returnType=spark_return_type,
            functionType=PandasUDFType.SCALAR,
        )

        return self._kser._with_new_scol(
            scol=pudf(*kdf._internal.spark_columns).alias(
                self._kser._internal.spark_column_names[0]
            ),
            dtype=dtype,
        )
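Hedged usage sketch: in the Koalas releases this snippet appears to come from, the public entry point is the `Series.koalas.transform_batch` accessor; when the function carries a return-type hint, that hint supplies `return_type` and the schema-inference branch above is skipped:

import databricks.koalas as ks

kser = ks.Series([1, 2, 3])

kser.koalas.transform_batch(lambda pser: pser + 1)   # no hint: schema inferred from a head() sample

def plus_one(pser) -> ks.Series[int]:                # explicit hint: no inference step
    return pser + 1

kser.koalas.transform_batch(plus_one)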
Example #10
def from_pandas(pdf):
    """Create DataFrame from pandas DataFrame.

    This is similar to `DataFrame.createDataFrame()` with pandas DataFrame, but this also picks
    the index in the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return Series(pdf)
    elif isinstance(pdf, pd.DataFrame):
        return DataFrame(pdf)
    else:
        raise ValueError("Unknown data type: {}".format(type(pdf)))
Example #11
def _spark_col_apply(kdf_or_ks, sfun):
    """
    Applies a function to all cells of a DataFrame, the function being a known SQL function.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series
    if isinstance(kdf_or_ks, Series):
        ks = kdf_or_ks
        return Series(ks._kdf._internal.copy(scol=sfun(kdf_or_ks._scol)), anchor=ks._kdf)
    assert isinstance(kdf_or_ks, DataFrame)
    kdf = kdf_or_ks
    sdf = kdf._sdf
    sdf = sdf.select([sfun(sdf[col]).alias(col) for col in kdf.columns])
    return DataFrame(sdf)
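A hypothetical call to this internal helper, just to illustrate the two branches (the call itself is illustrative; this is not a public API):

from pyspark.sql import functions as F
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [-1.0, 2.0], 'b': [3.0, -4.0]})

abs_kdf = _spark_col_apply(kdf, F.abs)         # DataFrame in -> DataFrame out, F.abs applied per column
abs_kser = _spark_col_apply(kdf['a'], F.abs)   # Series in -> Series out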
Example #12
def to_datetime(arg, errors='raise', format=None, infer_datetime_format=False):
    if isinstance(arg, Series):
        return Series(_to_datetime1(
            arg._scol,
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format), arg._kdf, arg._index_info)
    if isinstance(arg, DataFrame):
        return Series(_to_datetime2(
            arg_year=arg['year']._scol,
            arg_month=arg['month']._scol,
            arg_day=arg['day']._scol,
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format), arg, arg._metadata.index_info)
    if isinstance(arg, dict):
        return _to_datetime2(
            arg_year=arg['year'],
            arg_month=arg['month'],
            arg_day=arg['day'],
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format)
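A minimal sketch of the Series and DataFrame dispatch branches in user code (assumed API, mirroring pandas' `to_datetime`):

import pandas as pd
import databricks.koalas as ks

kdf = ks.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
ks.to_datetime(kdf)                       # DataFrame branch: assembles year/month/day columns

kser = ks.from_pandas(pd.Series(['2020-01-01', '2020-01-02']))
ks.to_datetime(kser, format='%Y-%m-%d')   # Series branch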
Example #13
def align_diff_series(func, this_series, *args, how="full"):
    from databricks.koalas.base import IndexOpsMixin
    from databricks.koalas.series import Series

    cols = [arg for arg in args if isinstance(arg, IndexOpsMixin)]
    combined = combine_frames(this_series.to_frame(), *cols, how=how)

    scol = func(combined["this"]._internal.data_spark_columns[0],
                *combined["that"]._internal.data_spark_columns)

    return Series(
        combined._internal.copy(
            spark_column=scol,
            column_labels=this_series._internal.column_labels),
        anchor=combined,
    )
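This alignment path is typically reached when operating on Series anchored to different DataFrames, which Koalas only allows behind an option; a hedged sketch:

import databricks.koalas as ks

ks.set_option('compute.ops_on_diff_frames', True)

kdf1 = ks.DataFrame({'a': [1, 2, 3]})
kdf2 = ks.DataFrame({'a': [10, 20, 30]})
kdf1['a'] + kdf2['a']   # different anchors: the frames are joined via combine_frames above

ks.reset_option('compute.ops_on_diff_frames')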
Example #14
def _auto_patch_pandas():
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)

        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)
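The effect of the patch, sketched under the assumption that it has run on Python 3.7+: subscripting the pandas classes now delegates to the Koalas `__class_getitem__`, so pandas types can appear in return-type annotations:

import pandas as pd

hint = pd.Series[int]   # no longer raises; resolved through Koalas' Series.__class_getitem__

def plus_one(x) -> pd.Series[int]:   # usable as a Koalas-style return-type hint (assumption)
    return x + 1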
Example #15
    def to_series(self, name: Union[str, Tuple[str, ...]] = None) -> Series:
        """
        Create a Series with both index and values equal to the index keys,
        useful with map for returning an indexer based on an index.

        Parameters
        ----------
        name : string, optional
            name of resulting Series. If None, defaults to name of original
            index

        Returns
        -------
        Series : dtype will be based on the type of the Index values.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'],
        ...                   index=list('abcd'))
        >>> df['dogs'].index.to_series()
        a    a
        b    b
        c    c
        d    d
        Name: 0, dtype: object
        """
        kdf = self._kdf
        scol = self._scol
        if name is not None:
            scol = scol.alias(name_like_string(name))
        column_index = [
            None
        ] if len(kdf._internal.index_map) > 1 else kdf._internal.index_names
        return Series(kdf._internal.copy(scol=scol,
                                         column_index=column_index,
                                         column_index_names=None),
                      anchor=kdf)
Example #16
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(
                    self._internal.copy(scol=column_scols[0],
                                        column_labels=[column_labels[0]]),
                    anchor=self._kdf_or_kser,
                )

        if remaining_index is not None:
            index_scols = self._internal.index_scols[-remaining_index:]
            index_map = self._internal.index_map[-remaining_index:]
        else:
            index_scols = self._internal.index_scols
            index_map = self._internal.index_map

        if self._internal.column_label_names is None:
            column_label_names = None
        else:
            # Manage column index names
            level = column_labels_level(column_labels)
            column_label_names = self._internal.column_label_names[-level:]

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(index_scols + column_scols)
        except AnalysisException:
            raise KeyError("[{}] don't exist in columns".format(
                [col._jc.toString() for col in column_scols]))

        internal = _InternalFrame(
            sdf=sdf,
            index_map=index_map,
            column_labels=column_labels,
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
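A short sketch highlighting the `remaining_index` handling in this version (assumed usage):

import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])

kdf.loc[['a', 'b']]   # row selection only: still a DataFrame
kdf.loc['a', 'x']     # all index levels pinned: remaining_index == 0, so the scalar 1 comes back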
Example #17
def _make_fun(f: typing.Callable, return_type: types.DataType, *args,
              **kwargs) -> 'ks.Series':
    """
    This function calls the function f while taking into account some of the
    limitations of the pandas UDF support:
    - support for keyword arguments
    - support for scalar values (as long as they are picklable)
    - support for type hints and input checks.
    :param f: the function to call. It is expected to have field annotations (see below).
    :param return_type: the return type
    :param args: the arguments of the function
    :param kwargs: the kwargs to pass to the function
    :return: the value of executing the function: f(*args, **kwargs)

    The way this function executes depends on what is provided as arguments:
     - if one of the arguments is a koalas series or dataframe:
        - the function is wrapped as a Spark UDF
        - the series arguments are checked to be coming from the same original anchor
        - the non-series arguments are serialized into the spark UDF.

    The function is expected to have the following arguments:
    """
    from databricks.koalas.series import Series
    # All the arguments.
    # None for columns or the value for non-columns
    frozen_args = []  # type: typing.List[typing.Any]
    # ks.Series for columns or None for the non-columns
    col_args = []  # type: typing.List[typing.Optional[Series]]
    for arg in args:
        if isinstance(arg, Series):
            frozen_args.append(None)
            col_args.append(arg)
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_args.append(arg)
            col_args.append(None)

    # Value is none for kwargs that are columns, and the value otherwise
    frozen_kwargs = []  # type: typing.List[typing.Tuple[str, typing.Any]]
    # Value is a spark col for kwarg that is column, and None otherwise
    col_kwargs = []  # type: typing.List[typing.Tuple[str, Series]]
    for (key, arg) in kwargs.items():
        if isinstance(arg, Series):
            col_kwargs.append((key, arg))
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_kwargs.append((key, arg))

    col_args_idxs = [idx for (idx, c) in enumerate(col_args) if c is not None]
    all_indexes = (col_args_idxs + [key for (key, _) in col_kwargs]
                   )  # type: ignore
    if not all_indexes:
        # No argument is related to spark
        # The function is just called through without other considerations.
        return f(*args, **kwargs)

    # We detected some columns. They need to be wrapped in a UDF to spark.
    kdf = _get_kdf(args, kwargs)

    def clean_fun(*args2):
        assert len(args2) == len(all_indexes), \
            "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
        full_args = list(frozen_args)
        full_kwargs = dict(frozen_kwargs)
        for (arg, idx) in zip(args2, all_indexes):
            if isinstance(idx, int):
                full_args[idx] = arg
            else:
                assert isinstance(idx, str), str(idx)
                full_kwargs[idx] = arg
        return f(*full_args, **full_kwargs)

    wrapped_udf = pandas_udf(clean_fun, returnType=return_type)
    name_tokens = []
    spark_col_args = []
    for col in col_args:
        if col is not None:
            spark_col_args.append(col._scol)
            name_tokens.append(col.name)
    kw_name_tokens = []
    for (key, col) in col_kwargs:
        spark_col_args.append(col._scol)
        kw_name_tokens.append("{}={}".format(key, col.name))
    col = wrapped_udf(*spark_col_args)
    series = Series(kdf._internal.copy(scol=col), anchor=kdf)
    all_name_tokens = name_tokens + sorted(kw_name_tokens)
    name = "{}({})".format(f.__name__, ", ".join(all_name_tokens))
    series = series.astype(return_type).alias(name)
    return series
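A hypothetical direct call to this helper, to show how the argument classification above plays out (`add`, `kdf`, and the call itself are illustrative; this is not a public entry point):

from pyspark.sql.types import LongType
import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})

def add(a, b, offset=0):
    return a + b + offset

# The two Series become pandas-UDF inputs; the plain keyword argument is frozen
# into the wrapped function instead of being shipped as a column.
result = _make_fun(add, LongType(), kdf['x'], kdf['y'], offset=1)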
Example #18
    def _rank(self, *args, **kwargs):
        groupkey_scols = [s._scol for s in self._groupkeys]
        return Series._rank(self._ks,
                            *args,
                            **kwargs,
                            part_cols=groupkey_scols)
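User-facing sketch of where this grouped `_rank` shows up (assumed usage):

import databricks.koalas as ks

kdf = ks.DataFrame({'k': ['a', 'a', 'b'], 'v': [1, 3, 2]})
kdf.groupby('k')['v'].rank()   # ranks computed within each group, partitioned by the group-key columns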
Example #19
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, tuple):
                if len(key) > 1:
                    raise SparkPandasIndexingError('Too many indexers')
                key = key[0]

            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf['__temp_col__'] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

            cond, limit = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_index = self._internal.column_index
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf['__temp_col__'] = rows_sel
                return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit = self._select_rows(rows_sel)
            column_index, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(self._internal.copy(
                    scol=column_scols[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(self._internal.index_scols + column_scols)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Manage column index names
                level = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-level:]

            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in column_scols]))

        if returns_series:
            return Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            return kdf
Example #20
def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) -> 'ks.Series':
    """
    This function calls the function f while taking into account some of the
    limitations of the pandas UDF support:
    - support for keyword arguments
    - support for scalar values (as long as they are picklable)
    - support for type hints and input checks.
    :param f: the function to call. It is expected to have field annotations (see below).
    :param return_type: the return type
    :param args: the arguments of the function
    :param kwargs: the kwargs to pass to the function
    :return: the value of executing the function: f(*args, **kwargs)

    The way this function executes depends on what is provided as arguments:
     - if one of the arguments is a koalas series or dataframe:
        - the function is wrapped as a Spark UDF
        - the series arguments are checked to be coming from the same original anchor
        - the non-series arguments are serialized into the spark UDF.

    The function is expected to have the following arguments:
    """
    from databricks.koalas.series import Series
    # All the arguments.
    # None for columns or the value for non-columns
    frozen_args = []  # type: typing.List[typing.Any]
    # ks.Series for columns or None for the non-columns
    col_args = []  # type: typing.List[typing.Optional[Series]]
    for arg in args:
        if isinstance(arg, Series):
            frozen_args.append(None)
            col_args.append(arg)
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_args.append(arg)
            col_args.append(None)

    # Value is none for kwargs that are columns, and the value otherwise
    frozen_kwargs = []  # type: typing.List[typing.Tuple[str, typing.Any]]
    # Value is a spark col for kwarg that is column, and None otherwise
    col_kwargs = []  # type: typing.List[typing.Tuple[str, Series]]
    for (key, arg) in kwargs.items():
        if isinstance(arg, Series):
            col_kwargs.append((key, arg))
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_kwargs.append((key, arg))

    col_args_idxs = [idx for (idx, c) in enumerate(col_args) if c is not None]
    all_indexes = (col_args_idxs + [key for (key, _) in col_kwargs])  # type: ignore
    if not all_indexes:
        # No argument is related to spark
        # The function is just called through without other considerations.
        return f(*args, **kwargs)

    # We detected some columns. They need to be wrapped in a UDF to spark.
    (index_map, kdf) = _get_metadata(args, kwargs)

    def clean_fun(*args2):
        assert len(args2) == len(all_indexes), \
            "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
        full_args = list(frozen_args)
        full_kwargs = dict(frozen_kwargs)
        for (arg, idx) in zip(args2, all_indexes):
            if isinstance(idx, int):
                full_args[idx] = arg
            else:
                assert isinstance(idx, str), str(idx)
                full_kwargs[idx] = arg
        return f(*full_args, **full_kwargs)

    wrapped_udf = pandas_udf(clean_fun, returnType=return_type)
    name_tokens = []
    spark_col_args = []
    for col in col_args:
        if col is not None:
            spark_col_args.append(col._scol)
            name_tokens.append(col.name)
    kw_name_tokens = []
    for (key, col) in col_kwargs:
        spark_col_args.append(col._scol)
        kw_name_tokens.append("{}={}".format(key, col.name))
    col = wrapped_udf(*spark_col_args)
    series = Series(data=col, index=index_map, anchor=kdf)
    all_name_tokens = name_tokens + sorted(kw_name_tokens)
    name = "{}({})".format(f.__name__, ", ".join(all_name_tokens))
    series = series.astype(return_type).alias(name)
    return series
Example #21
    def _cum(self, func):
        groupkey_scols = [s._scol for s in self._groupkeys]
        return Series._cum(self._ks, func, True, part_cols=groupkey_scols)
Example #22
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            data_spark_columns = self._internal.data_spark_columns
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError("Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                    list(self._kdf_or_kser.columns)
                ]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel)

            if cond is None and limit is None and returns_series:
                return self._kdf_or_kser._kser_for(column_labels[0])

        if remaining_index is not None:
            index_scols = self._internal.index_spark_columns[-remaining_index:]
            index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:])
        else:
            index_scols = self._internal.index_spark_columns
            index_map = self._internal.index_map

        if len(column_labels) > 0:
            column_labels = column_labels.copy()
            column_labels_level = max(
                len(label) if label is not None else 1 for label in column_labels
            )
            none_column = 0
            for i, label in enumerate(column_labels):
                if label is None:
                    label = (str(none_column),)
                    none_column += 1
                if len(label) < column_labels_level:
                    label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
                column_labels[i] = label

            if self._internal.column_label_names is None:
                column_label_names = None
            else:
                # Manage column index names
                column_label_names = self._internal.column_label_names[-column_labels_level:]
        else:
            column_label_names = None

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            data_columns = sdf.select(data_spark_columns).columns
            sdf = sdf.select(index_scols + data_spark_columns)
        except AnalysisException:
            raise KeyError(
                "[{}] don't exist in columns".format(
                    [col._jc.toString() for col in data_spark_columns]
                )
            )

        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=index_map,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]), anchor=kdf
            )
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
Example #23
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond = self._select_rows(rows_sel)

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise LocIndexer._raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        returns_series = cols_sel is not None and isinstance(cols_sel, spark.Column)
        if cols_sel is None:
            column_index = self._internal.column_index
            columns = self._internal.column_scols
        elif isinstance(cols_sel, (str, tuple)):
            if isinstance(cols_sel, str):
                cols_sel = (cols_sel,)
            column_index, columns, returns_series = \
                self._get_from_multiindex_column(cols_sel)
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
            column_index = None
        elif all(isinstance(key, Series) for key in cols_sel):
            columns = [_make_col(key) for key in cols_sel]
            column_index = [key._internal.column_index[0] for key in cols_sel]
        elif all(isinstance(key, spark.Column) for key in cols_sel):
            columns = cols_sel
            column_index = None
        elif (any(isinstance(key, str) for key in cols_sel)
              and any(isinstance(key, tuple) for key in cols_sel)):
            raise TypeError('Expected tuple, got str')
        else:
            if all(isinstance(key, tuple) for key in cols_sel):
                level = self._internal.column_index_level
                if any(len(key) != level for key in cols_sel):
                    raise ValueError('All the key level should be the same as column index level.')

            column_to_index = list(zip(self._internal.data_columns,
                                       self._internal.column_index))
            columns = []
            column_index = []
            for key in cols_sel:
                found = False
                for column, idx in column_to_index:
                    if idx == key or idx[0] == key:
                        columns.append(_make_col(column))
                        column_index.append(idx)
                        found = True
                if not found:
                    raise KeyError("['{}'] not in index".format(key))

        if cond is None and returns_series:
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)

                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf