def _pd_getitem(self, key):
    from databricks.koalas.series import Series
    if key is None:
        raise KeyError("none key")
    if isinstance(key, string_types):
        try:
            return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
        except AnalysisException:
            raise KeyError(key)
    if np.isscalar(key) or isinstance(key, (tuple, string_types)):
        raise NotImplementedError(key)
    elif isinstance(key, slice):
        return self.loc[key]

    if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
        raise NotImplementedError(key)
    if isinstance(key, list):
        return self.loc[:, key]
    if isinstance(key, DataFrame):
        # TODO Should not implement alignment, too dangerous?
        return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
    if isinstance(key, Series):
        # TODO Should not implement alignment, too dangerous?
        # It is assumed to be only a filter, otherwise .loc should be used.
        bcol = key._scol.cast("boolean")
        return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
    raise NotImplementedError(key)
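# A sketch of how the dispatch above surfaces through DataFrame.__getitem__
# (hypothetical usage; the data is invented and a running Spark session is assumed):
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
kdf['a']           # str key: resolved against the Spark DataFrame, returns a Series
kdf[['a', 'b']]    # list key: delegates to .loc[:, key], returns a DataFrame
kdf[kdf['a'] > 1]  # Series key: cast to boolean and used as a row filter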
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

    cond, limit = self._select_rows(rows_sel)
    column_index, columns, returns_series = self._select_cols(cols_sel)

    if cond is None and limit is None and returns_series:
        if self._is_series:
            return self._kdf_or_kser._with_new_scol(columns[0])
        else:
            return Series(self._internal.copy(scol=columns[0],
                                              column_index=[column_index[0]]),
                          anchor=self._kdf_or_kser)
    else:
        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.where(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)
            sdf = sdf.select(self._internal.index_scols + columns)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Manage column index names
                level = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-level:]

            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))

        if returns_series:
            return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                          anchor=kdf)
        else:
            return kdf
def __getattr__(self, key):
    from databricks.koalas.series import Series
    if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
        raise AttributeError(key)
    if hasattr(_MissingPandasLikeDataFrame, key):
        return partial(getattr(_MissingPandasLikeDataFrame, key), self)
    return Series(self._sdf.__getattr__(key), self, self._metadata.index_info)
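# Attribute access resolves through the hook above; a small sketch with invented data:
import databricks.koalas as ks

kdf = ks.DataFrame({'dogs': [0.2, 0.0], 'cats': [0.3, 0.6]})
kdf.dogs  # equivalent to kdf['dogs']; falls through to the Spark column lookup
# Names found on _MissingPandasLikeDataFrame come back as partials that raise
# a not-implemented error when called, rather than failing at attribute lookup.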
def to_series(self, name: str = None) -> Series:
    """
    Create a Series with both index and values equal to the index keys,
    useful with map for returning an indexer based on an index.

    Parameters
    ----------
    name : string, optional
        name of resulting Series. If None, defaults to name of original index

    Returns
    -------
    Series : dtype will be based on the type of the Index values.

    Examples
    --------
    >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
    ...                   columns=['dogs', 'cats'],
    ...                   index=list('abcd'))
    >>> df['dogs'].index.to_series()
    a    a
    b    b
    c    c
    d    d
    Name: __index_level_0__, dtype: object
    """
    kdf = self._kdf
    scol = self._scol
    return Series(scol if name is None else scol.alias(name),
                  anchor=kdf, index=kdf._metadata.index_map)
def _transform_batch(self, func, return_schema):
    from databricks.koalas.series import Series
    from databricks import koalas as ks

    if not isinstance(func, types.FunctionType):
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_schema is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        #  because it returns a series from a different DataFrame and it has a different
        #  anchor. We should fix this to allow the shortcut or only allow to infer
        #  schema.
        limit = ks.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)
        spark_return_type = kser.spark.data_type
    else:
        spark_return_type = return_schema

    pudf = pandas_udf(func, returnType=spark_return_type, functionType=PandasUDFType.SCALAR)
    return self._kser._with_new_scol(scol=pudf(self._kser.spark.column))
def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']:
    """Create a Koalas DataFrame or Series from a pandas DataFrame or Series.

    This is similar to Spark's `SparkSession.createDataFrame()` with pandas
    DataFrame, but this also works with pandas Series and picks the index.

    Parameters
    ----------
    pobj : pandas.DataFrame or pandas.Series
        pandas DataFrame or Series to read.

    Returns
    -------
    Series or DataFrame
        If a pandas Series is passed in, this function returns a Koalas Series.
        If a pandas DataFrame is passed in, this function returns a Koalas DataFrame.
    """
    if isinstance(pobj, pd.Series):
        return Series(pobj)
    elif isinstance(pobj, pd.DataFrame):
        return DataFrame(pobj)
    else:
        raise ValueError("Unknown data type: {}".format(type(pobj)))
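# Minimal usage covering both branches (invented data; a Spark session is assumed):
import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])
kdf = ks.from_pandas(pdf)        # pandas DataFrame -> Koalas DataFrame, index preserved
kser = ks.from_pandas(pdf['x'])  # pandas Series -> Koalas Series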
def index(self):
    """The index (row labels) Column of the DataFrame.

    Currently supported only when the DataFrame has a single index.
    """
    from databricks.koalas.series import Series
    if len(self._metadata.index_info) != 1:
        raise KeyError('Currently supported only when the DataFrame has a single index.')
    return Series(self._index_columns[0], self, [])
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series, _col

    if (not isinstance(key, tuple)) or (len(key) != 2):
        raise SparkPandasNotImplementedError(
            description="Only accepts pairs of candidates",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    rows_sel, cols_sel = key

    if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
        if isinstance(rows_sel, list):
            if isinstance(cols_sel, str):
                cols_sel = [cols_sel]
            kdf = self._kdf.copy()
            for col_sel in cols_sel:
                # Uses `kdf` to allow operations on different DataFrames.
                # TODO: avoid temp column name or declare `__` prefix is
                #  reserved for Koalas' internal columns.
                kdf["__indexing_temp_col__"] = value
                new_col = kdf["__indexing_temp_col__"]._scol
                kdf[col_sel] = Series(
                    kdf[col_sel]._internal.copy(
                        scol=F.when(
                            kdf._internal.index_scols[0].isin(rows_sel), new_col
                        ).otherwise(kdf[col_sel]._scol)),
                    anchor=kdf)
                kdf = kdf.drop(labels=['__indexing_temp_col__'])
            self._kdf._internal = kdf._internal.copy()
        else:
            raise SparkPandasNotImplementedError(
                description="""Can only assign value to the whole dataframe, the row index
                has to be `slice(None)` or `:`""",
                pandas_function=".loc[..., ...] = ...",
                spark_target_function="withColumn, select")

    if not isinstance(cols_sel, (str, list)):
        raise ValueError("""only column names or list of column names can be assigned""")

    if isinstance(value, DataFrame):
        if len(value.columns) == 1:
            self._kdf[cols_sel] = _col(value)
        else:
            raise ValueError("Only a dataframe with one column can be assigned")
    else:
        if isinstance(cols_sel, str):
            cols_sel = [cols_sel]
        if (not isinstance(rows_sel, list)) and (isinstance(cols_sel, list)):
            for col_sel in cols_sel:
                self._kdf[col_sel] = value
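# A sketch of the two assignment paths handled above (invented data):
import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])
kdf.loc[:, 'y'] = 0           # rows_sel is slice(None): plain column assignment
kdf.loc[['a', 'c'], 'x'] = 9  # label list: rewritten via F.when(...).otherwise(...)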
def _transform_batch(self, func, return_type: Optional[Union[SeriesType, ScalarType]]):
    from databricks.koalas.groupby import GroupBy
    from databricks.koalas.series import Series, first_series
    from databricks import koalas as ks

    if not isinstance(func, types.FunctionType):
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_type is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        #  because it returns a series from a different DataFrame and it has a different
        #  anchor. We should fix this to allow the shortcut or only allow to infer
        #  schema.
        limit = ks.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit + 1)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)  # type: Series
        spark_return_type = force_decimal_precision_scale(
            as_nullable_spark_type(kser.spark.data_type)
        )
        dtype = kser.dtype
    else:
        spark_return_type = return_type.spark_type
        dtype = return_type.dtype

    kdf = self._kser.to_frame()
    columns = kdf._internal.spark_column_names

    def pandas_concat(series):
        # The input can only be a DataFrame for struct from Spark 3.0.
        # This works around to make the input as a frame. See SPARK-27240
        pdf = pd.concat(series, axis=1)
        pdf.columns = columns
        return pdf

    def apply_func(pdf):
        return func(first_series(pdf)).to_frame()

    return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
    output_func = GroupBy._make_pandas_df_builder_func(
        kdf, apply_func, return_schema, retain_index=False
    )

    pudf = pandas_udf(
        lambda *series: first_series(output_func(pandas_concat(series))),
        returnType=spark_return_type,
        functionType=PandasUDFType.SCALAR,
    )

    return self._kser._with_new_scol(
        scol=pudf(*kdf._internal.spark_columns).alias(
            self._kser._internal.spark_column_names[0]
        ),
        dtype=dtype,
    )
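# Assuming the public Series.koalas.transform_batch accessor that this helper
# appears to back, the two branches map to inferred vs. hinted return types
# (hypothetical usage, invented data):
import databricks.koalas as ks

kser = ks.Series([1, 2, 3])
kser.koalas.transform_batch(lambda pser: pser + 1)  # schema inferred from head(limit + 1)

def plus_one(pser) -> ks.Series[int]:  # the hint supplies return_type; inference is skipped
    return pser + 1

kser.koalas.transform_batch(plus_one)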
def from_pandas(pdf):
    """Create DataFrame from pandas DataFrame.

    This is similar to `SparkSession.createDataFrame()` with pandas DataFrame,
    but this also picks the index in the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return Series(pdf)
    elif isinstance(pdf, pd.DataFrame):
        return DataFrame(pdf)
    else:
        raise ValueError("Unknown data type: {}".format(type(pdf)))
def _spark_col_apply(kdf_or_ks, sfun):
    """
    Applies a function to every cell of a DataFrame; the function must be a known
    SQL function.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series
    if isinstance(kdf_or_ks, Series):
        ks = kdf_or_ks
        return Series(ks._kdf._internal.copy(scol=sfun(kdf_or_ks._scol)), anchor=ks._kdf)
    assert isinstance(kdf_or_ks, DataFrame)
    kdf = kdf_or_ks
    sdf = kdf._sdf
    sdf = sdf.select([sfun(sdf[col]).alias(col) for col in kdf.columns])
    return DataFrame(sdf)
def to_datetime(arg, errors='raise', format=None, infer_datetime_format=False):
    if isinstance(arg, Series):
        return Series(_to_datetime1(
            arg._scol,
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format), arg._kdf, arg._index_info)
    if isinstance(arg, DataFrame):
        return Series(_to_datetime2(
            arg_year=arg['year']._scol,
            arg_month=arg['month']._scol,
            arg_day=arg['day']._scol,
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format), arg, arg._metadata.index_info)
    if isinstance(arg, dict):
        return _to_datetime2(
            arg_year=arg['year'],
            arg_month=arg['month'],
            arg_day=arg['day'],
            errors=errors,
            format=format,
            infer_datetime_format=infer_datetime_format)
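# The three accepted argument kinds, mirroring pandas.to_datetime
# (a hedged sketch with invented data; a Spark session is assumed):
import databricks.koalas as ks

ks.to_datetime(ks.Series(['2019-01-01', '2019-01-02']))    # element-wise parse
ks.to_datetime(ks.DataFrame({'year': [2019], 'month': [1],
                             'day': [2]}))                  # assembled from the named columns
ks.to_datetime({'year': [2019], 'month': [1], 'day': [2]})  # dict branch above, components as-is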
def align_diff_series(func, this_series, *args, how="full"):
    from databricks.koalas.base import IndexOpsMixin
    from databricks.koalas.series import Series

    cols = [arg for arg in args if isinstance(arg, IndexOpsMixin)]
    combined = combine_frames(this_series.to_frame(), *cols, how=how)

    scol = func(combined["this"]._internal.data_spark_columns[0],
                *combined["that"]._internal.data_spark_columns)

    return Series(
        combined._internal.copy(
            spark_column=scol, column_labels=this_series._internal.column_labels),
        anchor=combined,
    )
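# This join-based alignment runs when the operands are anchored to different
# DataFrames; Koalas keeps the (expensive) path behind an option since it needs
# a join. A sketch, where how="full" corresponds to a full outer join on the index:
import databricks.koalas as ks

ks.set_option('compute.ops_on_diff_frames', True)
s1 = ks.Series([1, 2, 3])
s2 = ks.Series([10, 20, 30])
combined = s1 + s2  # the two anchors are combined, then func runs on the joined columns
ks.reset_option('compute.ops_on_diff_frames')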
def _auto_patch_pandas():
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)
        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)
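# A sketch of what the patch enables (assumes Python 3.7+ and that importing
# databricks.koalas has applied the patch; the subscript below would otherwise
# raise at function-definition time):
import pandas as pd
import databricks.koalas  # noqa: F401  (import triggers _auto_patch_pandas)

def pandas_div(pdf) -> pd.DataFrame[float, float]:  # subscript works via the patch
    return pdf[['B', 'C']] / pdf[['B', 'C']]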
def to_series(self, name: Union[str, Tuple[str, ...]] = None) -> Series:
    """
    Create a Series with both index and values equal to the index keys,
    useful with map for returning an indexer based on an index.

    Parameters
    ----------
    name : string, optional
        name of resulting Series. If None, defaults to name of original index

    Returns
    -------
    Series : dtype will be based on the type of the Index values.

    Examples
    --------
    >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
    ...                   columns=['dogs', 'cats'],
    ...                   index=list('abcd'))
    >>> df['dogs'].index.to_series()
    a    a
    b    b
    c    c
    d    d
    Name: 0, dtype: object
    """
    kdf = self._kdf
    scol = self._scol
    if name is not None:
        scol = scol.alias(name_like_string(name))
    column_index = [None] if len(kdf._internal.index_map) > 1 else kdf._internal.index_names
    return Series(kdf._internal.copy(scol=scol,
                                     column_index=column_index,
                                     column_index_names=None),
                  anchor=kdf)
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf["__temp_col__"] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_labels = self._internal.column_labels
        column_scols = self._internal.column_scols
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf["__temp_col__"] = rows_sel
            return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                list(self._kdf_or_kser.columns)]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        column_labels, column_scols, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return Series(
                self._internal.copy(scol=column_scols[0],
                                    column_labels=[column_labels[0]]),
                anchor=self._kdf_or_kser,
            )

    if remaining_index is not None:
        index_scols = self._internal.index_scols[-remaining_index:]
        index_map = self._internal.index_map[-remaining_index:]
    else:
        index_scols = self._internal.index_scols
        index_map = self._internal.index_map

    if self._internal.column_label_names is None:
        column_label_names = None
    else:
        # Manage column index names
        level = column_labels_level(column_labels)
        column_label_names = self._internal.column_label_names[-level:]

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        sdf = sdf.select(index_scols + column_scols)
    except AnalysisException:
        raise KeyError("[{}] don't exist in columns".format(
            [col._jc.toString() for col in column_scols]))

    internal = _InternalFrame(
        sdf=sdf,
        index_map=index_map,
        column_labels=column_labels,
        column_label_names=column_label_names,
    )
    kdf = DataFrame(internal)

    if returns_series:
        kdf_or_kser = Series(
            kdf._internal.copy(scol=kdf._internal.column_scols[0]),
            anchor=kdf)
    else:
        kdf_or_kser = kdf

    if remaining_index is not None and remaining_index == 0:
        pdf_or_pser = kdf_or_kser.head(2).to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return kdf_or_kser
    else:
        return kdf_or_kser
def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) -> 'ks.Series':
    """
    This function calls the function f while taking into account some of the
    limitations of the pandas UDF support:
    - support for keyword arguments
    - support for scalar values (as long as they are picklable)
    - support for type hints and input checks.

    :param f: the function to call. It is expected to have field annotations (see below).
    :param return_type: the return type
    :param args: the arguments of the function
    :param kwargs: the kwargs to pass to the function
    :return: the value of executing the function: f(*args, **kwargs)

    The way this function executes depends on what is provided as arguments:
    - if one of the arguments is a koalas series or dataframe:
      - the function is wrapped as a Spark UDF
      - the series arguments are checked to be coming from the same original anchor
      - the non-series arguments are serialized into the spark UDF.

    The function is expected to have the following arguments:
    """
    from databricks.koalas.series import Series

    # All the arguments.
    # None for columns or the value for non-columns
    frozen_args = []  # type: typing.List[typing.Any]
    # ks.Series for columns or None for the non-columns
    col_args = []  # type: typing.List[typing.Optional[Series]]
    for arg in args:
        if isinstance(arg, Series):
            frozen_args.append(None)
            col_args.append(arg)
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_args.append(arg)
            col_args.append(None)

    # Value is none for kwargs that are columns, and the value otherwise
    frozen_kwargs = []  # type: typing.List[typing.Tuple[str, typing.Any]]
    # Value is a spark col for kwarg that is column, and None otherwise
    col_kwargs = []  # type: typing.List[typing.Tuple[str, Series]]
    for (key, arg) in kwargs.items():
        if isinstance(arg, Series):
            col_kwargs.append((key, arg))
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_kwargs.append((key, arg))

    col_args_idxs = [idx for (idx, c) in enumerate(col_args) if c is not None]
    all_indexes = (col_args_idxs + [key for (key, _) in col_kwargs])  # type: ignore
    if not all_indexes:
        # No argument is related to spark
        # The function is just called through without other considerations.
        return f(*args, **kwargs)

    # We detected some columns. They need to be wrapped in a UDF to spark.
    kdf = _get_kdf(args, kwargs)

    def clean_fun(*args2):
        assert len(args2) == len(all_indexes), \
            "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
        full_args = list(frozen_args)
        full_kwargs = dict(frozen_kwargs)
        for (arg, idx) in zip(args2, all_indexes):
            if isinstance(idx, int):
                full_args[idx] = arg
            else:
                assert isinstance(idx, str), str(idx)
                full_kwargs[idx] = arg
        return f(*full_args, **full_kwargs)

    wrapped_udf = pandas_udf(clean_fun, returnType=return_type)

    name_tokens = []
    spark_col_args = []
    for col in col_args:
        if col is not None:
            spark_col_args.append(col._scol)
            name_tokens.append(col.name)
    kw_name_tokens = []
    for (key, col) in col_kwargs:
        spark_col_args.append(col._scol)
        kw_name_tokens.append("{}={}".format(key, col.name))

    col = wrapped_udf(*spark_col_args)
    series = Series(kdf._internal.copy(scol=col), anchor=kdf)
    all_name_tokens = name_tokens + sorted(kw_name_tokens)
    name = "{}({})".format(f.__name__, ", ".join(all_name_tokens))
    series = series.astype(return_type).alias(name)
    return series
def _rank(self, *args, **kwargs):
    groupkey_scols = [s._scol for s in self._groupkeys]
    return Series._rank(self._ks, *args, **kwargs, part_cols=groupkey_scols)
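# The group keys become extra partition columns, so ranking is per group.
# A short sketch with invented data:
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 1, 2], 'b': [3, 1, 2]})
kdf.groupby('a')['b'].rank()  # ranks within each value of 'a'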
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]

        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf['__temp_col__'] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

        cond, limit = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_index = self._internal.column_index
        column_scols = self._internal.column_scols
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf['__temp_col__'] = rows_sel
            return type(self)(kdf)[kdf['__temp_col__'], cols_sel][
                list(self._kdf_or_kser.columns)]

        cond, limit = self._select_rows(rows_sel)
        column_index, column_scols, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return Series(self._internal.copy(scol=column_scols[0],
                                              column_index=[column_index[0]]),
                          anchor=self._kdf_or_kser)

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        sdf = sdf.select(self._internal.index_scols + column_scols)

        if self._internal.column_index_names is None:
            column_index_names = None
        else:
            # Manage column index names
            level = column_index_level(column_index)
            column_index_names = self._internal.column_index_names[-level:]

        internal = _InternalFrame(sdf=sdf,
                                  index_map=self._internal.index_map,
                                  column_index=column_index,
                                  column_index_names=column_index_names)
        kdf = DataFrame(internal)
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'.format(
            [col._jc.toString() for col in column_scols]))

    if returns_series:
        return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                      anchor=kdf)
    else:
        return kdf
def _make_fun(f: typing.Callable, return_type: types.DataType, *args, **kwargs) -> 'ks.Series':
    """
    This function calls the function f while taking into account some of the
    limitations of the pandas UDF support:
    - support for keyword arguments
    - support for scalar values (as long as they are picklable)
    - support for type hints and input checks.

    :param f: the function to call. It is expected to have field annotations (see below).
    :param return_type: the return type
    :param args: the arguments of the function
    :param kwargs: the kwargs to pass to the function
    :return: the value of executing the function: f(*args, **kwargs)

    The way this function executes depends on what is provided as arguments:
    - if one of the arguments is a koalas series or dataframe:
      - the function is wrapped as a Spark UDF
      - the series arguments are checked to be coming from the same original anchor
      - the non-series arguments are serialized into the spark UDF.

    The function is expected to have the following arguments:
    """
    from databricks.koalas.series import Series

    # All the arguments.
    # None for columns or the value for non-columns
    frozen_args = []  # type: typing.List[typing.Any]
    # ks.Series for columns or None for the non-columns
    col_args = []  # type: typing.List[typing.Optional[Series]]
    for arg in args:
        if isinstance(arg, Series):
            frozen_args.append(None)
            col_args.append(arg)
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_args.append(arg)
            col_args.append(None)

    # Value is none for kwargs that are columns, and the value otherwise
    frozen_kwargs = []  # type: typing.List[typing.Tuple[str, typing.Any]]
    # Value is a spark col for kwarg that is column, and None otherwise
    col_kwargs = []  # type: typing.List[typing.Tuple[str, Series]]
    for (key, arg) in kwargs.items():
        if isinstance(arg, Series):
            col_kwargs.append((key, arg))
        elif isinstance(arg, Column):
            raise ValueError('A pyspark column was passed as an argument.'
                             ' Pass a koalas series instead')
        else:
            frozen_kwargs.append((key, arg))

    col_args_idxs = [idx for (idx, c) in enumerate(col_args) if c is not None]
    all_indexes = (col_args_idxs + [key for (key, _) in col_kwargs])  # type: ignore
    if not all_indexes:
        # No argument is related to spark
        # The function is just called through without other considerations.
        return f(*args, **kwargs)

    # We detected some columns. They need to be wrapped in a UDF to spark.
    (index_map, kdf) = _get_metadata(args, kwargs)

    def clean_fun(*args2):
        assert len(args2) == len(all_indexes), \
            "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
        full_args = list(frozen_args)
        full_kwargs = dict(frozen_kwargs)
        for (arg, idx) in zip(args2, all_indexes):
            if isinstance(idx, int):
                full_args[idx] = arg
            else:
                assert isinstance(idx, str), str(idx)
                full_kwargs[idx] = arg
        return f(*full_args, **full_kwargs)

    wrapped_udf = pandas_udf(clean_fun, returnType=return_type)

    name_tokens = []
    spark_col_args = []
    for col in col_args:
        if col is not None:
            spark_col_args.append(col._scol)
            name_tokens.append(col.name)
    kw_name_tokens = []
    for (key, col) in col_kwargs:
        spark_col_args.append(col._scol)
        kw_name_tokens.append("{}={}".format(key, col.name))

    col = wrapped_udf(*spark_col_args)
    series = Series(data=col, index=index_map, anchor=kdf)
    all_name_tokens = name_tokens + sorted(kw_name_tokens)
    name = "{}({})".format(f.__name__, ", ".join(all_name_tokens))
    series = series.astype(return_type).alias(name)
    return series
def _cum(self, func):
    groupkey_scols = [s._scol for s in self._groupkeys]
    return Series._cum(self._ks, func, True, part_cols=groupkey_scols)
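# Same pattern for the cumulative family: part_cols confines the window to each
# group. A short sketch with invented data:
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 1, 2], 'b': [3, 1, 2]})
kdf.groupby('a')['b'].cumsum()  # the running sum restarts for each value of 'a'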
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if self._is_series:
        if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
            kdf = self._kdf_or_kser.to_frame()
            kdf["__temp_col__"] = key
            return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None and limit is None:
            return self._kdf_or_kser

        column_labels = self._internal.column_labels
        data_spark_columns = self._internal.data_spark_columns
        returns_series = True
    else:
        assert self._is_df
        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
            kdf = self._kdf_or_kser.copy()
            kdf["__temp_col__"] = rows_sel
            return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                list(self._kdf_or_kser.columns)
            ]

        cond, limit, remaining_index = self._select_rows(rows_sel)
        column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            return self._kdf_or_kser._kser_for(column_labels[0])

    if remaining_index is not None:
        index_scols = self._internal.index_spark_columns[-remaining_index:]
        index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:])
    else:
        index_scols = self._internal.index_spark_columns
        index_map = self._internal.index_map

    if len(column_labels) > 0:
        column_labels = column_labels.copy()
        column_labels_level = max(
            len(label) if label is not None else 1 for label in column_labels
        )
        none_column = 0
        for i, label in enumerate(column_labels):
            if label is None:
                label = (str(none_column),)
                none_column += 1
            if len(label) < column_labels_level:
                label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
            column_labels[i] = label

        if self._internal.column_label_names is None:
            column_label_names = None
        else:
            # Manage column index names
            column_label_names = self._internal.column_label_names[-column_labels_level:]
    else:
        column_label_names = None

    try:
        sdf = self._internal._sdf
        if cond is not None:
            sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
        if limit is not None:
            if limit >= 0:
                sdf = sdf.limit(limit)
            else:
                sdf = sdf.limit(sdf.count() + limit)
        data_columns = sdf.select(data_spark_columns).columns
        sdf = sdf.select(index_scols + data_spark_columns)
    except AnalysisException:
        raise KeyError(
            "[{}] don't exist in columns".format(
                [col._jc.toString() for col in data_spark_columns]
            )
        )

    internal = _InternalFrame(
        spark_frame=sdf,
        index_map=index_map,
        column_labels=column_labels,
        data_spark_columns=[scol_for(sdf, col) for col in data_columns],
        column_label_names=column_label_names,
    )
    kdf = DataFrame(internal)

    if returns_series:
        kdf_or_kser = Series(
            kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]),
            anchor=kdf,
        )
    else:
        kdf_or_kser = kdf

    if remaining_index is not None and remaining_index == 0:
        pdf_or_pser = kdf_or_kser.head(2).to_pandas()
        length = len(pdf_or_pser)
        if length == 0:
            raise KeyError(name_like_string(key))
        elif length == 1:
            return pdf_or_pser.iloc[0]
        else:
            return kdf_or_kser
    else:
        return kdf_or_kser
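# A sketch of how cond and remaining_index surface in .loc (invented data):
import databricks.koalas as ks

kdf = ks.DataFrame({'dogs': [0.2, 0.0]}, index=['a', 'b'])
kdf.loc[kdf.dogs > 0.1]  # cond becomes a filter on the Spark plan
kdf.loc['a', 'dogs']     # remaining_index == 0: the head(2) check returns a scalar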
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

    cond = self._select_rows(rows_sel)

    # make cols_sel a 1-tuple of string if a single string
    if isinstance(cols_sel, Series):
        cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, slice) and cols_sel != slice(None):
        raise LocIndexer._raiseNotImplemented(
            "Can only select columns either by name or reference or all")
    elif isinstance(cols_sel, slice) and cols_sel == slice(None):
        cols_sel = None

    returns_series = cols_sel is not None and isinstance(cols_sel, spark.Column)

    if cols_sel is None:
        column_index = self._internal.column_index
        columns = self._internal.column_scols
    elif isinstance(cols_sel, (str, tuple)):
        if isinstance(cols_sel, str):
            cols_sel = (cols_sel,)
        column_index, columns, returns_series = self._get_from_multiindex_column(cols_sel)
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
        column_index = None
    elif all(isinstance(key, Series) for key in cols_sel):
        columns = [_make_col(key) for key in cols_sel]
        column_index = [key._internal.column_index[0] for key in cols_sel]
    elif all(isinstance(key, spark.Column) for key in cols_sel):
        columns = cols_sel
        column_index = None
    elif (any(isinstance(key, str) for key in cols_sel)
            and any(isinstance(key, tuple) for key in cols_sel)):
        raise TypeError('Expected tuple, got str')
    else:
        if all(isinstance(key, tuple) for key in cols_sel):
            level = self._internal.column_index_level
            if any(len(key) != level for key in cols_sel):
                raise ValueError('All the key level should be the same as column index level.')

        column_to_index = list(zip(self._internal.data_columns,
                                   self._internal.column_index))

        columns = []
        column_index = []
        for key in cols_sel:
            found = False
            for column, idx in column_to_index:
                if idx == key or idx[0] == key:
                    columns.append(_make_col(column))
                    column_index.append(idx)
                    found = True
            if not found:
                raise KeyError("['{}'] not in index".format(key))

    if cond is None and returns_series:
        if self._is_series:
            return self._kdf_or_kser._with_new_scol(columns[0])
        else:
            return Series(self._internal.copy(scol=columns[0],
                                              column_index=[column_index[0]]),
                          anchor=self._kdf_or_kser)
    else:
        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.where(cond)
            sdf = sdf.select(self._internal.index_scols + columns)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Manage column index names
                level = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-level:]

            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))

        if returns_series:
            return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                          anchor=kdf)
        else:
            return kdf