Example #1
    def indexer_between_time(
        self,
        start_time: Union[datetime.time, str],
        end_time: Union[datetime.time, str],
        include_start: bool = True,
        include_end: bool = True,
    ) -> Index:
        """
        Return index locations of values between particular times of day
        (e.g., 9:00-9:30AM).

        Parameters
        ----------
        start_time, end_time : datetime.time, str
            Time passed either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
        include_start : bool, default True
        include_end : bool, default True

        Returns
        -------
        values_between_time : Index of integers

        Examples
        --------
        >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
        >>> kidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> kidx.indexer_between_time("00:01", "00:02").sort_values()
        Int64Index([1, 2], dtype='int64')

        >>> kidx.indexer_between_time("00:01", "00:02", include_end=False)
        Int64Index([1], dtype='int64')

        >>> kidx.indexer_between_time("00:01", "00:02", include_start=False)
        Int64Index([2], dtype='int64')
        """
        def pandas_between_time(pdf) -> ks.DataFrame[int]:
            return pdf.between_time(start_time, end_time, include_start,
                                    include_end)

        kdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(kdf, "__id_column__")
        kdf = kdf.koalas.attach_id_column("distributed-sequence",
                                          id_column_name)
        with ks.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce “distributed” default index type
            kdf = kdf.koalas.apply_batch(pandas_between_time)
        return ks.Index(first_series(kdf).rename(self.name))
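For reference, the per-batch trick above reproduces what plain pandas offers directly on a DatetimeIndex; a minimal local-pandas sketch (assuming the index fits on the driver) looks like this:

import pandas as pd

# Local-pandas counterpart of the distributed version above (illustrative only).
idx = pd.date_range("2000-01-01", periods=3, freq="T")
idx.indexer_between_time("00:01", "00:02")                      # array([1, 2])
idx.indexer_between_time("00:01", "00:02", include_end=False)   # array([1])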
Example #2
    def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
        """
        Return index locations of values at particular time of day
        (e.g. 9:30AM).

        Parameters
        ----------
        time : datetime.time or str
            Time passed in either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").

        Returns
        -------
        values_at_time : Index of integers

        Examples
        --------
        >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
        >>> kidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> kidx.indexer_at_time("00:00")
        Int64Index([0], dtype='int64')

        >>> kidx.indexer_at_time("00:01")
        Int64Index([1], dtype='int64')
        """
        if asof:
            raise NotImplementedError("'asof' argument is not supported")

        def pandas_at_time(pdf) -> ks.DataFrame[int]:
            return pdf.at_time(time, asof)

        kdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(kdf, "__id_column__")
        kdf = kdf.koalas.attach_id_column("distributed-sequence", id_column_name)
        with ks.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce “distributed” default index type
            kdf = kdf.koalas.apply_batch(pandas_at_time)
        return ks.Index(first_series(kdf).rename(self.name))
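The position-recovery idea behind both helpers can be restated in plain pandas: attach each row's position as a column, filter with `at_time` (or `between_time`), and whatever survives in that column is the integer location. A hedged sketch, with the temporary column name kept only for illustration:

import pandas as pd

# Hedged sketch of the id-column trick; "__id_column__" is only illustrative.
idx = pd.date_range("2000-01-01", periods=3, freq="T")
pdf = pd.DataFrame({"__id_column__": range(len(idx))}, index=idx)
pdf.at_time("00:01")["__id_column__"]   # 2000-01-01 00:01:00    1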
Example #3
    def _is_monotonic_decreasing(self):
        scol = self.spark.column
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
        prev = F.lag(scol, 1).over(window)

        cond = F.lit(True)
        has_not_null = F.lit(True)
        for field in self.spark.data_type[::-1]:
            left = scol.getField(field.name)
            right = prev.getField(field.name)
            compare = MultiIndex._comparator_for_monotonic_decreasing(
                field.dataType)
            # Since pandas 1.1.4, null values are not allowed at any level of a MultiIndex.
            # Therefore, we should check `has_not_null` over all the levels.
            has_not_null = has_not_null & left.isNotNull()
            cond = F.when(left.eqNullSafe(right), cond).otherwise(
                compare(left, right, spark.Column.__lt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(
                self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)])

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_dtypes=self._internal.index_dtypes,
        )

        return first_series(DataFrame(internal))
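The loop above builds, from the last level to the first, a lexicographic "previous >= current" test: while two tuples tie on a level, the comparison defers to the deeper levels, and the first differing level (processed last) decides. A hedged pure-Python restatement of that recurrence:

# Hedged pure-Python restatement of the per-row condition built above.
def is_decreasing_step(prev_tuple, curr_tuple):
    cond = True
    # Walk levels from last to first, mirroring self.spark.data_type[::-1].
    for prev_val, curr_val in zip(reversed(prev_tuple), reversed(curr_tuple)):
        cond = cond if curr_val == prev_val else (curr_val < prev_val)
    return cond

is_decreasing_step(("b", 2), ("b", 1))   # True: level 0 ties, 1 < 2 decides
is_decreasing_step(("a", 1), ("b", 2))   # False: "b" > "a" on level 0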
Example #4
    def attach_distributed_sequence_column(sdf, column_name):
        """
        This method attaches a Spark column that has a sequence in a distributed manner.
        This is equivalent to the column assigned when the default index type is
        'distributed-sequence'.

        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """

        scols = [scol_for(sdf, column) for column in sdf.columns]

        spark_partition_column = verify_temp_column_name(
            sdf, "__spark_partition_id__")
        offset_column = verify_temp_column_name(sdf, "__offset__")
        row_number_column = verify_temp_column_name(sdf, "__row_number__")

        # 1. Calculates the count per partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())
        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(
                sdf[spark_partition_column].alias("key")).count().collect(),
        )

        # 2. Calculates the cumulative sum in the order of partition id.
        #     Note that it does not matter whether the partition ids are in any particular order;
        #     we just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = [0] + list(
            accumulate(map(lambda count: count[1], sorted_counts)))
        # zip it with partition key.
        sums = dict(
            zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # 3. Attach offset for each partition.
        @pandas_udf(LongType(), PandasUDFType.SCALAR)
        def offset(id):
            current_partition_offset = sums[id.iloc[0]]
            return pd.Series(current_partition_offset).repeat(len(id))

        sdf = sdf.withColumn(offset_column, offset(spark_partition_column))

        # 4. Calculate row_number in each partition.
        w = Window.partitionBy(spark_partition_column).orderBy(
            F.monotonically_increasing_id())
        row_number = F.row_number().over(w)
        sdf = sdf.withColumn(row_number_column, row_number)

        # 5. Calculate the index.
        return sdf.select((sdf[offset_column] + sdf[row_number_column] -
                           1).alias(column_name), *scols)
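Steps 1-3 amount to "a partition's offset is the total size of all partitions with a smaller id"; combined with step 4, a row's global index is offset + row_number - 1. A hedged pure-Python sketch of the offset arithmetic with made-up counts:

from itertools import accumulate

# Made-up per-partition row counts, keyed by Spark partition id.
counts = {1: 83, 6: 83, 3: 83}
sorted_counts = sorted(counts.items())                       # sort by partition id
cumulative = [0] + list(accumulate(c for _, c in sorted_counts))
sums = dict(zip((pid for pid, _ in sorted_counts), cumulative))
# sums == {1: 0, 3: 83, 6: 166}; a row's global index is
# sums[partition_id] + row_number_within_partition - 1.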
Example #5
    def transform_batch(self, func, *args, **kwargs):
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a pandas
        DataFrame. The pandas DataFrame given to the function is a batch used internally. The
        length of each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of the whole frame but the length
            ... # of each batch used internally.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify the return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas-friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.koalas.apply_batch: For row/columnwise operations.
        Series.koalas.transform_batch: Transform the Series in each pandas chunk.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ks.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ks.Series[int]:
        ...     return pdf.B + 1
        >>> df.koalas.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int32

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).koalas.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.koalas.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.koalas.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import first_series
        from databricks import koalas as ks

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._kdf._internal.to_internal_spark_frame.schema.names
        should_by_pass = LooseVersion(pyspark.__version__) >= "3.0"

        def pandas_concat(series):
            # The input can be passed as a DataFrame for a struct only from Spark 3.0.
            # This works around older versions by concatenating the series back into
            # a frame. See SPARK-27240.
            pdf = pd.concat(series, axis=1)
            pdf = pdf.rename(columns=dict(zip(pdf.columns, names)))
            return pdf

        def pandas_extract(pdf, name):
            # This is the output-side workaround: extract a single column from the
            # frame returned by `func`, since only Spark 3.0+ can return a struct
            # DataFrame directly. See SPARK-23836.
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: ff(pandas_concat(series))

        def pandas_frame_func(f):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)),
                                                  field.name)

        if should_infer_schema:
            # Here we execute with the first `limit` rows (compute.shortcut_limit) to infer
            # the return type. If there are no more records than that, the pandas API is
            # used directly as a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self._kdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed))
            if len(transformed) != len(pdf):
                raise ValueError(
                    "transform_batch cannot produce aggregated results")
            kdf_or_kser = ks.from_pandas(transformed)

            if isinstance(kdf_or_kser, ks.Series):
                kser = kdf_or_kser
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=kser.spark.data_type,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._kdf._internal.copy(
                    column_labels=kser._internal.column_labels,
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(
                                kser._internal.data_spark_column_names[0])
                    ],
                    column_label_names=kser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                kdf = kdf_or_kser
                if len(pdf) <= limit:
                    # Only take the shortcut when `func` returns a frame, to avoid
                    # operating on different dataframes in the series case.
                    return kdf

                return_schema = kdf._internal.to_internal_spark_frame.schema
                # Force nullability.
                return_schema = StructType([
                    StructField(field.name, field.dataType)
                    for field in return_schema.fields
                ])

                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                columns = self_applied._internal.spark_columns
                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(kdf._internal.with_new_sdf(sdf))
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig)
            if is_return_series:
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=return_schema,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                internal = self._kdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(SPARK_DEFAULT_SERIES_NAME)
                    ],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                columns = self_applied._internal.spark_columns

                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(sdf)
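The `should_infer_schema` branch can be summarized as: run `func` once on the first `compute.shortcut_limit + 1` rows, validate that it neither aggregates nor returns a non-pandas object, learn the output schema from that sample, and skip Spark entirely if the whole frame fit in the sample. A hedged, heavily simplified sketch of that control flow (names are illustrative, not Koalas API):

from databricks import koalas as ks

# Heavily simplified sketch of the schema-inference shortcut; ignores the
# Series-returning case and the pandas-UDF re-application.
def transform_batch_sketch(kdf, func, limit=1000):
    sample = kdf.head(limit + 1).to_pandas()
    transformed = func(sample)
    if len(transformed) != len(sample):
        raise ValueError("transform_batch cannot produce aggregated results")
    if len(sample) <= limit:
        # The whole frame fit into the sample: reuse the local result directly.
        return ks.from_pandas(transformed)
    # Otherwise only the inferred schema is kept and `func` is re-applied to
    # every batch through a pandas UDF (omitted here).
    raise NotImplementedError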
Example #6
    def _sequence_col(self):
        internal = super(iLocIndexer, self)._internal
        return verify_temp_column_name(internal.sdf,
                                       "__distributed_sequence_column__")
Example #7
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series, _col

        if self._is_series:
            if (isinstance(key, Series)
                    and key._kdf is not self._kdf_or_kser._kdf) or (
                        isinstance(value, Series)
                        and value._kdf is not self._kdf_or_kser._kdf):
                kdf = self._kdf_or_kser.to_frame()
                temp_natural_order = verify_temp_column_name(
                    kdf, "__temp_natural_order__")
                temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
                temp_value_col = verify_temp_column_name(
                    kdf, "__temp_value_col__")

                kdf[temp_natural_order] = F.monotonically_increasing_id()
                if isinstance(key, Series):
                    kdf[temp_key_col] = key
                if isinstance(value, Series):
                    kdf[temp_value_col] = value
                kdf = kdf.sort_values(temp_natural_order).drop(
                    temp_natural_order)

                kser = kdf[self._kdf_or_kser.name]
                if isinstance(key, Series):
                    key = kdf[temp_key_col]
                if isinstance(value, Series):
                    value = kdf[temp_value_col]

                type(self)(kser)[key] = value

                self._kdf_or_kser._internal = kser._internal
                self._kdf_or_kser._kdf = kser._kdf
                return

            if isinstance(value, DataFrame):
                raise ValueError("Incompatible indexer with DataFrame")

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None:
                cond = F.lit(True)
            if limit is not None:
                cond = cond & (self._internal.spark_frame[self._sequence_col] <
                               F.lit(limit))

            if isinstance(value, Series):
                if remaining_index is not None and remaining_index == 0:
                    raise ValueError(
                        "No axis named {} for object type {}".format(
                            key, type(value)))
                value = value._scol
            else:
                value = F.lit(value)
            scol = (F.when(cond,
                           value).otherwise(self._internal.spark_column).alias(
                               name_like_string(self._kdf_or_kser.name
                                                or "0")))
            internal = self._internal.copy(spark_column=scol)
            self._kdf_or_kser._internal = internal
        else:
            assert self._is_df

            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(value, DataFrame):
                if len(value.columns) == 1:
                    value = _col(value)
                else:
                    raise ValueError(
                        "Only a dataframe with one column can be assigned")

            if (isinstance(rows_sel, Series)
                    and rows_sel._kdf is not self._kdf_or_kser) or (
                        isinstance(value, Series)
                        and value._kdf is not self._kdf_or_kser):
                kdf = self._kdf_or_kser.copy()
                temp_natural_order = verify_temp_column_name(
                    kdf, "__temp_natural_order__")
                temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
                temp_value_col = verify_temp_column_name(
                    kdf, "__temp_value_col__")

                kdf[temp_natural_order] = F.monotonically_increasing_id()
                if isinstance(rows_sel, Series):
                    kdf[temp_key_col] = rows_sel
                if isinstance(value, Series):
                    kdf[temp_value_col] = value
                kdf = kdf.sort_values(temp_natural_order)

                if isinstance(rows_sel, Series):
                    rows_sel = kdf[temp_key_col]
                if isinstance(value, Series):
                    value = kdf[temp_value_col]

                type(self)(kdf)[rows_sel, cols_sel] = value

                self._kdf_or_kser._internal = kdf[list(
                    self._kdf_or_kser.columns)]._internal
                return

            cond, limit, remaining_index = self._select_rows(rows_sel)
            missing_keys = []
            _, data_spark_columns, _ = self._select_cols(
                cols_sel, missing_keys=missing_keys)

            if cond is None:
                cond = F.lit(True)
            if limit is not None:
                cond = cond & (self._internal.spark_frame[self._sequence_col] <
                               F.lit(limit))

            if isinstance(value, Series):
                if remaining_index is not None and remaining_index == 0:
                    raise ValueError("Incompatible indexer with Series")
                if len(data_spark_columns) > 1:
                    raise ValueError("shape mismatch")
                value = value._scol
            else:
                value = F.lit(value)

            new_data_spark_columns = []
            for new_scol, spark_column_name in zip(
                    self._internal.data_spark_columns,
                    self._internal.data_spark_column_names):
                for scol in data_spark_columns:
                    if new_scol._jc.equals(scol._jc):
                        new_scol = F.when(
                            cond,
                            value).otherwise(scol).alias(spark_column_name)
                        break
                new_data_spark_columns.append(new_scol)

            column_labels = self._internal.column_labels.copy()
            for label in missing_keys:
                if isinstance(label, str):
                    label = (label, )
                if len(label) < self._internal.column_labels_level:
                    label = tuple(
                        list(label) +
                        ([""] *
                         (self._internal.column_labels_level - len(label))))
                elif len(label) > self._internal.column_labels_level:
                    raise KeyError(
                        "Key length ({}) exceeds index depth ({})".format(
                            len(label), self._internal.column_labels_level))
                column_labels.append(label)
                new_data_spark_columns.append(
                    F.when(cond, value).alias(name_like_string(label)))

            internal = self._internal.with_new_columns(new_data_spark_columns,
                                                       column_labels)
            self._kdf_or_kser._internal = internal
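When the key or value Series is anchored to a different DataFrame, both branches above align them by pulling everything into a single frame under temporary columns and preserving the original row order with `monotonically_increasing_id`. The same alignment idea in a hedged, local-pandas sketch:

import pandas as pd

# Hedged local-pandas sketch of the temp-column alignment; the "__temp_*__"
# names mirror the ones above but the pandas code itself is only illustrative.
pdf = pd.DataFrame({"A": [1, 2, 3]})
key = pd.Series([True, False, True])     # stands in for a key from another frame
value = pd.Series([10, 20, 30])          # stands in for a value from another frame
pdf["__temp_key_col__"] = key.values
pdf["__temp_value_col__"] = value.values
pdf.loc[pdf["__temp_key_col__"], "A"] = pdf["__temp_value_col__"]
pdf = pdf.drop(columns=["__temp_key_col__", "__temp_value_col__"])
# pdf["A"] is now [10, 2, 30]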
Example #8
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                temp_col = verify_temp_column_name(kdf, "__temp_col__")

                kdf[temp_col] = key
                return type(self)(kdf[self._kdf_or_kser.name])[kdf[temp_col]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            data_spark_columns = self._internal.data_spark_columns
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                temp_col = verify_temp_column_name(kdf, "__temp_col__")

                kdf[temp_col] = rows_sel
                return type(self)(kdf)[kdf[temp_col], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, data_spark_columns, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return self._kdf_or_kser._kser_for(column_labels[0])

        if remaining_index is not None:
            index_scols = self._internal.index_spark_columns[-remaining_index:]
            index_map = OrderedDict(
                list(self._internal.index_map.items())[-remaining_index:])
        else:
            index_scols = self._internal.index_spark_columns
            index_map = self._internal.index_map

        if len(column_labels) > 0:
            column_labels = column_labels.copy()
            column_labels_level = max(
                len(label) if label is not None else 1
                for label in column_labels)
            none_column = 0
            for i, label in enumerate(column_labels):
                if label is None:
                    label = (str(none_column), )
                    none_column += 1
                if len(label) < column_labels_level:
                    label = tuple(
                        list(label) + ([""]) *
                        (column_labels_level - len(label)))
                column_labels[i] = label

            if self._internal.column_label_names is None:
                column_label_names = None
            else:
                # Manage column index names
                column_label_names = self._internal.column_label_names[
                    -column_labels_level:]
        else:
            column_label_names = None

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            data_columns = sdf.select(data_spark_columns).columns
            sdf = sdf.select(index_scols + data_spark_columns)
        except AnalysisException:
            raise KeyError("[{}] don't exist in columns".format(
                [col._jc.toString() for col in data_spark_columns]))

        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=index_map,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(kdf._internal.copy(
                spark_column=kdf._internal.data_spark_columns[0]),
                                 anchor=kdf)
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
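The final `remaining_index == 0` branch only needs to know whether the key matched zero, one, or more rows, so fetching `head(2)` is enough to decide without a full count; condensed, the decision reads:

# Condensed restatement of the scalar-vs-object decision above.
def scalar_or_object(kdf_or_kser, key):
    pdf_or_pser = kdf_or_kser.head(2).to_pandas()
    if len(pdf_or_pser) == 0:
        raise KeyError(key)            # nothing matched the full index key
    elif len(pdf_or_pser) == 1:
        return pdf_or_pser.iloc[0]     # a single match: return the scalar / row
    else:
        return kdf_or_kser             # duplicates: keep the distributed object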