Example #1
    def _apply_as_series_or_frame(self, func):
        """
        Decorator that wraps a function that handles a Spark column, in order
        to support it in both Koalas Series and DataFrame.
        Note that the given `func` name should be the same as the API's method name.
        """
        from databricks.koalas import DataFrame, Series

        if isinstance(self.kdf_or_kser, Series):
            kser = self.kdf_or_kser
            return kser._with_new_scol(func(kser._scol)).rename(kser.name)
        elif isinstance(self.kdf_or_kser, DataFrame):
            kdf = self.kdf_or_kser
            applied = []
            for column in kdf.columns:
                applied.append(
                    getattr(
                        kdf[column].rolling(self._window_val + 1,
                                            self._min_periods),
                        func.__name__)())

            sdf = kdf._sdf.select(kdf._internal.index_scols +
                                  [c._scol for c in applied])
            internal = kdf._internal.copy(
                sdf=sdf,
                data_columns=[c._internal.data_columns[0] for c in applied],
                column_index=[c._internal.column_index[0] for c in applied])
            return DataFrame(internal)
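
A minimal usage sketch of what this helper backs (assuming `databricks.koalas` is installed and a Spark session is running): the Series branch rewrites the Spark column in place, while the DataFrame branch re-applies the same rolling method to every column.

import databricks.koalas as ks

s = ks.Series([1, 2, 3, 4, 5])
s.rolling(3).sum()                    # Series branch: one new Spark column
kdf = s.to_frame()
kdf.rolling(3, min_periods=1).sum()   # DataFrame branch: applied column by column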
Example #2
    def _apply_as_series_or_frame(self, func):
        """
        Wraps a function that handles a Spark column, in order
        to support it in both Koalas Series and DataFrame.

        Note that the given `func` name should be the same as the API's method name.
        """
        from databricks.koalas import DataFrame
        from databricks.koalas.series import _col
        from databricks.koalas.groupby import SeriesGroupBy

        kdf = self.kdf
        sdf = self.kdf._sdf

        # Here we need to include the grouped key as an index, and shift the previous index.
        #   [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
        new_index_scols = []
        new_index_map = []
        for groupkey in self._groupkeys:
            new_index_scols.append(
                # NOTE THAT this code intentionally uses `F.col` instead of `scol` in
                # the given series. This is because, in the case of a Series, we convert
                # it into a DataFrame. So, if the given `groupkeys` are Series, they end
                # up being different Series.
                F.col(name_like_string(groupkey.name)
                      ).alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
                                  groupkey._internal.column_index[0]))

        for new_index_scol, index_map in zip(kdf._internal.index_scols,
                                             kdf._internal.index_map):
            new_index_scols.append(
                new_index_scol.alias(
                    SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
            _, name = index_map
            new_index_map.append(
                (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

        applied = []
        for column in kdf.columns:
            applied.append(kdf[column]._with_new_scol(func(
                kdf[column]._scol)).rename(kdf[column].name))

        # pandas seems to filter out rows whose grouped key is NA.
        cond = self._groupkeys[0]._scol.isNotNull()
        for c in self._groupkeys:
            cond = cond | c._scol.isNotNull()
        sdf = sdf.select(new_index_scols + [c._scol
                                            for c in applied]).filter(cond)

        internal = _InternalFrame(
            sdf=sdf,
            data_columns=[c._internal.data_columns[0] for c in applied],
            index_map=new_index_map)

        ret = DataFrame(internal)
        if isinstance(self._groupby, SeriesGroupBy):
            return _col(ret)
        else:
            return ret
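
For context, a hedged sketch of the grouped variant this helper serves (assuming the installed Koalas version exposes groupby-rolling): the group key is prepended to the index, so the result is indexed by (group key, original index).

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 1, 2, 2], "b": [1.0, 2.0, 3.0, 4.0]})
# Grouped rolling sum; the group key 'a' becomes the first index level.
kdf.groupby("a")["b"].rolling(2).sum()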
Example #3
    def get_sampled(self, data):
        from databricks.koalas import DataFrame, Series

        self.fraction = 1 / (len(data) / 1000)  # make sure the records are roughly 1000.
        if self.fraction > 1:
            self.fraction = 1

        if isinstance(data, DataFrame):
            sampled = data._sdf.sample(fraction=float(self.fraction))
            return DataFrame(data._internal.copy(sdf=sampled)).to_pandas()
        elif isinstance(data, Series):
            scol = data._scol
            sampled = data._kdf._sdf.sample(fraction=float(self.fraction))
            return DataFrame(data._kdf._internal.copy(sdf=sampled, scol=scol)).to_pandas()
        else:
            ValueError("Only DataFrame and Series are supported for plotting.")
Example #4
    def get_sampled(self, data):
        from databricks.koalas import DataFrame

        self.fraction = 1 / (len(data) / 1000)  # make sure the records are roughly 1000.
        if self.fraction > 1:
            self.fraction = 1
        sampled = data._kdf._sdf.sample(fraction=float(self.fraction))
        return DataFrame(data._kdf._internal.copy(sdf=sampled)).to_pandas()
Example #5
    def get_sampled(self, data):
        from databricks.koalas import DataFrame, Series
        fraction = get_option("plotting.sample_ratio")
        if fraction is None:
            fraction = 1 / (len(data) / get_option("plotting.max_rows"))
            fraction = min(1., fraction)
        self.fraction = fraction

        if isinstance(data, (DataFrame, Series)):
            if isinstance(data, Series):
                data = data.to_frame()
            sampled = data._sdf.sample(fraction=self.fraction)
            return DataFrame(data._internal.copy(sdf=sampled)).to_pandas()
        else:
            ValueError("Only DataFrame and Series are supported for plotting.")
Example #6
    def hint(self, name: str, *parameters) -> "ks.DataFrame":
        """
        Specifies some hint on the current DataFrame.

        Parameters
        ----------
        name : A name of the hint.
        parameters : Optional parameters.

        Returns
        -------
        ret : DataFrame with the hint.

        See Also
        --------
        broadcast : Marks a DataFrame as small enough for use in broadcast joins.

        Examples
        --------
        >>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
        ...                     'value': [1, 2, 3, 5]},
        ...                    columns=['lkey', 'value'])
        >>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
        ...                     'value': [5, 6, 7, 8]},
        ...                    columns=['rkey', 'value'])
        >>> merged = df1.merge(df2.spark.hint("broadcast"), left_on='lkey', right_on='rkey')
        >>> merged.spark.explain()  # doctest: +ELLIPSIS
        == Physical Plan ==
        ...
        ...BroadcastHashJoin...
        ...
        """
        from databricks.koalas.frame import DataFrame

        return DataFrame(
            self._kdf._internal.with_new_sdf(
                self._kdf._internal.spark_frame.hint(name, *parameters)
            )
        )
Example #7
    def apply(self, func):
        """
        Applies a function that takes and returns a Spark column. It allows you to
        natively apply a Spark function and column APIs, using the Spark column
        internally held by the Series or Index.

        .. note:: It forces the index to be lost, falling back to the default index. It is
            preferred to use :meth:`Series.spark.transform` or :meth:`DataFrame.spark.apply`
            with the `index_col` specified.

        .. note:: The input and output do not need to have the same length. However, it
            creates a new DataFrame internally, which requires `compute.ops_on_diff_frames`
            to be set even when computing against the same origin DataFrame, and is
            expensive; :meth:`Series.spark.transform` has no such requirement.

        Parameters
        ----------
        func : function
            Function to apply against the data, using Spark columns.

        Returns
        -------
        Series

        Raises
        ------
        ValueError : If the output from the function is not a Spark column.

        Examples
        --------
        >>> from databricks import koalas as ks
        >>> from pyspark.sql.functions import count, lit
        >>> df = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
        >>> df
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.a.spark.apply(lambda c: count(c))
        0    3
        Name: a, dtype: int64

        >>> df.a.spark.apply(lambda c: c + df.b.spark.column)
        0    5
        1    7
        2    9
        Name: a, dtype: int64
        """
        from databricks.koalas import Index, DataFrame, Series
        from databricks.koalas.series import first_series
        from databricks.koalas.internal import HIDDEN_COLUMNS

        if isinstance(self._data, Index):
            raise NotImplementedError(
                "Index does not support spark.apply yet.")
        output = func(self._data.spark.column)
        if not isinstance(output, Column):
            raise ValueError("The output of the function [%s] should be of a "
                             "pyspark.sql.Column; however, got [%s]." %
                             (func, type(output)))
        assert isinstance(self._data, Series)

        sdf = self._data._internal.spark_frame.drop(
            *HIDDEN_COLUMNS).select(output)
        # Lose index.
        kdf = DataFrame(sdf)
        kdf.columns = [self._data.name]
        return first_series(kdf)
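
As the second note in the docstring says, `spark.apply` builds a new DataFrame internally, so computing against another column usually needs `compute.ops_on_diff_frames` even when both columns come from the same origin frame. A hedged sketch of that setup:

import databricks.koalas as ks

ks.set_option("compute.ops_on_diff_frames", True)
df = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.a.spark.apply(lambda c: c + df.b.spark.column)   # needs the option above
ks.reset_option("compute.ops_on_diff_frames")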
Example #8
def combine_frames(this, *args, how="full"):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation, which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from databricks.koalas import Series
    from databricks.koalas import DataFrame
    from databricks.koalas.config import get_option

    if all(isinstance(arg, Series) for arg in args):
        assert all(arg._kdf is args[0]._kdf for arg in args), \
            "Currently only one different DataFrame (from given Series) is supported"
        if this is args[0]._kdf:
        return  # We don't need to combine. All Series are in `this`.
        that = args[0]._kdf[[ser.name for ser in args]]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        if this is args[0]:
        return  # We don't need to combine. `this` and `that` are the same.
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):
        this_index_map = this._internal.index_map
        that_index_map = that._internal.index_map
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # If the same named index is found, that's used.
        for this_column, this_name in this_index_map:
            for that_col, that_name in that_index_map:
                if this_name == that_name:
                    # We should merge the Spark columns into one
                    # to mimic pandas' behavior.
                    this_scol = this._internal.scol_for(this_column)
                    that_scol = that._internal.scol_for(that_col)
                    join_scol = this_scol == that_scol
                    join_scols.append(join_scol)
                    merged_index_scols.append(
                        F.when(
                            this_scol.isNotNull(),
                            this_scol).otherwise(that_scol).alias(this_column))
                    break
            else:
                raise ValueError(
                    "Index names must be exactly matched currently.")

        assert len(
            join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this._sdf.alias("this").join(that._sdf.alias("that"),
                                                 on=join_scols,
                                                 how=how)

        joined_df = joined_df.select(merged_index_scols + [
            this[idx]._scol.alias("__this_%s" %
                                  this._internal.column_name_for(idx))
            for idx in this._internal.column_index
        ] + [
            that[idx]._scol.alias("__that_%s" %
                                  that._internal.column_name_for(idx))
            for idx in that._internal.column_index
        ])

        index_columns = set(this._internal.index_columns)
        new_data_columns = [
            c for c in joined_df.columns if c not in index_columns
        ]
        level = max(this._internal.column_index_level,
                    that._internal.column_index_level)
        column_index = ([
            tuple(['this'] + ([''] * (level - len(idx))) + list(idx))
            for idx in this._internal.column_index
        ] + [
            tuple(['that'] + ([''] * (level - len(idx))) + list(idx))
            for idx in that._internal.column_index
        ])
        column_index_names = (
            (([None] * (1 + level - this._internal.column_index_level)) +
             this._internal.column_index_names)
            if this._internal.column_index_names is not None else None)
        return DataFrame(
            this._internal.copy(sdf=joined_df,
                                data_columns=new_data_columns,
                                column_index=column_index,
                                column_index_names=column_index_names))
    else:
        raise ValueError("Cannot combine column argument because "
                         "it comes from a different dataframe")
def combine_frames(this, *args, how="full"):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation, which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from databricks.koalas import Series
    from databricks.koalas import DataFrame
    from databricks.koalas.config import get_option

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            arg._kdf is args[0]._kdf for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        if this is args[0]._kdf:
        return  # We don't need to combine. All Series are in `this`.
        that = args[0]._kdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        if this is args[0]:
        return  # We don't need to combine. `this` and `that` are the same.
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):
        this_index_map = this._internal.index_map
        that_index_map = that._internal.index_map
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = zip(this_index_map.items(),
                                      that_index_map.items())

        # If the same named index is found, that's used.
        for (this_column, this_name), (that_column,
                                       that_name) in this_and_that_index_map:
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this._sdf, this_column)
                that_scol = scol_for(that._sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(),
                           this_scol).otherwise(that_scol).alias(this_column))
            else:
                raise ValueError(
                    "Index names must be exactly matched currently.")

        assert len(
            join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this._sdf.alias("this").join(that._sdf.alias("that"),
                                                 on=join_scols,
                                                 how=how)

        joined_df = joined_df.select(merged_index_scols + [
            this[label]._scol.alias(
                "__this_%s" % this._internal.spark_column_name_for(label))
            for label in this._internal.column_labels
        ] + [
            that[label]._scol.alias(
                "__that_%s" % that._internal.spark_column_name_for(label))
            for label in that._internal.column_labels
        ])

        index_columns = set(this._internal.index_spark_column_names)
        new_data_columns = [
            c for c in joined_df.columns if c not in index_columns
        ]
        level = max(this._internal.column_labels_level,
                    that._internal.column_labels_level)
        column_labels = [
            tuple(["this"] + ([""] * (level - len(label))) + list(label))
            for label in this._internal.column_labels
        ] + [
            tuple(["that"] + ([""] * (level - len(label))) + list(label))
            for label in that._internal.column_labels
        ]
        column_label_names = (
            (([None] * (1 + level - this._internal.column_labels_level)) +
             this._internal.column_label_names)
            if this._internal.column_label_names is not None else None)
        return DataFrame(
            this._internal.copy(
                spark_frame=joined_df,
                column_labels=column_labels,
                data_spark_columns=[
                    scol_for(joined_df, col) for col in new_data_columns
                ],
                column_label_names=column_label_names,
            ))
    else:
        raise ValueError(
            "Cannot combine the series or dataframe because it comes from a different dataframe. "
            "In order to allow this operation, enable 'compute.ops_on_diff_frames' option."
        )
Example #10
    def count(self):
        """
        The expanding count of any non-NaN observations inside the window.

        .. note:: the current implementation of this API uses Spark's Window without
            specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method against very large datasets.

        Returns
        -------
        Series or DataFrame
            Returned object type is determined by the caller of the expanding
            calculation.

        See Also
        --------
        Series.expanding : Calling object with Series data.
        DataFrame.expanding : Calling object with DataFrames.
        DataFrame.count : Count of the full DataFrame.

        Examples
        --------
        >>> s = ks.Series([2, 3, float("nan"), 10])
        >>> s.expanding().count()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        Name: 0, dtype: float64

        >>> s.to_frame().expanding().count()
             0
        0  1.0
        1  2.0
        2  2.0
        3  3.0
        """
        from databricks.koalas import DataFrame, Series

        if isinstance(self.kdf_or_kser, Series):
            kser = self.kdf_or_kser
            # TODO: is this a bug? min_periods is not respected in expanding().count() in pandas.
            # scol = F.when(
            #     F.row_number().over(self._window) > self._min_periods,
            #     F.count(kser._scol).over(self._window)
            # ).otherwise(F.lit(None))
            scol = F.count(kser._scol).over(self._window)
            return kser._with_new_scol(scol).astype('float64').rename(kser.name)
        elif isinstance(self.kdf_or_kser, DataFrame):
            # TODO: deduplicate with other APIs in expanding.
            kdf = self.kdf_or_kser
            applied = []
            for column in kdf.columns:
                applied.append(kdf[column].expanding(self._min_periods).count())

            sdf = kdf._sdf.select(
                kdf._internal.index_scols + [c._scol for c in applied])
            internal = kdf._internal.copy(
                sdf=sdf,
                data_columns=[c._internal.data_columns[0] for c in applied],
                column_index=[c._internal.column_index[0] for c in applied])
            return DataFrame(internal)
Example #11
def combine_frames(this, *args, how="full"):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation, which can be expensive in general.
    So, if the `OPS_ON_DIFF_FRAMES` environment variable is not set,
    this method throws an exception.
    """
    from databricks.koalas import Series
    from databricks.koalas import DataFrame

    if all(isinstance(arg, Series) for arg in args):
        assert all(arg._kdf is args[0]._kdf for arg in args), \
            "Currently only one different DataFrame (from given Series) is supported"
        if this is args[0]._kdf:
        return  # We don't need to combine. All Series are in `this`.
        that = args[0]._kdf[[ser.name for ser in args]]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        if this is args[0]:
        return  # We don't need to combine. `this` and `that` are the same.
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if os.environ.get("OPS_ON_DIFF_FRAMES", "false").lower() == "true":
        this_index_map = this._internal.index_map
        this_data_columns = this._internal.data_columns
        that_index_map = that._internal.index_map
        that_data_columns = that._internal.data_columns
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # If the same named index is found, that's used.
        for this_column, this_name in this_index_map:
            for that_col, that_name in that_index_map:
                if this_name == that_name:
                    # We should merge the Spark columns into one
                    # to mimic pandas' behavior.
                    this_scol = this._internal.scol_for(this_column)
                    that_scol = that._internal.scol_for(that_col)
                    join_scol = this_scol == that_scol
                    join_scols.append(join_scol)
                    merged_index_scols.append(
                        F.when(
                            this_scol.isNotNull(),
                            this_scol).otherwise(that_scol).alias(this_column))
                    break
            else:
                raise ValueError(
                    "Index names must be exactly matched currently.")

        assert len(
            join_scols) > 0, "cannot join with no overlapping index names"

        index_columns = this._internal.index_columns
        joined_df = this._sdf.alias("this").join(that._sdf.alias("that"),
                                                 on=join_scols,
                                                 how=how)

        joined_df = joined_df.select(merged_index_scols + [
            this[c]._scol.alias("__this_%s" % this[c].name)
            for c in this_data_columns
        ] + [
            that[c]._scol.alias("__that_%s" % that[c].name)
            for c in that_data_columns
        ])

        new_data_columns = [
            c for c in joined_df.columns if c not in index_columns
        ]
        return DataFrame(
            this._internal.copy(sdf=joined_df, data_columns=new_data_columns))
    else:
        raise ValueError("Cannot combine column argument because "
                         "it comes from a different dataframe")
Example #12
def align_diff_frames(resolve_func, this, that, fillna=True, how="full"):
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, the `OPS_ON_DIFF_FRAMES` environment variable should be enabled, for now.

    :param resolve_func: Takes the aligned (joined) DataFrame, the columns of the current DataFrame,
        and the columns of the other DataFrame. It returns an iterable that produces Series.

        >>> import os
        >>>
        >>> prev = os.environ.get("OPS_ON_DIFF_FRAMES", "false")
        >>> os.environ["OPS_ON_DIFF_FRAMES"] = "true"
        >>>
        >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(kdf, this_columns, that_columns):
        ...    kdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `kdf`.
        ...    this_column = this_columns[0]  # this is 'a' from kdf1.
        ...    that_column = that_columns[0]  # this is 'a' from kdf2.
        ...    new_series = kdf[this_column] - kdf[that_column]
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield new_series.rename(this_column)  # or list(new_series)
        >>>
        >>>
        >>> align_diff_frames(func, kdf1, kdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> os.environ["OPS_ON_DIFF_FRAMES"] = prev

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, it leaves them as they are.
    :param how: join method. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and 'that' has B, C, D, `this_columns` and
            `that_columns` in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including `that`'s columns.
            For instance, if 'this' has columns A, B, C and 'that' has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
    :return: Aligned DataFrame
    """
    from databricks.koalas import DataFrame

    assert how == "full" or how == "left"

    this_data_columns = this._internal.data_columns
    that_data_columns = that._internal.data_columns
    common_columns = set(this_data_columns).intersection(that_data_columns)

    # 1. Full outer join given two dataframes.
    combined = combine_frames(this, that, how=how)

    # 2. Apply given function to transform the columns in a batch and keep the new columns.
    combined_data_columns = combined._internal.data_columns

    that_columns_to_apply = []
    this_columns_to_apply = []
    additional_that_columns = []
    columns_to_keep = []

    for combined_column in combined_data_columns:
        for common_column in common_columns:
            if combined_column == "__this_%s" % common_column:
                this_columns_to_apply.append(combined_column)
                break
            elif combined_column == "__that_%s" % common_column:
                that_columns_to_apply.append(combined_column)
                break
        else:
            if how == "left" and \
                    combined_column in ["__that_%s" % c for c in that_data_columns]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but pass
                # it later to `func`. `func` should resolve it.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_column)
            elif fillna:
                columns_to_keep.append(
                    F.lit(None).cast(FloatType()).alias(combined_column))
            else:
                columns_to_keep.append(F.col(combined_column))

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    kser_set = list(
        resolve_func(combined, this_columns_to_apply, that_columns_to_apply))
    columns_applied = [c._scol for c in kser_set]

    sdf = combined._sdf.select(combined._internal.index_scols +
                               columns_applied + columns_to_keep)

    # 3. Restore the names back and deduplicate columns.
    this_columns = OrderedDict()
    # Add columns in an order of its original frame.
    new_data_columns = [
        c for c in sdf.columns if c not in combined._internal.index_columns
    ]
    for this_data_column in this_data_columns:
        for new_column in new_data_columns:
            striped = new_column
            if new_column.startswith("__this_") or new_column.startswith(
                    "__that_"):
                striped = new_column[
                    7:]  # cut out the prefix (either __this_ or __that_).

            # Respect the applied columns first if there are duplicated columns found.
            if striped not in this_columns and this_data_column == striped:
                this_columns[striped] = F.col(new_column).alias(striped)
                break

    # After that, we will add the rest columns.
    other_columns = OrderedDict()
    for new_column in new_data_columns:
        striped = new_column
        if new_column.startswith("__this_") or new_column.startswith(
                "__that_"):
            striped = new_column[
                7:]  # cut out the prefix (either __this_ or __that_).

        # Respect the applied columns first if there are duplicated columns found.
        if striped not in this_columns:
            other_columns[striped] = F.col(new_column).alias(striped)

    sdf = sdf.select(combined._internal.index_scols +
                     list(this_columns.values()) +
                     list(other_columns.values()))

    new_data_columns = [
        c for c in sdf.columns if c not in combined._internal.index_columns
    ]
    internal = combined._internal.copy(sdf=sdf, data_columns=new_data_columns)
    return DataFrame(internal)
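
A small hedged variation on the docstring example above, under the same `OPS_ON_DIFF_FRAMES` setup: the resolver may yield any Series derived from the combined frame, here the sum of the two 'a' columns instead of their difference; the `__this_`/`__that_` prefixes are stripped again in step 3.

def add_func(kdf, this_columns, that_columns):
    # `this_columns[0]` is '__this_a' and `that_columns[0]` is '__that_a'
    # in the combined (joined) frame.
    this_column = this_columns[0]
    that_column = that_columns[0]
    yield (kdf[this_column] + kdf[that_column]).rename(this_column)

# With the docstring's kdf1/kdf2, align_diff_frames(add_func, kdf1, kdf2).sort_index()
# would hold 18, 16, 14, ... in column 'a'.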