def _apply_as_series_or_frame(self, func): """ Decorator that can wraps a function that handles Spark column in order to support it in both Koalas Series and DataFrame. Note that the given `func` name should be same as the API's method name. """ from databricks.koalas import DataFrame, Series if isinstance(self.kdf_or_kser, Series): kser = self.kdf_or_kser return kser._with_new_scol(func(kser._scol)).rename(kser.name) elif isinstance(self.kdf_or_kser, DataFrame): kdf = self.kdf_or_kser applied = [] for column in kdf.columns: applied.append( getattr( kdf[column].rolling(self._window_val + 1, self._min_periods), func.__name__)()) sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied]) internal = kdf._internal.copy( sdf=sdf, data_columns=[c._internal.data_columns[0] for c in applied], column_index=[c._internal.column_index[0] for c in applied]) return DataFrame(internal)
def _apply_as_series_or_frame(self, func): """ Wraps a function that handles Spark column in order to support it in both Koalas Series and DataFrame. Note that the given `func` name should be same as the API's method name. """ from databricks.koalas import DataFrame from databricks.koalas.series import _col from databricks.koalas.groupby import SeriesGroupBy kdf = self.kdf sdf = self.kdf._sdf # Here we need to include grouped key as an index, and shift previous index. # [index_column0, index_column1] -> [grouped key, index_column0, index_column1] new_index_scols = [] new_index_map = [] for groupkey in self._groupkeys: new_index_scols.append( # NOTE THAT this code intentionally uses `F.col` instead of `scol` in # given series. This is because, in case of series, we convert it into # DataFrame. So, if the given `groupkeys` is a series, they end up with # being a different series. F.col(name_like_string(groupkey.name) ).alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols)))) new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)), groupkey._internal.column_index[0])) for new_index_scol, index_map in zip(kdf._internal.index_scols, kdf._internal.index_map): new_index_scols.append( new_index_scol.alias( SPARK_INDEX_NAME_FORMAT(len(new_index_scols)))) _, name = index_map new_index_map.append( (SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name)) applied = [] for column in kdf.columns: applied.append(kdf[column]._with_new_scol(func( kdf[column]._scol)).rename(kdf[column].name)) # Seems like pandas filters out when grouped key is NA. cond = self._groupkeys[0]._scol.isNotNull() for c in self._groupkeys: cond = cond | c._scol.isNotNull() sdf = sdf.select(new_index_scols + [c._scol for c in applied]).filter(cond) internal = _InternalFrame( sdf=sdf, data_columns=[c._internal.data_columns[0] for c in applied], index_map=new_index_map) ret = DataFrame(internal) if isinstance(self._groupby, SeriesGroupBy): return _col(ret) else: return ret
def get_sampled(self, data):
    from databricks.koalas import DataFrame, Series

    self.fraction = 1 / (len(data) / 1000)  # make sure the records are roughly 1000.
    if self.fraction > 1:
        self.fraction = 1

    if isinstance(data, DataFrame):
        sampled = data._sdf.sample(fraction=float(self.fraction))
        return DataFrame(data._internal.copy(sdf=sampled)).to_pandas()
    elif isinstance(data, Series):
        scol = data._scol
        sampled = data._kdf._sdf.sample(fraction=float(self.fraction))
        return DataFrame(data._kdf._internal.copy(sdf=sampled, scol=scol)).to_pandas()
    else:
        raise ValueError("Only DataFrame and Series are supported for plotting.")
def get_sampled(self, data):
    from databricks.koalas import DataFrame

    self.fraction = 1 / (len(data) / 1000)  # make sure the records are roughly 1000.
    if self.fraction > 1:
        self.fraction = 1

    sampled = data._kdf._sdf.sample(fraction=float(self.fraction))
    return DataFrame(data._kdf._internal.copy(sdf=sampled)).to_pandas()
def get_sampled(self, data):
    from databricks.koalas import DataFrame, Series

    fraction = get_option("plotting.sample_ratio")
    if fraction is None:
        fraction = 1 / (len(data) / get_option("plotting.max_rows"))
        fraction = min(1., fraction)
    self.fraction = fraction

    if isinstance(data, (DataFrame, Series)):
        if isinstance(data, Series):
            data = data.to_frame()
        sampled = data._sdf.sample(fraction=self.fraction)
        return DataFrame(data._internal.copy(sdf=sampled)).to_pandas()
    else:
        raise ValueError("Only DataFrame and Series are supported for plotting.")
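# A standalone sketch of the fraction computation above, with the two plotting
# options inlined as hypothetical arguments (`sample_ratio`, `max_rows`) so the
# behavior can be checked without Spark. The helper name is illustrative only.
def sampling_fraction(n_rows, sample_ratio=None, max_rows=1000):
    fraction = sample_ratio
    if fraction is None:
        fraction = 1 / (n_rows / max_rows)
        fraction = min(1.0, fraction)
    return fraction

assert sampling_fraction(100) == 1.0           # small data: keep every row
assert sampling_fraction(100000) == 0.01       # large data: keep roughly max_rows rows
assert sampling_fraction(100000, 0.5) == 0.5   # an explicit ratio is used as-is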
def hint(self, name: str, *parameters) -> "ks.DataFrame":
    """
    Specifies some hint on the current DataFrame.

    Parameters
    ----------
    name : A name of the hint.
    parameters : Optional parameters.

    Returns
    -------
    ret : DataFrame with the hint.

    See Also
    --------
    broadcast : Marks a DataFrame as small enough for use in broadcast joins.

    Examples
    --------
    >>> df1 = ks.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [1, 2, 3, 5]},
    ...                    columns=['lkey', 'value'])
    >>> df2 = ks.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [5, 6, 7, 8]},
    ...                    columns=['rkey', 'value'])
    >>> merged = df1.merge(df2.spark.hint("broadcast"), left_on='lkey', right_on='rkey')
    >>> merged.spark.explain()  # doctest: +ELLIPSIS
    == Physical Plan ==
    ...
    ...BroadcastHashJoin...
    ...
    """
    from databricks.koalas.frame import DataFrame

    return DataFrame(
        self._kdf._internal.with_new_sdf(
            self._kdf._internal.spark_frame.hint(name, *parameters)
        )
    )
def apply(self, func): """ Applies a function that takes and returns a Spark column. It allows to natively apply a Spark function and column APIs with the Spark column internally used in Series or Index. .. note:: It forces to lose the index and end up with using default index. It is preferred to use :meth:`Series.spark.transform` or `:meth:`DataFrame.spark.apply` with specifying the `inedx_col`. .. note:: It does not require to have the same length of the input and output. However, it requires to create a new DataFrame internally which will require to set `compute.ops_on_diff_frames` to compute even with the same origin DataFrame that is expensive, whereas :meth:`Series.spark.transform` does not require it. Parameters ---------- func : function Function to apply the function against the data by using Spark columns. Returns ------- Series Raises ------ ValueError : If the output from the function is not a Spark column. Examples -------- >>> from databricks import koalas as ks >>> from pyspark.sql.functions import count, lit >>> df = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"]) >>> df a b 0 1 4 1 2 5 2 3 6 >>> df.a.spark.apply(lambda c: count(c)) 0 3 Name: a, dtype: int64 >>> df.a.spark.apply(lambda c: c + df.b.spark.column) 0 5 1 7 2 9 Name: a, dtype: int64 """ from databricks.koalas import Index, DataFrame, Series from databricks.koalas.series import first_series from databricks.koalas.internal import HIDDEN_COLUMNS if isinstance(self._data, Index): raise NotImplementedError( "Index does not support spark.apply yet.") output = func(self._data.spark.column) if not isinstance(output, Column): raise ValueError("The output of the function [%s] should be of a " "pyspark.sql.Column; however, got [%s]." % (func, type(output))) assert isinstance(self._data, Series) sdf = self._data._internal.spark_frame.drop( *HIDDEN_COLUMNS).select(output) # Lose index. kdf = DataFrame(sdf) kdf.columns = [self._data.name] return first_series(kdf)
def combine_frames(this, *args, how="full"): """ This method combines `this` DataFrame with a different `that` DataFrame or Series from a different DataFrame. It returns a DataFrame that has prefix `this_` and `that_` to distinct the columns names from both DataFrames It internally performs a join operation which can be expensive in general. So, if `compute.ops_on_diff_frames` option is False, this method throws an exception. """ from databricks.koalas import Series from databricks.koalas import DataFrame from databricks.koalas.config import get_option if all(isinstance(arg, Series) for arg in args): assert all(arg._kdf is args[0]._kdf for arg in args), \ "Currently only one different DataFrame (from given Series) is supported" if this is args[0]._kdf: return # We don't need to combine. All series is in this. that = args[0]._kdf[[ser.name for ser in args]] elif len(args) == 1 and isinstance(args[0], DataFrame): assert isinstance(args[0], DataFrame) if this is args[0]: return # We don't need to combine. `this` and `that` are same. that = args[0] else: raise AssertionError("args should be single DataFrame or " "single/multiple Series") if get_option("compute.ops_on_diff_frames"): this_index_map = this._internal.index_map that_index_map = that._internal.index_map assert len(this_index_map) == len(that_index_map) join_scols = [] merged_index_scols = [] # If the same named index is found, that's used. for this_column, this_name in this_index_map: for that_col, that_name in that_index_map: if this_name == that_name: # We should merge the Spark columns into one # to mimic pandas' behavior. this_scol = this._internal.scol_for(this_column) that_scol = that._internal.scol_for(that_col) join_scol = this_scol == that_scol join_scols.append(join_scol) merged_index_scols.append( F.when( this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(this_column)) break else: raise ValueError( "Index names must be exactly matched currently.") assert len( join_scols) > 0, "cannot join with no overlapping index names" joined_df = this._sdf.alias("this").join(that._sdf.alias("that"), on=join_scols, how=how) joined_df = joined_df.select(merged_index_scols + [ this[idx]._scol.alias("__this_%s" % this._internal.column_name_for(idx)) for idx in this._internal.column_index ] + [ that[idx]._scol.alias("__that_%s" % that._internal.column_name_for(idx)) for idx in that._internal.column_index ]) index_columns = set(this._internal.index_columns) new_data_columns = [ c for c in joined_df.columns if c not in index_columns ] level = max(this._internal.column_index_level, that._internal.column_index_level) column_index = ([ tuple(['this'] + ([''] * (level - len(idx))) + list(idx)) for idx in this._internal.column_index ] + [ tuple(['that'] + ([''] * (level - len(idx))) + list(idx)) for idx in that._internal.column_index ]) column_index_names = ( (([None] * (1 + level - len(this._internal.column_index_level))) + this._internal.column_index_names) if this._internal.column_index_names is not None else None) return DataFrame( this._internal.copy(sdf=joined_df, data_columns=new_data_columns, column_index=column_index, column_index_names=column_index_names)) else: raise ValueError("Cannot combine column argument because " "it comes from a different dataframe")
def combine_frames(this, *args, how="full"): """ This method combines `this` DataFrame with a different `that` DataFrame or Series from a different DataFrame. It returns a DataFrame that has prefix `this_` and `that_` to distinct the columns names from both DataFrames It internally performs a join operation which can be expensive in general. So, if `compute.ops_on_diff_frames` option is False, this method throws an exception. """ from databricks.koalas import Series from databricks.koalas import DataFrame from databricks.koalas.config import get_option if all(isinstance(arg, Series) for arg in args): assert all( arg._kdf is args[0]._kdf for arg in args ), "Currently only one different DataFrame (from given Series) is supported" if this is args[0]._kdf: return # We don't need to combine. All series is in this. that = args[0]._kdf[list(args)] elif len(args) == 1 and isinstance(args[0], DataFrame): assert isinstance(args[0], DataFrame) if this is args[0]: return # We don't need to combine. `this` and `that` are same. that = args[0] else: raise AssertionError("args should be single DataFrame or " "single/multiple Series") if get_option("compute.ops_on_diff_frames"): this_index_map = this._internal.index_map that_index_map = that._internal.index_map assert len(this_index_map) == len(that_index_map) join_scols = [] merged_index_scols = [] # Note that the order of each element in index_map is guaranteed according to the index # level. this_and_that_index_map = zip(this_index_map.items(), that_index_map.items()) # If the same named index is found, that's used. for (this_column, this_name), (that_column, that_name) in this_and_that_index_map: if this_name == that_name: # We should merge the Spark columns into one # to mimic pandas' behavior. this_scol = scol_for(this._sdf, this_column) that_scol = scol_for(that._sdf, that_column) join_scol = this_scol == that_scol join_scols.append(join_scol) merged_index_scols.append( F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(this_column)) else: raise ValueError( "Index names must be exactly matched currently.") assert len( join_scols) > 0, "cannot join with no overlapping index names" joined_df = this._sdf.alias("this").join(that._sdf.alias("that"), on=join_scols, how=how) joined_df = joined_df.select(merged_index_scols + [ this[label]._scol.alias( "__this_%s" % this._internal.spark_column_name_for(label)) for label in this._internal.column_labels ] + [ that[label]._scol.alias( "__that_%s" % that._internal.spark_column_name_for(label)) for label in that._internal.column_labels ]) index_columns = set(this._internal.index_spark_column_names) new_data_columns = [ c for c in joined_df.columns if c not in index_columns ] level = max(this._internal.column_labels_level, that._internal.column_labels_level) column_labels = [ tuple(["this"] + ([""] * (level - len(label))) + list(label)) for label in this._internal.column_labels ] + [ tuple(["that"] + ([""] * (level - len(label))) + list(label)) for label in that._internal.column_labels ] column_label_names = ( (([None] * (1 + level - len(this._internal.column_labels_level))) + this._internal.column_label_names) if this._internal.column_label_names is not None else None) return DataFrame( this._internal.copy( spark_frame=joined_df, column_labels=column_labels, data_spark_columns=[ scol_for(joined_df, col) for col in new_data_columns ], column_label_names=column_label_names, )) else: raise ValueError( "Cannot combine the series or dataframe because it comes from a different dataframe. 
" "In order to allow this operation, enable 'compute.ops_on_diff_frames' option." )
def count(self): """ The expanding count of any non-NaN observations inside the window. .. note:: the current implementation of this API uses Spark's Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset. Returns ------- Series or DataFrame Returned object type is determined by the caller of the expanding calculation. See Also -------- Series.expanding : Calling object with Series data. DataFrame.expanding : Calling object with DataFrames. DataFrame.count : Count of the full DataFrame. Examples -------- >>> s = ks.Series([2, 3, float("nan"), 10]) >>> s.expanding().count() 0 1.0 1 2.0 2 2.0 3 3.0 Name: 0, dtype: float64 >>> s.to_frame().expanding().count() 0 0 1.0 1 2.0 2 2.0 3 3.0 """ from databricks.koalas import DataFrame, Series if isinstance(self.kdf_or_kser, Series): kser = self.kdf_or_kser # TODO: is this a bug? min_periods is not respected in expanding().count() in pandas. # scol = F.when( # F.row_number().over(self._window) > self._min_periods, # F.count(kser._scol).over(self._window) # ).otherwise(F.lit(None)) scol = F.count(kser._scol).over(self._window) return kser._with_new_scol(scol).astype('float64').rename(kser.name) elif isinstance(self.kdf_or_kser, DataFrame): # TODO: deduplicate with other APIs in expanding. kdf = self.kdf_or_kser applied = [] for column in kdf.columns: applied.append(kdf[column].expanding(self._min_periods).count()) sdf = kdf._sdf.select( kdf._internal.index_scols + [c._scol for c in applied]) internal = kdf._internal.copy( sdf=sdf, data_columns=[c._internal.data_columns[0] for c in applied], column_index=[c._internal.column_index[0] for c in applied]) return DataFrame(internal)
def combine_frames(this, *args, how="full"): """ This method combines `this` DataFrame with a different `that` DataFrame or Series from a different DataFrame. It returns a DataFrame that has prefix `this_` and `that_` to distinct the columns names from both DataFrames It internally performs a join operation which can be expensive in general. So, if `OPS_ON_DIFF_FRAMES` environment variable is not set, this method throws an exception. """ from databricks.koalas import Series from databricks.koalas import DataFrame if all(isinstance(arg, Series) for arg in args): assert all(arg._kdf is args[0]._kdf for arg in args), \ "Currently only one different DataFrame (from given Series) is supported" if this is args[0]._kdf: return # We don't need to combine. All series is in this. that = args[0]._kdf[[ser.name for ser in args]] elif len(args) == 1 and isinstance(args[0], DataFrame): assert isinstance(args[0], DataFrame) if this is args[0]: return # We don't need to combine. `this` and `that` are same. that = args[0] else: raise AssertionError("args should be single DataFrame or " "single/multiple Series") if os.environ.get("OPS_ON_DIFF_FRAMES", "false").lower() == "true": this_index_map = this._internal.index_map this_data_columns = this._internal.data_columns that_index_map = that._internal.index_map that_data_columns = that._internal.data_columns assert len(this_index_map) == len(that_index_map) join_scols = [] merged_index_scols = [] # If the same named index is found, that's used. for this_column, this_name in this_index_map: for that_col, that_name in that_index_map: if this_name == that_name: # We should merge the Spark columns into one # to mimic pandas' behavior. this_scol = this._internal.scol_for(this_column) that_scol = that._internal.scol_for(that_col) join_scol = this_scol == that_scol join_scols.append(join_scol) merged_index_scols.append( F.when( this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(this_column)) break else: raise ValueError( "Index names must be exactly matched currently.") assert len( join_scols) > 0, "cannot join with no overlapping index names" index_columns = this._internal.index_columns joined_df = this._sdf.alias("this").join(that._sdf.alias("that"), on=join_scols, how=how) joined_df = joined_df.select(merged_index_scols + [ this[c]._scol.alias("__this_%s" % this[c].name) for c in this_data_columns ] + [ that[c]._scol.alias("__that_%s" % that[c].name) for c in that_data_columns ]) new_data_columns = [ c for c in joined_df.columns if c not in index_columns ] return DataFrame( this._internal.copy(sdf=joined_df, data_columns=new_data_columns)) else: raise ValueError("Cannot combine column argument because " "it comes from a different dataframe")
def align_diff_frames(resolve_func, this, that, fillna=True, how="full"): """ This method aligns two different DataFrames with a given `func`. Columns are resolved and handled within the given `func`. To use this, `OPS_ON_DIFF_FRAMES` environment variable should be enabled, for now. :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and the column of another DataFrame. It returns an iterable that produces Series. >>> import os >>> >>> prev = os.environ.get("OPS_ON_DIFF_FRAMES", "false") >>> os.environ["OPS_ON_DIFF_FRAMES"] = "true" >>> >>> kdf1 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> kdf2 = ks.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]}) >>> >>> def func(kdf, this_columns, that_columns): ... kdf # conceptually this is A + B. ... ... # Within this function, Series from A or B can be performed against `kdf`. ... this_column = this_columns[0] # this is 'a' from kdf1. ... that_column = that_columns[0] # this is 'a' from kdf2. ... new_series = kdf[this_column] - kdf[that_column] ... ... # This new series will be placed in new DataFrame. ... yield new_series.rename(this_column) # or list(new_series) >>> >>> >>> align_diff_frames(func, kdf1, kdf2).sort_index() a 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 >>> os.environ["OPS_ON_DIFF_FRAMES"] = prev :param this: a DataFrame to align :param that: another DataFrame to align :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`. Otherwise, it returns as are. :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict. - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and 'that_columns' in this function are B, C and B, C. - left: `resolve_func` should resolve columns including that columns. For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is B, C but `that_columns` are B, C, D. :return: Alined DataFrame """ from databricks.koalas import DataFrame assert how == "full" or how == "left" this_data_columns = this._internal.data_columns that_data_columns = that._internal.data_columns common_columns = set(this_data_columns).intersection(that_data_columns) # 1. Full outer join given two dataframes. combined = combine_frames(this, that, how=how) # 2. Apply given function to transform the columns in a batch and keep the new columns. combined_data_columns = combined._internal.data_columns that_columns_to_apply = [] this_columns_to_apply = [] additional_that_columns = [] columns_to_keep = [] for combined_column in combined_data_columns: for common_column in common_columns: if combined_column == "__this_%s" % common_column: this_columns_to_apply.append(combined_column) break elif combined_column == "__that_%s" % common_column: that_columns_to_apply.append(combined_column) break else: if how == "left" and \ combined_column in ["__that_%s" % c for c in that_data_columns]: # In this case, we will drop `that_columns` in `columns_to_keep` but passes # it later to `func`. `func` should resolve it. # Note that adding this into a separate list (`additional_that_columns`) # is intentional so that `this_columns` and `that_columns` can be paired. 
additional_that_columns.append(combined_column) elif fillna: columns_to_keep.append( F.lit(None).cast(FloatType()).alias(combined_column)) else: columns_to_keep.append(F.col(combined_column)) that_columns_to_apply += additional_that_columns # Should extract columns to apply and do it in a batch in case # it adds new columns for example. kser_set = list( resolve_func(combined, this_columns_to_apply, that_columns_to_apply)) columns_applied = [c._scol for c in kser_set] sdf = combined._sdf.select(combined._internal.index_scols + columns_applied + columns_to_keep) # 3. Restore the names back and deduplicate columns. this_columns = OrderedDict() # Add columns in an order of its original frame. new_data_columns = [ c for c in sdf.columns if c not in combined._internal.index_columns ] for this_data_column in this_data_columns: for new_column in new_data_columns: striped = new_column if new_column.startswith("__this_") or new_column.startswith( "__that_"): striped = new_column[ 7:] # cut out the prefix (either __this_ or __that_). # Respect the applied columns first if there are duplicated columns found. if striped not in this_columns and this_data_column == striped: this_columns[striped] = F.col(new_column).alias(striped) break # After that, we will add the rest columns. other_columns = OrderedDict() for new_column in new_data_columns: striped = new_column if new_column.startswith("__this_") or new_column.startswith( "__that_"): striped = new_column[ 7:] # cut out the prefix (either __this_ or __that_). # Respect the applied columns first if there are duplicated columns found. if striped not in this_columns: other_columns[striped] = F.col(new_column).alias(striped) sdf = sdf.select(combined._internal.index_scols + list(this_columns.values()) + list(other_columns.values())) new_data_columns = [ c for c in sdf.columns if c not in combined._internal.index_columns ] internal = combined._internal.copy(sdf=sdf, data_columns=new_data_columns) return DataFrame(internal)