def _apply_as_series_or_frame(self, func):
    """
    Wraps a function that handles a Spark column so that it can be used with
    both Koalas Series and DataFrame.
    Note that the given `func` name should be the same as the API's method name.
    """
    from databricks.koalas import DataFrame
    from databricks.koalas.series import _col
    from databricks.koalas.groupby import SeriesGroupBy

    kdf = self.kdf
    sdf = self.kdf._sdf

    # Here we need to include the grouped key as an index, and shift the previous index.
    # [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
    new_index_scols = []
    new_index_map = []
    for groupkey in self._groupkeys:
        new_index_scols.append(
            # NOTE THAT this code intentionally uses `F.col` instead of `scol` in
            # the given series. This is because, in the case of a series, we convert
            # it into a DataFrame. So, if the given `groupkeys` is a series, they end
            # up being a different series.
            F.col(name_like_string(groupkey.name))
            .alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        new_index_map.append(
            (SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
             groupkey._internal.column_index[0]))

    for new_index_scol, index_map in zip(kdf._internal.index_scols, kdf._internal.index_map):
        new_index_scols.append(
            new_index_scol.alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        _, name = index_map
        new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

    applied = []
    for column in kdf.columns:
        applied.append(
            kdf[column]._with_new_scol(func(kdf[column]._scol)).rename(kdf[column].name))

    # It seems pandas filters out rows whose grouped key is NA.
    cond = self._groupkeys[0]._scol.isNotNull()
    for c in self._groupkeys:
        cond = cond | c._scol.isNotNull()

    sdf = sdf.select(new_index_scols + [c._scol for c in applied]).filter(cond)

    internal = _InternalFrame(
        sdf=sdf,
        data_columns=[c._internal.data_columns[0] for c in applied],
        index_map=new_index_map)

    ret = DataFrame(internal)
    if isinstance(self._groupby, SeriesGroupBy):
        return _col(ret)
    else:
        return ret
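For context, a minimal sketch of how a window API method could delegate to this wrapper. The `min` method and the `self._window` spec below are illustrative assumptions based on the call pattern above, not the exact Koalas implementation.

from pyspark.sql import functions as F

def min(self):
    # Hypothetical caller on the same class: build a column-to-column function
    # whose name matches the API method name and hand it to the wrapper.
    def min(scol):
        # `self._window` is an assumed pyspark.sql.Window spec prepared elsewhere.
        return F.min(scol).over(self._window)

    return self._apply_as_series_or_frame(min)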
def test_from_pandas(self):
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(internal.index_map, OrderedDict({SPARK_DEFAULT_INDEX_NAME: None}))
    self.assert_eq(internal.column_labels, [("a",), ("b",)])
    self.assert_eq(internal.data_spark_column_names, ["a", "b"])
    self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))
    self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # multi-index
    pdf.set_index("a", append=True, inplace=True)

    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_map,
        OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                     (SPARK_INDEX_NAME_FORMAT(1), ("a",))]),
    )
    self.assert_eq(internal.column_labels, [("b",)])
    self.assert_eq(internal.data_spark_column_names, ["b"])
    self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # multi-index columns
    pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_map,
        OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                     (SPARK_INDEX_NAME_FORMAT(1), ("a",))]),
    )
    self.assert_eq(internal.column_labels, [("x", "b")])
    self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
    self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)
def intersection(self, other) -> "MultiIndex":
    """
    Form the intersection of two Index objects.

    This returns a new Index with elements common to the index and `other`.

    Parameters
    ----------
    other : Index or array-like

    Returns
    -------
    intersection : MultiIndex

    Examples
    --------
    >>> midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    >>> midx2 = ks.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
    >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
    MultiIndex([('c', 'z')],
               )
    """
    if isinstance(other, Series) or not is_list_like(other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    elif isinstance(other, DataFrame):
        raise ValueError("Index data must be 1-dimensional")
    elif isinstance(other, MultiIndex):
        spark_frame_other = other.to_frame().to_spark()
        keep_name = self.names == other.names
    elif isinstance(other, Index):
        # Always returns an empty MultiIndex if `other` is Index.
        return self.to_frame().head(0).index  # type: ignore
    elif not all(isinstance(item, tuple) for item in other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    else:
        other = MultiIndex.from_tuples(list(other))
        spark_frame_other = other.to_frame().to_spark()
        keep_name = True

    default_name = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
    spark_frame_self = self.to_frame(name=default_name).to_spark()
    spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
    if keep_name:
        index_names = self._internal.index_names
    else:
        index_names = None

    internal = InternalFrame(  # TODO: dtypes?
        spark_frame=spark_frame_intersected,
        index_spark_columns=[
            scol_for(spark_frame_intersected, col) for col in default_name
        ],
        index_names=index_names,
    )
    return cast(MultiIndex, DataFrame(internal).index)
def test_from_pandas(self):
    pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.sdf

    self.assert_eq(internal.index_map, [(SPARK_INDEX_NAME_FORMAT(0), None)])
    self.assert_eq(internal.column_index, [('a',), ('b',)])
    self.assert_eq(internal.data_columns, ['a', 'b'])
    self.assertTrue(internal.scol_for(('a',))._jc.equals(sdf['a']._jc))
    self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc))

    self.assert_eq(internal.pandas_df, pdf)

    # multi-index
    pdf.set_index('a', append=True, inplace=True)

    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.sdf

    self.assert_eq(internal.index_map,
                   [(SPARK_INDEX_NAME_FORMAT(0), None), ('a', ('a',))])
    self.assert_eq(internal.column_index, [('b',)])
    self.assert_eq(internal.data_columns, ['b'])
    self.assertTrue(internal.scol_for(('b',))._jc.equals(sdf['b']._jc))

    self.assert_eq(internal.pandas_df, pdf)

    # multi-index columns
    pdf.columns = pd.MultiIndex.from_tuples([('x', 'b')])

    internal = _InternalFrame.from_pandas(pdf)
    sdf = internal.sdf

    self.assert_eq(internal.index_map,
                   [(SPARK_INDEX_NAME_FORMAT(0), None), ('a', ('a',))])
    self.assert_eq(internal.column_index, [('x', 'b')])
    self.assert_eq(internal.data_columns, ['(x, b)'])
    self.assertTrue(internal.scol_for(('x', 'b'))._jc.equals(sdf['(x, b)']._jc))

    self.assert_eq(internal.pandas_df, pdf)
def combine_frames(this, *args, how="full", preserve_order_column=False):
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that uses the prefixes `this_` and `that_` to
    distinguish the column names coming from each DataFrame.

    It internally performs a join operation which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False, this method throws an exception.
    """
    from databricks.koalas.config import get_option
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.internal import (
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from databricks.koalas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(
            this, args[0]), "We don't need to combine. All series is in this."
        that = args[0]._kdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or "
                             "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal, side):
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                [
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ]
                + list(HIDDEN_COLUMNS)
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(this_internal.index_spark_column_names, this_internal.index_names)
        )
        that_index_map = list(
            zip(that_internal.index_spark_column_names, that_internal.index_names)
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        for i, ((this_column, this_name), (that_column, that_name)) in enumerate(
            this_and_that_index_map
        ):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol)
                    .otherwise(that_scol)
                    .alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            merged_index_scols
            + [
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ]
            + [
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ]
            + order_column
        )

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]
        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label):
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [
            tuple(["that"] + fill_label(label)) for label in that_internal.column_labels
        ]
        column_label_names = (
            [None] * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=[
                    scol_for(joined_df, col) for col in index_column_names
                ],
                index_names=this_internal.index_names,
                column_labels=column_labels,
                data_spark_columns=[
                    scol_for(joined_df, col) for col in new_data_columns
                ],
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(
            "Cannot combine the series or dataframe because it comes from a different dataframe. "
            "In order to allow this operation, enable 'compute.ops_on_diff_frames' option."
        )
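A brief usage sketch, assuming `combine_frames` is imported from its defining module: combining columns that live in two different Koalas DataFrames only works when `compute.ops_on_diff_frames` is enabled, and the result carries `this`/`that` prefixed column labels.

import databricks.koalas as ks

kdf1 = ks.DataFrame({"a": [1, 2, 3]})
kdf2 = ks.DataFrame({"b": [4, 5, 6]})

# With the option disabled, combine_frames raises the ValueError shown above.
with ks.option_context("compute.ops_on_diff_frames", True):
    combined = combine_frames(kdf1, kdf2["b"])
    # Data columns come back labeled ('this', 'a') and ('that', 'b').
    print(combined.columns)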
def test_from_pandas(self):
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
    self.assert_eq(internal.index_names, [None])
    self.assert_eq(internal.column_labels, [("a",), ("b",)])
    self.assert_eq(internal.data_spark_column_names, ["a", "b"])
    self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))
    self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # non-string column name
    pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})

    internal = InternalFrame.from_pandas(pdf1)
    sdf = internal.spark_frame

    self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
    self.assert_eq(internal.index_names, [None])
    self.assert_eq(internal.column_labels, [(0,), (1,)])
    self.assert_eq(internal.data_spark_column_names, ["0", "1"])
    self.assertTrue(internal.spark_column_for((0,))._jc.equals(sdf["0"]._jc))
    self.assertTrue(internal.spark_column_for((1,))._jc.equals(sdf["1"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf1)

    # multi-index
    pdf.set_index("a", append=True, inplace=True)

    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_spark_column_names,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
    )
    self.assert_eq(internal.index_names, [None, ("a",)])
    self.assert_eq(internal.column_labels, [("b",)])
    self.assert_eq(internal.data_spark_column_names, ["b"])
    self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)

    # multi-index columns
    pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

    internal = InternalFrame.from_pandas(pdf)
    sdf = internal.spark_frame

    self.assert_eq(
        internal.index_spark_column_names,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
    )
    self.assert_eq(internal.index_names, [None, ("a",)])
    self.assert_eq(internal.column_labels, [("x", "b")])
    self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
    self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

    self.assert_eq(internal.to_pandas_frame, pdf)
def attach_id_column(self, id_type: str, column: Union[str, Tuple[str, ...]]) -> "DataFrame":
    """
    Attach a column to be used as an identifier of rows, similar to the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

          .. note:: this uses Spark's Window without specifying a partition specification.
              This leads to moving all data into a single partition on a single machine and
              could cause serious performance degradation.
              Avoid this method with very large datasets.

        - 'distributed-sequence' : a sequence that increases one by one,
          by a group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
          monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The column name.

    Returns
    -------
    DataFrame
        The DataFrame with the attached column.

    Examples
    --------
    >>> df = ks.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.koalas.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.koalas.attach_id_column(id_type="distributed-sequence", column="id").sort_index()
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.koalas.attach_id_column(id_type="distributed", column="id")
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x   id
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ks.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.koalas.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2
    """
    from databricks.koalas.frame import DataFrame

    if id_type == "sequence":
        attach_func = InternalFrame.attach_sequence_column
    elif id_type == "distributed-sequence":
        attach_func = InternalFrame.attach_distributed_sequence_column
    elif id_type == "distributed":
        attach_func = InternalFrame.attach_distributed_column
    else:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )

    if isinstance(column, str):
        column = (column,)
    else:
        assert isinstance(column, tuple), type(column)

    internal = self._kdf._internal

    if len(column) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns."
            .format(column)
        )
    elif column in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(column))
        )

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    sdf = internal.spark_frame.select(
        [
            scol.alias(SPARK_INDEX_NAME_FORMAT(i))
            for i, scol in enumerate(internal.index_spark_columns)
        ]
        + [
            scol.alias(name_like_string(label))
            for scol, label in zip(internal.data_spark_columns, internal.column_labels)
        ]
    )
    sdf = attach_func(sdf, name_like_string(column))

    return DataFrame(
        InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict(
                [
                    (SPARK_INDEX_NAME_FORMAT(i), name)
                    for i, name in enumerate(internal.index_names)
                ]
            ),
            column_labels=internal.column_labels + [column],
            data_spark_columns=(
                [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                + [scol_for(sdf, name_like_string(column))]
            ),
            column_label_names=internal.column_label_names,
        ).resolved_copy
    )
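As the call `attach_func(sdf, name_like_string(column))` above suggests, the `attach_*` helpers operate directly on a Spark DataFrame and return a new Spark DataFrame with the extra column appended. A small sketch of that internal usage; the variable names are illustrative.

import databricks.koalas as ks
from databricks.koalas.internal import InternalFrame

sdf = ks.DataFrame({"x": ["a", "b", "c"]}).to_spark()

# Append a monotonically increasing (but not necessarily consecutive) id column.
sdf_with_id = InternalFrame.attach_distributed_column(sdf, "id")
sdf_with_id.show()  # the id values depend on partitioning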
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    For Series

    >>> df = ks.DataFrame({'x': [0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64

    For Index

    >>> from databricks.koalas.indexes import Index
    >>> idx = Index([3, 1, 2, 3, 4, np.nan])
    >>> idx
    Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

    >>> idx.value_counts().sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **sort**

    With `sort` set to `False`, the result wouldn't be sorted by number of count.

    >>> idx.value_counts(sort=False).sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **normalize**

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> idx.value_counts(normalize=True).sort_index()
    1.0    0.2
    2.0    0.2
    3.0    0.4
    4.0    0.2
    Name: count, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    NaN    1
    Name: count, dtype: int64

    For MultiIndex.

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       ['speed', 'weight', 'length']],
    ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
    >>> s.index  # doctest: +SKIP
    MultiIndex([(  'lama', 'weight'),
                (  'lama', 'weight'),
                (  'lama', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'length'),
                ('falcon', 'weight'),
                ('falcon', 'length'),
                ('falcon', 'length')],
               )

    >>> s.index.value_counts().sort_index()
    (cow, length)       1
    (cow, weight)       2
    (falcon, length)    2
    (falcon, weight)    1
    (lama, weight)      3
    Name: count, dtype: int64

    >>> s.index.value_counts(normalize=True).sort_index()
    (cow, length)       0.111111
    (cow, weight)       0.222222
    (falcon, length)    0.222222
    (falcon, weight)    0.111111
    (lama, weight)      0.333333
    Name: count, dtype: float64

    If the Index has a name, the name is kept.

    >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
    >>> idx.value_counts().sort_index()
    0    3
    1    2
    2    1
    3    1
    Name: koalas, dtype: int64
    """
    from databricks.koalas.series import Series, _col

    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._internal._sdf.dropna()
    else:
        sdf_dropna = self._internal._sdf
    index_name = SPARK_INDEX_NAME_FORMAT(0)
    sdf = sdf_dropna.groupby(self._scol.alias(index_name)).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

    column_index = self._internal.column_index
    if (column_index[0] is None) or (None in column_index[0]):
        internal = _InternalFrame(sdf=sdf,
                                  index_map=[(index_name, None)],
                                  column_scols=[scol_for(sdf, 'count')])
    else:
        internal = _InternalFrame(sdf=sdf,
                                  index_map=[(index_name, None)],
                                  column_index=column_index,
                                  column_scols=[scol_for(sdf, 'count')],
                                  column_index_names=self._internal.column_index_names)

    return _col(DataFrame(internal))
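To make the Spark-level steps in the body above concrete, here is a rough equivalent in plain PySpark on the Series' backing frame; the column and index names are illustrative, not the exact internal names.

import databricks.koalas as ks
from pyspark.sql import functions as F

sdf = ks.DataFrame({"x": [0, 0, 1, 1, 1, None]}).to_spark()

# dropna -> group by the value -> count -> sort by the count column,
# mirroring what value_counts does on the internal Spark frame.
(sdf.dropna()
    .groupby(F.col("x").alias("__index_level_0__"))
    .count()
    .orderBy(F.col("count").desc())
    .show())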