def _init_from_spark(self, sdf, metadata=None, *args):
    self._sdf = sdf
    if metadata is None:
        self._metadata = Metadata(column_fields=self._sdf.schema.fieldNames())
    else:
        self._metadata = metadata
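# A minimal usage sketch for the Spark-dispatched constructor above. Assumptions
# (not in the original source): a SparkSession built as below and the package
# importable as databricks.koalas.
from pyspark.sql import SparkSession
import databricks.koalas as ks

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'label'])
kdf = ks.DataFrame(sdf)  # no metadata passed, so columns default to sdf.schema.fieldNames()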
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        # Renamed from `sum` to avoid shadowing the builtin.
        total = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
    return _col(kdf)
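# Hedged usage sketch for value_counts (assumes databricks.koalas is importable
# and a Spark session is active; not part of the original source):
import databricks.koalas as ks

kser = ks.DataFrame({'x': ['a', 'b', 'a', None]}).x
kser.value_counts()                 # descending counts, NA excluded by default
kser.value_counts(normalize=True)   # relative frequencies instead of counts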
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf

    column_fields = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places; it should be in utils.
            # Special-case floating point types because Spark's count treats nan as a valid
            # value, whereas pandas' count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                column_fields.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                column_fields.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    metadata = Metadata(column_fields=column_fields,
                        index_info=[('__index_level_{}__'.format(i), s.name)
                                    for i, s in enumerate(groupkeys)])
    return DataFrame(sdf, metadata)
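# Sketch of how the reducer above is typically driven. Assumption (not shown in
# this snippet): GroupBy aggregations such as max() forward a Spark function
# like F.max together with an only_numeric flag to _reduce_for_stat_function.
import databricks.koalas as ks

kdf = ks.DataFrame({'k': ['a', 'a', 'b'], 'v': [1.0, float('nan'), 3.0]})
kdf.groupby('k').max()  # float columns are nulled via F.nanvl first, matching pandas' NA handling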
def from_pandas(self, pdf):
    if isinstance(pdf, pd.Series):
        return _col(self.from_pandas(pd.DataFrame(pdf)))
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    df = self.createDataFrame(reset_index)
    df._metadata = metadata
    return df
def _init_from_pandas(self, pdf, *args):
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    schema = StructType([StructField(name, infer_pd_series_spark_type(col),
                                     nullable=bool(col.isnull().any()))
                         for name, col in reset_index.iteritems()])
    for name, col in reset_index.iteritems():
        dt = col.dtype
        if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
            continue
        reset_index[name] = col.replace({np.nan: None})
    self._init_from_spark(default_session().createDataFrame(reset_index, schema=schema),
                          metadata)
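# A self-contained sketch of the NaN -> None normalization above, in plain pandas.
# Spark maps Python None to SQL NULL, whereas a float NaN would otherwise survive
# as a NaN double, so non-datetime columns are normalized before createDataFrame.
# `pdf` here is an illustrative frame, not from the original source.
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

pdf = pd.DataFrame({'x': [1.0, np.nan], 'y': ['a', None]})
for name, col in pdf.iteritems():
    if not is_datetime64_any_dtype(col.dtype):
        pdf[name] = col.replace({np.nan: None})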
def from_pandas(pdf):
    """Create a Koalas DataFrame from a pandas DataFrame.

    This is similar to Spark's `SparkSession.createDataFrame()` with a pandas DataFrame,
    but this also picks up the index of the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return _col(from_pandas(pd.DataFrame(pdf)))
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    df = default_session().createDataFrame(reset_index)
    df._metadata = metadata
    return df
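# Usage sketch for from_pandas (assumption: this function is exported as
# ks.from_pandas; not part of the original source):
import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
kdf = ks.from_pandas(pdf)  # the 'x'/'y' index is preserved, unlike plain createDataFrame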
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        df_dropna = self._pandas_anchor._spark_filter(self.notna())
    else:
        df_dropna = self._pandas_anchor
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())

    if normalize:
        # Renamed from `sum` to avoid shadowing the builtin.
        total = df_dropna._spark_count()
        df = df._spark_withColumn('count', F._spark_col('count') / F._spark_lit(total))

    index_name = 'index' if self.name != 'index' else 'level_0'
    df.columns = [index_name, self.name]
    df._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
    return _col(df)
def to_dataframe(self):
    sdf = self._kdf._sdf.select([field for field, _ in self._index_info] + [self._scol])
    metadata = Metadata(column_fields=[sdf.schema[-1].name], index_info=self._index_info)
    return DataFrame(sdf, metadata)
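# Sketch: to_dataframe promotes a Series back into a single-column DataFrame,
# carrying its index columns along (assumes databricks.koalas importable as ks;
# illustrative only):
import databricks.koalas as ks

kser = ks.DataFrame({'a': [1, 2]}).a
kdf_one_col = kser.to_dataframe()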
def _init_from_pandas(self, pdf, *args):
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    self._init_from_spark(default_session().createDataFrame(reset_index), metadata)
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to a Pandas DataFrame logically. This holds a Spark
    DataFrame internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """

    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
        pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        self._init_from_spark(default_session().createDataFrame(reset_index), metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [self._sdf.__getitem__(field)
                for field in self._metadata.index_fields]

    def _reduce_for_stat_function(self, sfun):
        sdf = self._sdf.select([sfun(self._sdf[col]).alias(col) for col in self.columns])
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series

    def corr(self, method='pearson'):
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'spearman'}
            * pearson : standard correlation coefficient
            * spearman : Spearman rank correlation

        Returns
        -------
        y : pandas.DataFrame

        See Also
        --------
        Series.corr

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr('pearson')
                  dogs      cats
        dogs  1.000000 -0.851064
        cats -0.851064  1.000000

        >>> df.corr('spearman')
                  dogs      cats
        dogs  1.000000 -0.948683
        cats -0.948683  1.000000

        Notes
        -----
        There are behavior differences between Koalas and pandas.

        * the `method` argument only accepts 'pearson', 'spearman'
        * the data should not contain NaNs. Koalas will return an error.
        * Koalas doesn't support the following argument(s).

          * `min_periods` argument is not supported
        """
        return corr(self, method)

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    @derived_from(pd.DataFrame)
    def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True,
                na_rep='NaN', formatters=None, float_format=None, sparsify=None,
                index_names=True, justify=None, max_rows=None, max_cols=None,
                show_dimensions=False, decimal='.', bold_rows=True, classes=None,
                escape=True, notebook=False, border=None, table_id=None,
                render_links=False):
        return self.toPandas().to_html(
            buf=buf, columns=columns, col_space=col_space, header=header, index=index,
            na_rep=na_rep, formatters=formatters, float_format=float_format,
            sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
            max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal,
            bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
            border=border, table_id=table_id, render_links=render_links)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError('Currently supported only when the DataFrame has a single index.')
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [column for column in self._metadata.column_fields
                       if column not in keys]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column) for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns, index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information
        in the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any
        are None. For a standard index, the index name will be used (if set), otherwise a
        default 'index' or 'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by
                      default.
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index
                     to the default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError('Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [(column, name if name is not None else rename(i))
                             for i, (column, name) in enumerate(self._metadata.index_info)]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError('Too many levels: Index has only {} level, not {}'
                                         .format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level {} not found'.format(l))
                        else:
                            raise KeyError('Level {} must be same as name ({})'
                                           .format(l, self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field,
                                      index_name if index_name is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf

    @derived_from(pd.DataFrame)
    def isnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    @derived_from(pd.DataFrame)
    def notnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    @derived_from(spark.DataFrame)
    def toPandas(self):
        sdf = self._sdf.select(['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                              for field in sdf.schema})
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or
                    callable(v) or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            else:
                sdf = sdf.withColumn(name, c)

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields +
                           [name for name, _ in pairs
                            if name not in self._metadata.column_fields]))
        return DataFrame(sdf, metadata)

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [column for column in columns
                            if column not in self._metadata.column_fields]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            cnt = reduce(lambda x, y: x + y,
                         [F.when(self[column].notna()._scol, 1).otherwise(0)
                          for column in columns],
                         F.lit(0))
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())
        else:
            raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")

    def head(self, n=5):
        """
        Return the first `n` rows.

        This function returns the first `n` rows for the object based on position. It is
        useful for quickly testing if your object has the right type of data in it.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        obj_head : same type as caller
            The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the first 5 lines

        >>> df.head()
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey

        Viewing the first `n` lines (three in this case)

        >>> df.head(3)
              animal
        0  alligator
        1        bee
        2     falcon
        """
        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        """The column labels of the DataFrame."""
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields +
                               [self[old_name]._scol.alias(new_name)
                                for (old_name, new_name) in zip(old_names, names)])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)

    @property
    def dtypes(self):
        """Return the dtypes in the DataFrame.

        This returns a Series with the data type of each column. The result's index is the
        original DataFrame's columns. Columns with mixed types are stored with the object
        dtype.

        :return: :class:`pd.Series` The data type of each column.

        Examples
        --------
        >>> df = ks.DataFrame({'a': list('abc'),
        ...                    'b': list(range(1, 4)),
        ...                    'c': np.arange(3, 6).astype('i1'),
        ...                    'd': np.arange(4.0, 7.0, dtype='float64'),
        ...                    'e': [True, False, True],
        ...                    'f': pd.date_range('20130101', periods=3)})
        >>> df.dtypes
        a            object
        b             int64
        c             int64
        d           float64
        e              bool
        f    datetime64[ns]
        dtype: object
        """
        return pd.Series([self[col].dtype for col in self._metadata.column_fields],
                         index=self._metadata.column_fields)

    def count(self):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending on
        `pandas.options.mode.use_inf_as_na`) are considered NA.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.shape: Number of DataFrame rows and columns (including NA elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
          Person   Age  Single
        0   John  24.0   False
        1   Myla   NaN    True
        2  Lewis  21.0    True
        3   John  33.0    True
        4   Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64
        """
        return self._sdf.count()

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column not in labels])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column != labels])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self, by):
        return DataFrame(self._sdf.sort(by), self._metadata.copy())

    def groupby(self, by):
        from databricks.koalas.groups import PandasLikeGroupBy
        gp = self._sdf.groupby(by)
        return PandasLikeGroupBy(self, gp, None)

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)

    def __repr__(self):
        return repr(self.toPandas())

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series

        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"

        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(**{k: value[c] for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})

        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self, self._metadata.index_info)

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
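# Hedged sketch of column assignment through the __setitem__/assign machinery
# above (assumes databricks.koalas importable as ks; 'b' and 'c' are illustrative
# names, not from the original source):
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]})
kdf['b'] = kdf['a']                      # routed through assign(**{'b': <Series>})
kdf2 = kdf.assign(c=lambda df: df['a'])  # callables are evaluated against the frame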
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64
    """
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        # Renamed from `sum` to avoid shadowing the builtin.
        total = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(total))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
    return _col(kdf)
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to a Pandas DataFrame logically. This holds a Spark
    DataFrame internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """

    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
        pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        schema = StructType([StructField(name, infer_pd_series_spark_type(col),
                                         nullable=bool(col.isnull().any()))
                             for name, col in reset_index.iteritems()])
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        self._init_from_spark(default_session().createDataFrame(reset_index, schema=schema),
                              metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [self._sdf.__getitem__(field)
                for field in self._metadata.index_fields]

    def _reduce_for_stat_function(self, sfun):
        """
        Applies sfun to each column and returns a pd.Series where the number of rows equals
        the number of columns.

        :param sfun: either a 1-arg function that takes a Column and returns a Column, or
                     a 2-arg function that takes a Column and its DataType and returns a
                     Column.
        """
        from inspect import signature
        exprs = []
        num_args = len(signature(sfun).parameters)
        for col in self.columns:
            col_sdf = self._sdf[col]
            col_type = self._sdf.schema[col].dataType
            if isinstance(col_type, BooleanType) and sfun.__name__ not in ('min', 'max'):
                # Stat functions cannot be used with boolean values by default.
                # Thus, cast to integer (true to 1 and false to 0).
                # Exclude the min and max methods though since those work with booleans.
                col_sdf = col_sdf.cast('integer')
            if num_args == 1:
                # Only pass in the column if sfun accepts only one arg
                col_sdf = sfun(col_sdf)
            else:  # must be 2
                assert num_args == 2
                # Pass in both the column and its data type if sfun accepts two args
                col_sdf = sfun(col_sdf, col_type)
            exprs.append(col_sdf.alias(col))

        sdf = self._sdf.select(*exprs)
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series

    def corr(self, method='pearson'):
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'spearman'}
            * pearson : standard correlation coefficient
            * spearman : Spearman rank correlation

        Returns
        -------
        y : pandas.DataFrame

        See Also
        --------
        Series.corr

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr('pearson')
                  dogs      cats
        dogs  1.000000 -0.851064
        cats -0.851064  1.000000

        >>> df.corr('spearman')
                  dogs      cats
        dogs  1.000000 -0.948683
        cats -0.948683  1.000000

        Notes
        -----
        There are behavior differences between Koalas and pandas.

        * the `method` argument only accepts 'pearson', 'spearman'
        * the data should not contain NaNs. Koalas will return an error.
        * Koalas doesn't support the following argument(s).

          * `min_periods` argument is not supported
        """
        return corr(self, method)

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True,
                na_rep='NaN', formatters=None, float_format=None, sparsify=None,
                index_names=True, justify=None, max_rows=None, max_cols=None,
                show_dimensions=False, decimal='.', bold_rows=True, classes=None,
                escape=True, notebook=False, border=None, table_id=None,
                render_links=False):
        """
        Render a DataFrame as an HTML table.

        .. note:: This method should only be used if the resulting Pandas object is expected
            to be small, as all the data is loaded into the driver's memory. If the input
            is large, set max_rows parameter.

        Parameters
        ----------
        buf : StringIO-like, optional
            Buffer to write to.
        columns : sequence, optional, default None
            The subset of columns to write. Writes all columns by default.
        col_space : int, optional
            The minimum width of each column.
        header : bool, optional
            Write out the column names. If a list of strings is given, it
            is assumed to be aliases for the column names.
        index : bool, optional, default True
            Whether to print index (row) labels.
        na_rep : str, optional, default 'NaN'
            String representation of NAN to use.
        formatters : list or dict of one-param. functions, optional
            Formatter functions to apply to columns' elements by position or
            name.
            The result of each function must be a unicode string.
            List must be of length equal to the number of columns.
        float_format : one-parameter function, optional, default None
            Formatter function to apply to columns' elements if they are
            floats. The result of this function must be a unicode string.
        sparsify : bool, optional, default True
            Set to False for a DataFrame with a hierarchical index to print
            every multiindex key at each row.
        index_names : bool, optional, default True
            Prints the names of the indexes.
        justify : str, default None
            How to justify the column labels. If None uses the option from
            the print configuration (controlled by set_option), 'right' out
            of the box. Valid values are

            * left
            * right
            * center
            * justify
            * justify-all
            * start
            * end
            * inherit
            * match-parent
            * initial
            * unset.
        max_rows : int, optional
            Maximum number of rows to display in the console.
        max_cols : int, optional
            Maximum number of columns to display in the console.
        show_dimensions : bool, default False
            Display DataFrame dimensions (number of rows by number of columns).
        decimal : str, default '.'
            Character recognized as decimal separator, e.g. ',' in Europe.
        bold_rows : bool, default True
            Make the row labels bold in the output.
        classes : str or list or tuple, default None
            CSS class(es) to apply to the resulting html table.
        escape : bool, default True
            Convert the characters <, >, and & to HTML-safe sequences.
        notebook : {True, False}, default False
            Whether the generated HTML is for IPython Notebook.
        border : int
            A ``border=border`` attribute is included in the opening
            `<table>` tag. Default ``pd.options.html.border``.
        table_id : str, optional
            A css id is included in the opening `<table>` tag if specified.
        render_links : bool, default False
            Convert URLs to HTML links (only works with Pandas 0.24+).

        Returns
        -------
        str (or unicode, depending on data and options)
            String representation of the dataframe.

        See Also
        --------
        to_string : Convert DataFrame to a string.
        """
        # Make sure locals() call is at the top of the function so we don't capture local
        # variables.
        args = locals()

        if max_rows is not None:
            kdf = self.head(max_rows)
        else:
            kdf = self

        return validate_arguments_and_invoke_function(
            kdf.to_pandas(), self.to_html, pd.DataFrame.to_html, args)

    def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True,
                  na_rep='NaN', formatters=None, float_format=None, sparsify=None,
                  index_names=True, justify=None, max_rows=None, max_cols=None,
                  show_dimensions=False, decimal='.', line_width=None):
        """
        Render a DataFrame to a console-friendly tabular output.

        .. note:: This method should only be used if the resulting Pandas object is expected
            to be small, as all the data is loaded into the driver's memory. If the input
            is large, set max_rows parameter.

        Parameters
        ----------
        buf : StringIO-like, optional
            Buffer to write to.
        columns : sequence, optional, default None
            The subset of columns to write. Writes all columns by default.
        col_space : int, optional
            The minimum width of each column.
        header : bool, optional
            Write out the column names. If a list of strings is given, it
            is assumed to be aliases for the column names.
        index : bool, optional, default True
            Whether to print index (row) labels.
        na_rep : str, optional, default 'NaN'
            String representation of NAN to use.
        formatters : list or dict of one-param. functions, optional
            Formatter functions to apply to columns' elements by position or
            name.
            The result of each function must be a unicode string.
            List must be of length equal to the number of columns.
        float_format : one-parameter function, optional, default None
            Formatter function to apply to columns' elements if they are
            floats. The result of this function must be a unicode string.
        sparsify : bool, optional, default True
            Set to False for a DataFrame with a hierarchical index to print
            every multiindex key at each row.
        index_names : bool, optional, default True
            Prints the names of the indexes.
        justify : str, default None
            How to justify the column labels. If None uses the option from
            the print configuration (controlled by set_option), 'right' out
            of the box. Valid values are

            * left
            * right
            * center
            * justify
            * justify-all
            * start
            * end
            * inherit
            * match-parent
            * initial
            * unset.
        max_rows : int, optional
            Maximum number of rows to display in the console.
        max_cols : int, optional
            Maximum number of columns to display in the console.
        show_dimensions : bool, default False
            Display DataFrame dimensions (number of rows by number of columns).
        decimal : str, default '.'
            Character recognized as decimal separator, e.g. ',' in Europe.
        line_width : int, optional
            Width to wrap a line in characters.

        Returns
        -------
        str (or unicode, depending on data and options)
            String representation of the dataframe.

        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6

        >>> print(df.to_string(max_rows=2))
           col1  col2
        0     1     4
        1     2     5
        """
        # Make sure locals() call is at the top of the function so we don't capture local
        # variables.
        args = locals()

        if max_rows is not None:
            kdf = self.head(max_rows)
        else:
            kdf = self

        return validate_arguments_and_invoke_function(
            kdf.to_pandas(), self.to_string, pd.DataFrame.to_string, args)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError('Currently supported only when the DataFrame has a single index.')
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [column for column in self._metadata.column_fields
                       if column not in keys]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column) for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns, index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information
        in the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any
        are None. For a standard index, the index name will be used (if set), otherwise a
        default 'index' or 'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by
                      default.
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index
                     to the default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError('Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [(column, name if name is not None else rename(i))
                             for i, (column, name) in enumerate(self._metadata.index_info)]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError('Too many levels: Index has only {} level, not {}'
                                         .format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level {} not found'.format(l))
                        else:
                            raise KeyError('Level {} must be same as name ({})'
                                           .format(l, self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field,
                                      index_name if index_name is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf

    def isnull(self):
        """
        Detects missing values for items in the current Dataframe.

        Return a boolean same-sized Dataframe indicating if the values are NA.
        NA values, such as None or numpy.NaN, get mapped to True values.
        Everything else gets mapped to False values.

        See Also
        --------
        Dataframe.notnull

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
        >>> df.isnull()
               0      1
        0  False  False
        1  False   True
        2  False   True
        3  False  False

        >>> df = ks.DataFrame([[None, 'bee', None], ['dog', None, 'fly']])
        >>> df.isnull()
               0      1      2
        0   True  False   True
        1  False   True  False
        """
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    def notnull(self):
        """
        Detects non-missing values for items in the current Dataframe.

        This function takes a dataframe and indicates whether its values are valid
        (not missing, which is ``NaN`` in numeric datatypes, ``None`` or ``NaN`` in
        objects and ``NaT`` in datetimelike).

        See Also
        --------
        Dataframe.isnull

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])
        >>> df.notnull()
              0      1
        0  True   True
        1  True  False
        2  True  False
        3  True   True

        >>> df = ks.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
        >>> df.notnull()
              0      1     2
        0  True   True  True
        1  True  False  True
        """
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    def to_koalas(self):
        """
        Converts the existing DataFrame into a Koalas DataFrame.

        This method is monkey-patched into Spark's DataFrame and can be used to convert a
        Spark DataFrame into a Koalas DataFrame. If running on an existing Koalas DataFrame,
        the method returns itself.

        If a Koalas DataFrame is converted to a Spark DataFrame and then back to Koalas, it
        will lose the index information and the original index will be turned into a normal
        column.

        See Also
        --------
        DataFrame.to_spark

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df
           col1  col2
        0     1     3
        1     2     4

        >>> spark_df = df.to_spark()
        >>> spark_df
        DataFrame[__index_level_0__: bigint, col1: bigint, col2: bigint]

        >>> kdf = spark_df.to_koalas()
        >>> kdf
           __index_level_0__  col1  col2
        0                  0     1     3
        1                  1     2     4
        """
        if isinstance(self, DataFrame):
            return self
        else:
            return DataFrame(self)

    def to_spark(self):
        """
        Return the current DataFrame as a Spark DataFrame.

        See Also
        --------
        DataFrame.to_koalas
        """
        return self._sdf

    def to_pandas(self):
        """
        Return a Pandas DataFrame.

        .. note:: This method should only be used if the resulting Pandas DataFrame is
            expected to be small, as all the data is loaded into the driver's memory.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.to_pandas()
           dogs  cats
        0   0.2   0.3
        1   0.0   0.6
        2   0.6   0.0
        3   0.2   0.1
        """
        sdf = self._sdf.select(['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                              for field in sdf.schema})
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    # Alias to maintain backward compatibility with Spark
    toPandas = to_pandas

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or
                    callable(v) or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            elif isinstance(c, Column):
                sdf = sdf.withColumn(name, c)
            else:
                sdf = sdf.withColumn(name, F.lit(c))

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields +
                           [name for name, _ in pairs
                            if name not in self._metadata.column_fields]))
        return DataFrame(sdf, metadata)

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [column for column in columns
                            if column not in self._metadata.column_fields]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            cnt = reduce(lambda x, y: x + y,
                         [F.when(self[column].notna()._scol, 1).otherwise(0)
                          for column in columns],
                         F.lit(0))
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())
        else:
            raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")

    def fillna(self, value=None, axis=None, inplace=False):
        """Fill NA/NaN values.

        :param value: scalar, dict, Series
                      Value to use to fill holes. Alternately a dict/Series of values
                      specifying which value to use for each column. DataFrame is not
                      supported.
        :param axis: {0 or `index`}
                     1 and `columns` are not supported.
        :param inplace: boolean, default False
                        Fill in place (do not create a new object)
        :return: :class:`DataFrame`

        Examples
        --------
        >>> df = ks.DataFrame({
        ...     'A': [None, 3, None, None],
        ...     'B': [2, 4, None, 3],
        ...     'C': [None, None, None, 1],
        ...     'D': [0, 1, 5, 4]
        ... })
        >>> df
             A    B    C  D
        0  NaN  2.0  NaN  0
        1  3.0  4.0  NaN  1
        2  NaN  NaN  NaN  5
        3  NaN  3.0  1.0  4

        Replace all NaN elements with 0s.

        >>> df.fillna(0)
             A    B    C  D
        0  0.0  2.0  0.0  0
        1  3.0  4.0  0.0  1
        2  0.0  0.0  0.0  5
        3  0.0  3.0  1.0  4

        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, 2, and 3
        respectively.

        >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        >>> df.fillna(value=values)
             A    B    C  D
        0  0.0  2.0  2.0  0
        1  3.0  4.0  2.0  1
        2  0.0  1.0  2.0  5
        3  0.0  3.0  1.0  4
        """
        if axis is None:
            axis = 0
        if not (axis == 0 or axis == "index"):
            raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")

        if value is None:
            raise ValueError('Currently must specify value')
        if not isinstance(value, (float, int, str, bool, dict, pd.Series)):
            raise TypeError("Unsupported type %s" % type(value))
        if isinstance(value, pd.Series):
            value = value.to_dict()
        if isinstance(value, dict):
            for v in value.values():
                if not isinstance(v, (float, int, str, bool)):
                    raise TypeError("Unsupported type %s" % type(v))

        sdf = self._sdf.fillna(value)
        if inplace:
            self._sdf = sdf
        else:
            return DataFrame(sdf, self._metadata.copy())

    def head(self, n=5):
        """
        Return the first `n` rows.

        This function returns the first `n` rows for the object based on position. It is
        useful for quickly testing if your object has the right type of data in it.

        Parameters
        ----------
        n : int, default 5
            Number of rows to select.

        Returns
        -------
        obj_head : same type as caller
            The first `n` rows of the caller object.

        Examples
        --------
        >>> df = ks.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
        ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
        >>> df
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey
        5     parrot
        6      shark
        7      whale
        8      zebra

        Viewing the first 5 lines

        >>> df.head()
              animal
        0  alligator
        1        bee
        2     falcon
        3       lion
        4     monkey

        Viewing the first `n` lines (three in this case)

        >>> df.head(3)
              animal
        0  alligator
        1        bee
        2     falcon
        """
        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        """The column labels of the DataFrame."""
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields +
                               [self[old_name]._scol.alias(new_name)
                                for (old_name, new_name) in zip(old_names, names)])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)

    @property
    def dtypes(self):
        """Return the dtypes in the DataFrame.

        This returns a Series with the data type of each column. The result's index is the
        original DataFrame's columns. Columns with mixed types are stored with the object
        dtype.

        :return: :class:`pd.Series` The data type of each column.

        Examples
        --------
        >>> df = ks.DataFrame({'a': list('abc'),
        ...                    'b': list(range(1, 4)),
        ...                    'c': np.arange(3, 6).astype('i1'),
        ...                    'd': np.arange(4.0, 7.0, dtype='float64'),
        ...                    'e': [True, False, True],
        ...                    'f': pd.date_range('20130101', periods=3)})
        >>> df.dtypes
        a            object
        b             int64
        c              int8
        d           float64
        e              bool
        f    datetime64[ns]
        dtype: object
        """
        return pd.Series([self[col].dtype for col in self._metadata.column_fields],
                         index=self._metadata.column_fields)

    def count(self):
        """
        Count non-NA cells for each column.

        The values `None`, `NaN` are considered NA.

        Returns
        -------
        pandas.Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.shape: Number of DataFrame rows and columns (including NA elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = ks.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
          Person   Age  Single
        0   John  24.0   False
        1   Myla   NaN    True
        2  Lewis  21.0    True
        3   John  33.0    True
        4   Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64
        """
        return self._reduce_for_stat_function(_Frame._count_expr)

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column not in labels])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column != labels])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self, by, ascending=True, inplace=False, na_position='last'):
        """
        Sort by the values along either axis.

        Parameters
        ----------
        by : str or list of str
        ascending : bool or list of bool, default True
            Sort ascending vs. descending. Specify list for multiple sort orders. If this
            is a list of bools, must match the length of the by.
        inplace : bool, default False
            if True, perform operation in-place
        na_position : {'first', 'last'}, default 'last'
            `first` puts NaNs at the beginning, `last` puts NaNs at the end

        Returns
        -------
        sorted_obj : DataFrame

        Examples
        --------
        >>> df = ks.DataFrame({
        ...     'col1': ['A', 'A', 'B', None, 'D', 'C'],
        ...     'col2': [2, 1, 9, 8, 7, 4],
        ...     'col3': [0, 1, 9, 4, 2, 3],
        ... })
        >>> df
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        3  None     8     4
        4     D     7     2
        5     C     4     3

        Sort by col1

        >>> df.sort_values(by=['col1'])
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4

        Sort by multiple columns

        >>> df.sort_values(by=['col1', 'col2'])
           col1  col2  col3
        1     A     1     1
        0     A     2     0
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4

        Sort Descending

        >>> df.sort_values(by='col1', ascending=False)
           col1  col2  col3
        4     D     7     2
        5     C     4     3
        2     B     9     9
        0     A     2     0
        1     A     1     1
        3  None     8     4
        """
        if isinstance(by, string_types):
            by = [by]
        if isinstance(ascending, bool):
            ascending = [ascending] * len(by)
        if len(ascending) != len(by):
            raise ValueError('Length of ascending ({}) != length of by ({})'
                             .format(len(ascending), len(by)))
        if na_position not in ('first', 'last'):
            raise ValueError("invalid na_position: '{}'".format(na_position))

        # Mapper: Get a spark column function for (ascending, na_position) combination
        # Note that 'asc_nulls_first' and friends were added as of Spark 2.4, see SPARK-23847.
        mapper = {
            (True, 'first'): lambda x: Column(getattr(x._jc, "asc_nulls_first")()),
            (True, 'last'): lambda x: Column(getattr(x._jc, "asc_nulls_last")()),
            (False, 'first'): lambda x: Column(getattr(x._jc, "desc_nulls_first")()),
            (False, 'last'): lambda x: Column(getattr(x._jc, "desc_nulls_last")()),
        }
        by = [mapper[(asc, na_position)](self[colname]._scol)
              for colname, asc in zip(by, ascending)]
        kdf = DataFrame(self._sdf.sort(*by), self._metadata.copy())
        if inplace:
            self._sdf = kdf._sdf
            self._metadata = kdf._metadata
        else:
            return kdf

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Examples
        --------
        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = ks.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]

        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)

    def __repr__(self):
        return repr(self.toPandas())

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series

        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"

        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(**{k: value[c] for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})

        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self, self._metadata.index_info)

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
def to_dataframe(self):
    sdf = self._kdf._sdf.select([field for field, _ in self._index_map] + [self._scol])
    metadata = Metadata(data_columns=[sdf.schema[-1].name], index_map=self._index_map)
    return DataFrame(sdf, metadata)
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Aggregate using one or more operations over the specified axis.

    Parameters
    ----------
    func_or_funcs : dict
        a dict mapping from column name (string) to aggregate functions (string).

    Returns
    -------
    Series or DataFrame
        The return can be:

        * Series : when DataFrame.agg is called with a single function
        * DataFrame : when DataFrame.agg is called with several functions

        Return Series or DataFrame.

    Notes
    -----
    `agg` is an alias for `aggregate`. Use the alias.

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
    ...                    'B': [1, 2, 3, 4],
    ...                    'C': [0.362, 0.227, 1.267, -0.562]})
    >>> df = df[['A', 'B', 'C']]
    >>> df
       A  B      C
    0  1  1  0.362
    1  1  2  0.227
    2  2  3  1.267
    3  2  4 -0.562

    Different aggregations per column

    >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
    >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
       B      C
    A
    1  1  0.589
    2  3  0.705
    """
    if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, string_types) and isinstance(value, string_types)
                    for key, value in func_or_funcs.items()):
        raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                         "functions (string).")

    sdf = self._kdf._sdf
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    reordered = [F.expr('{1}({0}) as {0}'.format(key, value))
                 for key, value in func_or_funcs.items()]
    sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
    metadata = Metadata(column_fields=[key for key, _ in func_or_funcs.items()],
                        index_info=[('__index_level_{}__'.format(i), s.name)
                                    for i, s in enumerate(groupkeys)])
    return DataFrame(sdf, metadata)
class DataFrame(_Frame):
    """
    Koalas DataFrame that corresponds to Pandas DataFrame logically. This holds Spark DataFrame
    internally.

    :ivar _sdf: Spark DataFrame instance
    :ivar _metadata: Metadata related to column names and index information.
    """

    @derived_from(pd.DataFrame)
    @dispatch_on('data')
    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
        pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self._init_from_pandas(pdf)

    @__init__.register(pd.DataFrame)
    def _init_from_pandas(self, pdf, *args):
        metadata = Metadata.from_pandas(pdf)
        reset_index = pdf.reset_index()
        reset_index.columns = metadata.all_fields
        self._init_from_spark(default_session().createDataFrame(reset_index), metadata)

    @__init__.register(spark.DataFrame)
    def _init_from_spark(self, sdf, metadata=None, *args):
        self._sdf = sdf
        if metadata is None:
            self._metadata = Metadata(column_fields=self._sdf.schema.fieldNames())
        else:
            self._metadata = metadata

    @property
    def _index_columns(self):
        return [self._sdf.__getitem__(field)
                for field in self._metadata.index_fields]

    def _reduce_for_stat_function(self, sfun):
        sdf = self._sdf.select([sfun(self._sdf[col]).alias(col) for col in self.columns])
        pdf = sdf.toPandas()
        assert len(pdf) == 1, (sdf, pdf)
        row = pdf.iloc[0]
        row.name = None
        return row  # Return first row as a Series

    @derived_from(pd.DataFrame)
    def iteritems(self):
        cols = list(self.columns)
        return list((col_name, self[col_name]) for col_name in cols)

    @derived_from(pd.DataFrame)
    def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True,
                na_rep='NaN', formatters=None, float_format=None, sparsify=None,
                index_names=True, justify=None, max_rows=None, max_cols=None,
                show_dimensions=False, decimal='.', bold_rows=True, classes=None,
                escape=True, notebook=False, border=None, table_id=None,
                render_links=False):
        return self.toPandas().to_html(
            buf=buf, columns=columns, col_space=col_space, header=header, index=index,
            na_rep=na_rep, formatters=formatters, float_format=float_format,
            sparsify=sparsify, index_names=index_names, justify=justify,
            max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions,
            decimal=decimal, bold_rows=bold_rows, classes=classes, escape=escape,
            notebook=notebook, border=border, table_id=table_id,
            render_links=render_links)

    @property
    def index(self):
        """The index (row labels) Column of the DataFrame.

        Currently supported only when the DataFrame has a single index.
        """
        from databricks.koalas.series import Series
        if len(self._metadata.index_info) != 1:
            raise KeyError('Currently supported only when the DataFrame has a single index.')
        return Series(self._index_columns[0], self, [])

    def set_index(self, keys, drop=True, append=False, inplace=False):
        """Set the DataFrame index (row labels) using one or more existing columns. By default
        yields a new object.

        :param keys: column label or list of column labels / arrays
        :param drop: boolean, default True
                     Delete columns to be used as the new index
        :param append: boolean, default False
                       Whether to append columns to existing index
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if isinstance(keys, string_types):
            keys = [keys]
        else:
            keys = list(keys)
        for key in keys:
            if key not in self.columns:
                raise KeyError(key)

        if drop:
            columns = [column for column in self._metadata.column_fields
                       if column not in keys]
        else:
            columns = self._metadata.column_fields
        if append:
            index_info = self._metadata.index_info + [(column, column) for column in keys]
        else:
            index_info = [(column, column) for column in keys]

        metadata = self._metadata.copy(column_fields=columns, index_info=index_info)
        if inplace:
            self._metadata = metadata
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            return kdf

    def reset_index(self, level=None, drop=False, inplace=False):
        """For DataFrame with multi-level index, return new DataFrame with labeling information
        in the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any
        are None. For a standard index, the index name will be used (if set), otherwise a
        default 'index' or 'level_0' (if 'index' is already taken) will be used.

        :param level: int, str, tuple, or list, default None
                      Only remove the given levels from the index. Removes all levels by default
        :param drop: boolean, default False
                     Do not try to insert index into dataframe columns. This resets the index
                     to the default integer index.
        :param inplace: boolean, default False
                        Modify the DataFrame in place (do not create a new object)
        :return: :class:`DataFrame`
        """
        if len(self._metadata.index_info) == 0:
            raise NotImplementedError('Can\'t reset index because there is no index.')

        multi_index = len(self._metadata.index_info) > 1
        if multi_index:
            rename = lambda i: 'level_{}'.format(i)
        else:
            rename = lambda i: \
                'index' if 'index' not in self._metadata.column_fields else 'level_{}'.format(i)

        if level is None:
            index_columns = [(column, name if name is not None else rename(i))
                             for i, (column, name) in enumerate(self._metadata.index_info)]
            index_info = []
        else:
            if isinstance(level, (int, string_types)):
                level = [level]
            level = list(level)

            if all(isinstance(l, int) for l in level):
                for l in level:
                    if l >= len(self._metadata.index_info):
                        raise IndexError('Too many levels: Index has only {} level, not {}'
                                         .format(len(self._metadata.index_info), l + 1))
                idx = level
            elif all(isinstance(l, string_types) for l in level):
                idx = []
                for l in level:
                    try:
                        i = self._metadata.index_fields.index(l)
                        idx.append(i)
                    except ValueError:
                        if multi_index:
                            raise KeyError('Level unknown not found')
                        else:
                            raise KeyError(
                                'Level unknown must be same as name ({})'
                                .format(self._metadata.index_fields[0]))
            else:
                raise ValueError('Level should be all int or all string.')
            idx.sort()

            index_columns = []
            index_info = self._metadata.index_info.copy()
            for i in idx:
                info = self._metadata.index_info[i]
                column_field, index_name = info
                index_columns.append((column_field,
                                      index_name if index_name is not None else rename(i)))
                index_info.remove(info)

        if drop:
            index_columns = []

        metadata = self._metadata.copy(
            column_fields=[column for column, _ in index_columns] +
            self._metadata.column_fields,
            index_info=index_info)
        columns = [name for _, name in index_columns] + self._metadata.column_fields
        if inplace:
            self._metadata = metadata
            self.columns = columns
        else:
            kdf = self.copy()
            kdf._metadata = metadata
            kdf.columns = columns
            return kdf

    @derived_from(pd.DataFrame)
    def isnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.isnull()
        return kdf

    isna = isnull

    @derived_from(pd.DataFrame)
    def notnull(self):
        kdf = self.copy()
        for name, ks in kdf.iteritems():
            kdf[name] = ks.notnull()
        return kdf

    notna = notnull

    @derived_from(spark.DataFrame)
    def toPandas(self):
        sdf = self._sdf.select(['`{}`'.format(name) for name in self._metadata.all_fields])
        pdf = sdf.toPandas()
        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            pdf = pdf.astype({field.name: to_arrow_type(field.dataType).to_pandas_dtype()
                              for field in sdf.schema})
        if len(self._metadata.index_info) > 0:
            append = False
            for index_field in self._metadata.index_fields:
                drop = index_field not in self._metadata.column_fields
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[self._metadata.column_fields]
        index_names = self._metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf

    @derived_from(pd.DataFrame)
    def assign(self, **kwargs):
        from databricks.koalas.series import Series
        for k, v in kwargs.items():
            if not (isinstance(v, (Series, spark.Column)) or
                    callable(v) or pd.api.types.is_scalar(v)):
                raise TypeError("Column assignment doesn't support type "
                                "{0}".format(type(v).__name__))
            if callable(v):
                kwargs[k] = v(self)

        pairs = list(kwargs.items())
        sdf = self._sdf
        for (name, c) in pairs:
            if isinstance(c, Series):
                sdf = sdf.withColumn(name, c._scol)
            else:
                sdf = sdf.withColumn(name, c)

        metadata = self._metadata.copy(
            column_fields=(self._metadata.column_fields +
                           [name for name, _ in pairs
                            if name not in self._metadata.column_fields]))
        return DataFrame(sdf, metadata)

    @property
    def loc(self):
        return SparkDataFrameLocator(self)

    def copy(self):
        return DataFrame(self._sdf, self._metadata.copy())

    @derived_from(pd.DataFrame)
    def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
        if axis == 0 or axis == 'index':
            if subset is not None:
                if isinstance(subset, string_types):
                    columns = [subset]
                else:
                    columns = list(subset)
                invalids = [column for column in columns
                            if column not in self._metadata.column_fields]
                if len(invalids) > 0:
                    raise KeyError(invalids)
            else:
                columns = list(self.columns)

            # Count the non-null values per row; rows that fail the how/thresh
            # predicate below are filtered out.
            cnt = reduce(lambda x, y: x + y,
                         [F.when(self[column].notna()._scol, 1).otherwise(0)
                          for column in columns],
                         F.lit(0))
            if thresh is not None:
                pred = cnt >= F.lit(int(thresh))
            elif how == 'any':
                pred = cnt == F.lit(len(columns))
            elif how == 'all':
                pred = cnt > F.lit(0)
            else:
                if how is not None:
                    raise ValueError('invalid how option: {h}'.format(h=how))
                else:
                    raise TypeError('must specify how or thresh')

            sdf = self._sdf.filter(pred)
            if inplace:
                self._sdf = sdf
            else:
                return DataFrame(sdf, self._metadata.copy())
        else:
            raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")

    def head(self, n=5):
        return DataFrame(self._sdf.limit(n), self._metadata.copy())

    @property
    def columns(self):
        return pd.Index(self._metadata.column_fields)

    @columns.setter
    def columns(self, names):
        old_names = self._metadata.column_fields
        if len(old_names) != len(names):
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, new values have %d elements"
                % (len(old_names), len(names)))
        sdf = self._sdf.select(self._metadata.index_fields +
                               [self[old_name]._scol.alias(new_name)
                                for (old_name, new_name) in zip(old_names, names)])
        self._sdf = sdf
        self._metadata = self._metadata.copy(column_fields=names)

    @derived_from(pd.DataFrame, ua_args=['axis', 'level', 'numeric_only'])
    def count(self):
        return self._sdf.count()

    def unique(self):
        sdf = self._sdf
        return DataFrame(spark.DataFrame(sdf._jdf.distinct(), sdf.sql_ctx),
                         self._metadata.copy())

    @derived_from(pd.DataFrame)
    def drop(self, labels, axis=0, errors='raise'):
        axis = self._validate_axis(axis)
        if axis == 1:
            if isinstance(labels, list):
                sdf = self._sdf.drop(*labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column not in labels])
            else:
                sdf = self._sdf.drop(labels)
                metadata = self._metadata.copy(
                    column_fields=[column for column in self._metadata.column_fields
                                   if column != labels])
            return DataFrame(sdf, metadata)
        raise NotImplementedError("Drop currently only works for axis=1")

    @derived_from(pd.DataFrame)
    def get(self, key, default=None):
        try:
            return self._pd_getitem(key)
        except (KeyError, ValueError, IndexError):
            return default

    def sort_values(self, by):
        return DataFrame(self._sdf.sort(by), self._metadata.copy())

    def groupby(self, by):
        from databricks.koalas.groups import PandasLikeGroupBy
        gp = self._sdf.groupby(by)
        return PandasLikeGroupBy(self, gp, None)

    @derived_from(pd.DataFrame)
    def pipe(self, func, *args, **kwargs):
        # Taken from pandas:
        # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L2698-L2707
        if isinstance(func, tuple):
            func, target = func
            if target in kwargs:
                raise ValueError('%s is both the pipe target and a keyword '
                                 'argument' % target)
            kwargs[target] = self
            return func(*args, **kwargs)
        else:
            return func(self, *args, **kwargs)

    @property
    def shape(self):
        return len(self), len(self.columns)

    def _pd_getitem(self, key):
        from databricks.koalas.series import Series
        if key is None:
            raise KeyError("none key")
        if isinstance(key, string_types):
            try:
                return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
            except AnalysisException:
                raise KeyError(key)
        if np.isscalar(key) or isinstance(key, (tuple, string_types)):
            raise NotImplementedError(key)
        elif isinstance(key, slice):
            return self.loc[key]
        if isinstance(key, (pd.Series, np.ndarray, pd.Index)):
            raise NotImplementedError(key)
        if isinstance(key, list):
            return self.loc[:, key]
        if isinstance(key, DataFrame):
            # TODO Should not implement alignment, too dangerous?
            return Series(self._sdf.__getitem__(key), self, self._metadata.index_info)
        if isinstance(key, Series):
            # TODO Should not implement alignment, too dangerous?
            # It is assumed to be only a filter, otherwise .loc should be used.
            bcol = key._scol.cast("boolean")
            return DataFrame(self._sdf.filter(bcol), self._metadata.copy())
        raise NotImplementedError(key)

    def __getitem__(self, key):
        return self._pd_getitem(key)

    def __setitem__(self, key, value):
        from databricks.koalas.series import Series
        # For now, we don't support realignment against different dataframes.
        # This is too expensive in Spark.
        # Are we assigning against a column?
        if isinstance(value, Series):
            assert value._kdf is self, \
                "Cannot combine column argument because it comes from a different dataframe"
        if isinstance(key, (tuple, list)):
            assert isinstance(value.schema, StructType)
            field_names = value.schema.fieldNames()
            kdf = self.assign(**{k: value[c] for k, c in zip(key, field_names)})
        else:
            kdf = self.assign(**{key: value})
        self._sdf = kdf._sdf
        self._metadata = kdf._metadata

    def __getattr__(self, key):
        from databricks.koalas.series import Series
        if key.startswith("__") or key.startswith("_pandas_") or key.startswith("_spark_"):
            raise AttributeError(key)
        if hasattr(_MissingPandasLikeDataFrame, key):
            return partial(getattr(_MissingPandasLikeDataFrame, key), self)
        return Series(self._sdf.__getattr__(key), self, self._metadata.index_info)

    def __iter__(self):
        return self.toPandas().__iter__()

    def __len__(self):
        return self._sdf.count()

    def __dir__(self):
        fields = [f for f in self._sdf.schema.fieldNames() if ' ' not in f]
        return super(DataFrame, self).__dir__() + fields

    def _repr_html_(self):
        return self.head(max_display_count).toPandas()._repr_html_()

    @classmethod
    def _validate_axis(cls, axis=0):
        if axis not in (0, 1, 'index', 'columns', None):
            raise ValueError('No axis named {0}'.format(axis))
        # convert to numeric axis
        return {None: 0, 'index': 0, 'columns': 1}.get(axis, axis)
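    # Quick sketch of _validate_axis above: it mirrors pandas' axis aliases,
    # normalizing 'index'/None to 0 and 'columns' to 1, and rejecting anything else.
    #
    #     >>> DataFrame._validate_axis('columns')
    #     1
    #     >>> DataFrame._validate_axis(None)
    #     0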
def _metadata(self):
    if not hasattr(self, '_pandas_metadata') or self._pandas_metadata is None:
        self._pandas_metadata = Metadata(column_fields=self.schema.fieldNames())
    return self._pandas_metadata
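# Note on _metadata above: it lazily builds a default Metadata from the Spark schema's
# field names on first access and caches it on the instance. A standalone sketch of the
# same memoization pattern, with hypothetical names:

class _LazyMetaExample(object):
    schema_field_names = ['a', 'b']  # stand-in for self.schema.fieldNames()

    @property
    def meta(self):
        # Build once on first access, then reuse the cached value.
        if getattr(self, '_meta_cache', None) is None:
            self._meta_cache = {'column_fields': self.schema_field_names}
        return self._meta_cache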