def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")
    # Optionally filter out missing values before counting.
    if dropna:
        df_dropna = self.to_dataframe()._spark_filter(self.notna())
    else:
        df_dropna = self.to_dataframe()
    # Count the occurrences of each distinct value of this column.
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())
    if normalize:
        # Turn counts into relative frequencies over the (possibly filtered) rows.
        total = df_dropna._spark_count()
        df = df._spark_withColumn(
            'count', F._spark_col('count') / F._spark_lit(total))
    return _col(df.set_index([self.name]))
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")
    # Optionally filter out missing values before counting.
    if dropna:
        df_dropna = self._pandas_anchor._spark_filter(self.notna())
    else:
        df_dropna = self._pandas_anchor
    # Count the occurrences of each distinct value of this column.
    df = df_dropna._spark_groupby(self).count()
    if sort:
        if ascending:
            df = df._spark_orderBy(F._spark_col('count'))
        else:
            df = df._spark_orderBy(F._spark_col('count')._spark_desc())
    if normalize:
        # Turn counts into relative frequencies over the (possibly filtered) rows.
        total = df_dropna._spark_count()
        df = df._spark_withColumn(
            'count', F._spark_col('count') / F._spark_lit(total))
    # Use the distinct values as the index of the resulting Series, avoiding a
    # name clash when the column itself is called 'index'.
    index_name = 'index' if self.name != 'index' else 'level_0'
    df.columns = [index_name, self.name]
    df._metadata = Metadata(column_fields=[self.name],
                            index_info=[(index_name, None)])
    return _col(df)
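# A minimal usage sketch for either value_counts variant above, assuming the surrounding
# koalas package is importable as `databricks.koalas` and exposes a pandas-like Series
# constructor (the import alias and example data are assumptions, not part of the snippet).
import databricks.koalas as ks

s = ks.Series(['a', 'b', 'a', None, 'a'])
s.value_counts()                 # count per distinct value, nulls dropped, sorted descending
s.value_counts(normalize=True)   # each count divided by the number of non-null rows
s.value_counts(dropna=False)     # keep the null bucket in the result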
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if (not isinstance(key, tuple)) or (len(key) != 2):
        raise NotImplementedError("Only accepts pairs of candidates")
    rows_sel, cols_sel = key

    if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
        raise SparkPandasNotImplementedError(
            description="""Can only assign value to the whole dataframe, the row index
            has to be `slice(None)` or `:`""",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    if not isinstance(cols_sel, str):
        raise ValueError("only column names can be assigned")

    if isinstance(value, Series):
        self._kdf[cols_sel] = value
    elif isinstance(value, DataFrame) and len(value.columns) == 1:
        from pyspark.sql.functions import _spark_col
        self._kdf[cols_sel] = _spark_col(value.columns[0])
    elif isinstance(value, DataFrame) and len(value.columns) != 1:
        raise ValueError("Only a dataframe with one column can be assigned")
    else:
        raise ValueError("Only a column or dataframe with single column can be assigned")
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    if (not isinstance(key, tuple)) or (len(key) != 2):
        raise NotImplementedError("Only accepts pairs of candidates")
    rows_sel, cols_sel = key

    if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
        raise SparkPandasNotImplementedError(
            description="""Can only assign value to the whole dataframe, the row index
            has to be `slice(None)` or `:`""",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    if not isinstance(cols_sel, str):
        raise ValueError("only column names can be assigned")

    if isinstance(value, Series):
        self._kdf[cols_sel] = value
    elif isinstance(value, DataFrame) and len(value.columns) == 1:
        from pyspark.sql.functions import _spark_col
        self._kdf[cols_sel] = _spark_col(value.columns[0])
    elif isinstance(value, DataFrame) and len(value.columns) != 1:
        raise ValueError("Only a dataframe with one column can be assigned")
    else:
        raise ValueError("Only a column or dataframe with single column can be assigned")
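# A minimal sketch of the assignment path this __setitem__ implements, assuming a koalas
# DataFrame whose pandas-style `.loc` indexer is backed by the class above (the import alias,
# frame, and column names are illustrative assumptions).
import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
kdf.loc[:, 'z'] = kdf['x']   # allowed: whole-column assignment from a Series
# kdf.loc[0, 'z'] = 10       # would raise: only a `slice(None)` (`:`) row selector is accepted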
def _make_col(c):
    if isinstance(c, Column):
        return c
    elif isinstance(c, str):
        from pyspark.sql.functions import _spark_col
        return _spark_col(c)
    else:
        raise SparkPandasNotImplementedError(
            description="Can only convert a string to a column type.")
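# A small illustration of _make_col's contract: a column name given as a string is promoted to
# a Spark Column expression, an existing Column passes through unchanged, and any other type
# raises SparkPandasNotImplementedError. The column name below is purely illustrative.
price_col = _make_col('price')   # string -> Column expression for the 'price' field
same_col = _make_col(price_col)  # Column input is returned as-is
# _make_col(42)                  # unsupported type -> SparkPandasNotImplementedError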
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False,
                drop_first=False, dtype=None):
    if sparse is not False:
        raise NotImplementedError("get_dummies currently does not support sparse")

    if isinstance(columns, string_types):
        columns = [columns]
    if dtype is None:
        dtype = 'byte'

    if isinstance(data, Column):
        # Single-column input: encode that column only.
        if prefix is not None:
            prefix = [str(prefix)]
        columns = [data.name]
        df = data.to_dataframe()
        remaining_columns = []
    else:
        if isinstance(prefix, string_types):
            raise ValueError("get_dummies currently does not support prefix as string types")
        df = data.copy()
        if columns is None:
            # Default to every column whose data type can be dummy-encoded.
            columns = [column for column in df.columns
                       if isinstance(data.schema[column].dataType,
                                     _get_dummies_default_accept_types)]
        if len(columns) == 0:
            return df
        if prefix is None:
            prefix = columns

        column_set = set(columns)
        # Columns that are not encoded are carried over unchanged.
        remaining_columns = [df[column] for column in df.columns if column not in column_set]

    if any(not isinstance(data.schema[column].dataType, _get_dummies_acceptable_types)
           for column in columns):
        raise ValueError("get_dummies currently only accept {} values"
                         .format(', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

    if prefix is not None and len(columns) != len(prefix):
        raise ValueError(
            "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
            .format(len(prefix), len(columns)))

    # Collect the distinct values of all encoded columns in a single Spark job.
    all_values = _reduce_spark_multi(df, [F._spark_collect_set(F._spark_col(column))
                                          ._spark_alias(column)
                                          for column in columns])
    for i, column in enumerate(columns):
        values = sorted(all_values[i])
        if drop_first:
            values = values[1:]

        def column_name(value):
            if prefix is None:
                return str(value)
            else:
                return '{}{}{}'.format(prefix[i], prefix_sep, value)

        # Emit one indicator column per distinct value, plus an optional null indicator.
        for value in values:
            remaining_columns.append((df[column].notnull() & (df[column] == value))
                                     .astype(dtype)
                                     .alias(column_name(value)))
        if dummy_na:
            remaining_columns.append(df[column].isnull().astype(dtype).alias(column_name('nan')))

    return df[remaining_columns]
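# A minimal sketch of how get_dummies above is typically called, mirroring the pandas API it
# follows; the `databricks.koalas` import alias and the example columns are assumptions.
import databricks.koalas as ks

kdf = ks.DataFrame({'cat': ['a', 'b', 'a', None], 'num': [1, 2, 3, 4]})
ks.get_dummies(kdf)                                  # encodes 'cat', carries 'num' through
ks.get_dummies(kdf, columns=['cat'], dummy_na=True)  # adds a 'cat_nan' null indicator column
ks.get_dummies(kdf['cat'], prefix='cat', drop_first=True)  # Series input, drop the first level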