Example #1
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        # Optionally filter out missing values before counting.
        if dropna:
            df_dropna = self.to_dataframe()._spark_filter(self.notna())
        else:
            df_dropna = self.to_dataframe()
        # Group by the values of this column and count the occurrences.
        df = df_dropna._spark_groupby(self).count()
        # Order by the count column in the requested direction.
        if sort:
            if ascending:
                df = df._spark_orderBy(F._spark_col('count'))
            else:
                df = df._spark_orderBy(F._spark_col('count')._spark_desc())

        # Convert raw counts into relative frequencies.
        if normalize:
            total = df_dropna._spark_count()
            df = df._spark_withColumn(
                'count',
                F._spark_col('count') / F._spark_lit(total))

        # Return the counts as a Series indexed by the distinct values.
        return _col(df.set_index([self.name]))
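
A minimal usage sketch for this method, assuming an early databricks.koalas release and an active SparkSession; the data and column name are made up for illustration, and the exact output formatting is not asserted:

import databricks.koalas as ks

# Hypothetical toy data; any column of discrete values behaves the same way.
kdf = ks.DataFrame({'animal': ['cat', 'dog', 'cat', 'dog', 'cat', None]})
kser = kdf['animal']

# Counts per distinct value, most frequent first (missing values dropped).
print(kser.value_counts())

# Relative frequencies instead of raw counts, least frequent first.
print(kser.value_counts(normalize=True, ascending=True))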
Example #2
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            df_dropna = self._pandas_anchor._spark_filter(self.notna())
        else:
            df_dropna = self._pandas_anchor
        df = df_dropna._spark_groupby(self).count()
        if sort:
            if ascending:
                df = df._spark_orderBy(F._spark_col('count'))
            else:
                df = df._spark_orderBy(F._spark_col('count')._spark_desc())

        # Convert raw counts into relative frequencies.
        if normalize:
            total = df_dropna._spark_count()
            df = df._spark_withColumn(
                'count',
                F._spark_col('count') / F._spark_lit(total))

        # Name the index column, avoiding a clash when the column itself is
        # called 'index', then rebuild the metadata so the counts come back
        # as a Series indexed by the distinct values.
        index_name = 'index' if self.name != 'index' else 'level_0'
        df.columns = [index_name, self.name]
        df._metadata = Metadata(column_fields=[self.name],
                                index_info=[(index_name, None)])
        return _col(df)
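
This variant differs from the first only in the index bookkeeping at the end. A hedged sketch of the edge case that bookkeeping guards against, using an illustrative column that happens to be named 'index' (assumes databricks.koalas is importable):

import databricks.koalas as ks

# A column literally named 'index' would clash with the default index column
# name, so the implementation above falls back to 'level_0' internally.
kdf = ks.DataFrame({'index': [1, 1, 2, 3, 3, 3]})
print(kdf['index'].value_counts())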
Example #3
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        # Only the two-element form .loc[rows, cols] = value is supported.
        if (not isinstance(key, tuple)) or (len(key) != 2):
            raise NotImplementedError("Only accepts pairs of candidates")

        rows_sel, cols_sel = key

        # Row selection must be the full slice `:`; assigning to a subset of
        # rows is not implemented.
        if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
            raise SparkPandasNotImplementedError(
                description="""Can only assign value to the whole dataframe, the row index
                has to be `slice(None)` or `:`""",
                pandas_function=".loc[..., ...] = ...",
                spark_target_function="withColumn, select")

        # Only assignment to a single named column is supported.
        if not isinstance(cols_sel, str):
            raise ValueError("only column names can be assigned")

        # A Series is assigned directly; a single-column DataFrame is
        # unwrapped into its only column; anything else is rejected.
        if isinstance(value, Series):
            self._kdf[cols_sel] = value
        elif isinstance(value, DataFrame) and len(value.columns) == 1:
            from pyspark.sql.functions import _spark_col
            self._kdf[cols_sel] = _spark_col(value.columns[0])
        elif isinstance(value, DataFrame) and len(value.columns) != 1:
            raise ValueError(
                "Only a dataframe with one column can be assigned")
        else:
            raise ValueError(
                "Only a column or a dataframe with a single column can be assigned")
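
A hedged usage sketch of the assignment path this accessor implements; the frame and column names are illustrative, and it assumes a koalas build where kdf['x'] returns a koalas Series and .loc dispatches to this __setitem__:

import databricks.koalas as ks

kdf = ks.DataFrame({'x': [1, 2, 3]})

# The only supported row selector is the full slice `:`.
kdf.loc[:, 'y'] = kdf['x'] * 2

# A narrower selector, e.g. kdf.loc[kdf['x'] > 1, 'y'] = 0, would raise
# SparkPandasNotImplementedError in this implementation.
print(kdf)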
Example #4
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if (not isinstance(key, tuple)) or (len(key) != 2):
            raise NotImplementedError("Only accepts pairs of candidates")

        rows_sel, cols_sel = key

        if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
            raise SparkPandasNotImplementedError(
                description="""Can only assign value to the whole dataframe, the row index
                has to be `slice(None)` or `:`""",
                pandas_function=".loc[..., ...] = ...",
                spark_target_function="withColumn, select")

        if not isinstance(cols_sel, str):
            raise ValueError("only column names can be assigned")

        if isinstance(value, Series):
            self._kdf[cols_sel] = value
        elif isinstance(value, DataFrame) and len(value.columns) == 1:
            from pyspark.sql.functions import _spark_col
            self._kdf[cols_sel] = _spark_col(value.columns[0])
        elif isinstance(value, DataFrame) and len(value.columns) != 1:
            raise ValueError("Only a dataframe with one column can be assigned")
        else:
            raise ValueError("Only a column or a dataframe with a single column can be assigned")
Example #5
def _make_col(c):
    # Normalize the input to a Spark Column: pass Columns through, wrap
    # string column names, and reject everything else.
    if isinstance(c, Column):
        return c
    elif isinstance(c, str):
        from pyspark.sql.functions import _spark_col
        return _spark_col(c)
    else:
        raise SparkPandasNotImplementedError(
            description="Can only convert a string to a column type.")
Example #6
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False,
                drop_first=False, dtype=None):
    if sparse is not False:
        raise NotImplementedError("get_dummies currently does not support sparse")

    if isinstance(columns, string_types):
        columns = [columns]
    if dtype is None:
        dtype = 'byte'

    if isinstance(data, Column):
        # A single column: encode it on its own against a one-column frame.
        if prefix is not None:
            prefix = [str(prefix)]
        columns = [data.name]
        df = data.to_dataframe()
        remaining_columns = []
    else:
        if isinstance(prefix, string_types):
            raise ValueError("get_dummies currently does not support prefix as string types")
        df = data.copy()
        if columns is None:
            columns = [column for column in df.columns
                       if isinstance(data.schema[column].dataType,
                                     _get_dummies_default_accept_types)]
        if len(columns) == 0:
            return df

        if prefix is None:
            prefix = columns

        column_set = set(columns)
        remaining_columns = [df[column] for column in df.columns if column not in column_set]

    if any(not isinstance(data.schema[column].dataType, _get_dummies_acceptable_types)
           for column in columns):
        raise ValueError("get_dummies currently only accepts {} values"
                         .format(', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

    if prefix is not None and len(columns) != len(prefix):
        raise ValueError(
            "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
            .format(len(prefix), len(columns)))

    # Collect the distinct values of every encoded column in a single Spark job.
    all_values = _reduce_spark_multi(df, [F._spark_collect_set(F._spark_col(column))
                                          ._spark_alias(column)
                                          for column in columns])
    for i, column in enumerate(columns):
        values = sorted(all_values[i])
        # Drop the first level to avoid collinear dummies if requested.
        if drop_first:
            values = values[1:]

        def column_name(value):
            if prefix is None:
                return str(value)
            else:
                return '{}{}{}'.format(prefix[i], prefix_sep, value)

        # One indicator column per value; missing entries count as 0 unless
        # dummy_na adds an explicit NaN indicator column.
        for value in values:
            remaining_columns.append((df[column].notnull() & (df[column] == value))
                                     .astype(dtype)
                                     .alias(column_name(value)))
        if dummy_na:
            remaining_columns.append(df[column].isnull().astype(dtype).alias(column_name('nan')))

    return df[remaining_columns]
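
A hedged usage sketch for this function, assuming an early koalas release exposes it as databricks.koalas.get_dummies; the data, column names, and options are illustrative:

import databricks.koalas as ks

kdf = ks.DataFrame({'color': ['red', 'blue', 'red', None],
                    'size': [1, 2, 1, 3]})

# One byte-typed indicator column per distinct 'color' value, prefixed 'color_'.
print(ks.get_dummies(kdf, columns=['color']))

# Add an explicit NaN indicator column and drop the first level.
print(ks.get_dummies(kdf, columns=['color'], dummy_na=True, drop_first=True))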