Example No. 1
def filter_array_like(array_like,
                      op,
                      value,
                      mask=None,
                      out=None,
                      strict_date_types=False):
    """
    Filter an array-like object using operations defined in the predicates

    Parameters
    ----------
    array_like: array-like, c.f. pd.api.types.is_array_like
        The array like object to be filtered
    op: str
        The comparison operator; one of ``==``, ``!=``, ``<=``, ``>=``,
        ``<``, ``>`` or ``in``.
    value: object
        The value (or list of values for ``in``) to compare against.
    mask: boolean array-like, optional
        A boolean array-like that is combined with the result of the
        comparison using a logical AND. An all-True mask gives the same
        result as passing no mask at all.
    out: array-like, optional
        An array into which the result is stored. If provided, it must have a
        shape that the inputs broadcast to. If not provided or None, a
        freshly-allocated array is returned.
    strict_date_types: bool
        If False (default), cast all datelike values to datetime64 for comparison.
    """
    if mask is None:
        mask = np.ones(len(array_like), dtype=bool)

    if out is None:
        out = np.empty(len(array_like), dtype=bool)

    # datetime is a subclass of date which is why we need this double check
    date_type_to_cast = datetime.datetime if strict_date_types else datetime.date
    if isinstance(value, date_type_to_cast):
        value = pd.Timestamp(value).to_datetime64()
    elif is_list_like(value) and len(value) and isinstance(
            value[0], date_type_to_cast):
        value = [pd.Timestamp(val).to_datetime64() for val in value]

    with np.errstate(invalid="ignore"):
        if op == "==":
            np.logical_and(array_like == value, mask, out=out)
        elif op == "!=":
            np.logical_and(array_like != value, mask, out=out)
        elif op == "<=":
            np.logical_and(array_like <= value, mask, out=out)
        elif op == ">=":
            np.logical_and(array_like >= value, mask, out=out)
        elif op == "<":
            np.logical_and(array_like < value, mask, out=out)
        elif op == ">":
            np.logical_and(array_like > value, mask, out=out)
        elif op == "in":
            value = np.asarray(value)
            np.logical_and(
                np.isin(array_like, value)
                if len(value) > 0 else np.zeros(len(array_like), dtype=bool),
                mask,
                out=out,
            )
        else:
            raise NotImplementedError("op not supported")

    return out
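
# --- Usage sketch (not part of the original snippet) ---
# A minimal, hedged illustration of the helper above, assuming numpy is
# imported as np and filter_array_like is in scope; the data is made up.
values = np.array([1, 5, 10, 20])
row_mask = np.array([True, True, True, False])

# Keep entries equal to 5 or 10; the last position is masked out anyway.
result = filter_array_like(values, "in", [5, 10], mask=row_mask)
# result -> array([False,  True,  True, False])
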
Example No. 2
    def compute_aesthetics(self, plot):
        """
        Return a dataframe where the columns match the
        aesthetic mappings.

        Transformations like 'factor(cyl)' and other
        expression evaluations are done here.
        """
        data = self.data
        aesthetics = self.layer_mapping(plot.mapping)

        # Override grouping if set in layer.
        with suppress(KeyError):
            aesthetics['group'] = self.geom.aes_params['group']

        env = EvalEnvironment.capture(eval_env=plot.environment)
        env = env.with_outer_namespace({'factor': pd.Categorical})

        # Using `type` preserves the subclass of pd.DataFrame
        evaled = type(data)(index=data.index)

        # If a column name is not in the data, it is evaluated/transformed
        # in the environment of the call to ggplot
        for ae, col in aesthetics.items():
            if isinstance(col, six.string_types):
                if col in data:
                    evaled[ae] = data[col]
                else:
                    try:
                        new_val = env.eval(col, inner_namespace=data)
                    except Exception as e:
                        raise PlotnineError(
                            _TPL_EVAL_FAIL.format(ae, col, str(e)))

                    try:
                        evaled[ae] = new_val
                    except Exception as e:
                        raise PlotnineError(
                            _TPL_BAD_EVAL_TYPE.format(
                                ae, col, str(type(new_val)), str(e)))
            elif pdtypes.is_list_like(col):
                n = len(col)
                if len(data) and n != len(data) and n != 1:
                    raise PlotnineError(
                        "Aesthetics must either be length one, " +
                        "or the same length as the data")
                # An empty dataframe does not admit a scalar value
                elif len(evaled) and n == 1:
                    col = col[0]
                evaled[ae] = col
            elif is_known_scalar(col):
                if not len(evaled):
                    col = [col]
                evaled[ae] = col
            else:
                msg = "Do not know how to deal with aesthetic '{}'"
                raise PlotnineError(msg.format(ae))

        evaled_aes = aes(**dict((col, col) for col in evaled))
        plot.scales.add_defaults(evaled, evaled_aes)

        if len(data) == 0 and len(evaled) > 0:
            # No data, and vectors supplied to aesthetics
            evaled['PANEL'] = 1
        else:
            evaled['PANEL'] = data['PANEL']

        self.data = add_group(evaled)
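
# --- Usage sketch (not part of the original snippet) ---
# compute_aesthetics runs internally while a plot is being built; a hedged
# sketch of the public plotnine API that exercises it, assuming plotnine and
# its bundled mtcars dataset are available.
from plotnine import ggplot, aes, geom_bar
from plotnine.data import mtcars

# 'factor(cyl)' is evaluated by compute_aesthetics through the expression
# environment, turning the numeric cyl column into a categorical.
p = ggplot(mtcars, aes('factor(cyl)')) + geom_bar()
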
Example No. 3
    def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
            align='left', vmin=None, vmax=None):
        """
        Draw bar chart in the cell backgrounds.

        Parameters
        ----------
        subset : IndexSlice, optional
            A valid slice for `data` to limit the style application to.
        axis : int, str or None, default 0
            Apply to each column (`axis=0` or `'index'`)
            or to each row (`axis=1` or `'columns'`) or
            to the entire DataFrame at once with `axis=None`.
        color : str or 2-tuple/list
            If a str is passed, the color is the same for both
            negative and positive numbers. If 2-tuple/list is used, the
            first element is the color_negative and the second is the
            color_positive (eg: ['#d65f5f', '#5fba7d']).
        width : float, default 100
            A number between 0 and 100. The largest value will cover `width`
            percent of the cell's width.
        align : {'left', 'zero', 'mid'}, default 'left'
            How to align the bars with the cells.

            - 'left' : the min value starts at the left of the cell.
            - 'zero' : a value of zero is located at the center of the cell.
            - 'mid' : the center of the cell is at (max-min)/2, or
              if values are all negative (positive) the zero is aligned
              at the right (left) of the cell.

              .. versionadded:: 0.20.0

        vmin : float, optional
            Minimum bar value, defining the left hand limit
            of the bar drawing range, lower values are clipped to `vmin`.
            When None (default): the minimum value of the data will be used.

            .. versionadded:: 0.24.0

        vmax : float, optional
            Maximum bar value, defining the right hand limit
            of the bar drawing range, higher values are clipped to `vmax`.
            When None (default): the maximum value of the data will be used.

            .. versionadded:: 0.24.0

        Returns
        -------
        self : Styler
        """
        if align not in ('left', 'zero', 'mid'):
            raise ValueError("`align` must be one of {'left', 'zero', 'mid'}")

        if not (is_list_like(color)):
            color = [color, color]
        elif len(color) == 1:
            color = [color[0], color[0]]
        elif len(color) > 2:
            raise ValueError("`color` must be string or a list-like"
                             " of length 2: [`color_neg`, `color_pos`]"
                             " (eg: color=['#d65f5f', '#5fba7d'])")

        subset = _maybe_numeric_slice(self.data, subset)
        subset = _non_reducing_slice(subset)
        self.apply(self._bar, subset=subset, axis=axis,
                   align=align, colors=color, width=width,
                   vmin=vmin, vmax=vmax)

        return self
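
# --- Usage sketch (not part of the original snippet) ---
# This looks like pandas' Styler.bar; a short, hedged example of calling it
# through DataFrame.style. The data and column names are illustrative.
import pandas as pd

df = pd.DataFrame({"A": [-2, 5, 3], "B": [1, 4, -1]})

# Two colors: the first for negative values, the second for positive ones.
# align='mid' lets bars grow in both directions away from zero.
styled = df.style.bar(color=["#d65f5f", "#5fba7d"], align="mid",
                      vmin=-5, vmax=5)
html = styled.render()  # Styler.to_html() on newer pandas versions
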
Example No. 4
def qplot(x=None,
          y=None,
          data=None,
          facets=None,
          margins=False,
          geom='auto',
          xlim=None,
          ylim=None,
          log='',
          main=None,
          xlab=None,
          ylab=None,
          asp=None,
          **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x: str | array_like
        x aesthetic
    y: str | array_like
        y aesthetic
    data: pandas.DataFrame
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the
        current environment.
    geom: str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified.
    xlim: tuple
        x-axis limits
    ylim: tuple
        y-axis limits
    log: 'x' | 'y' | 'xy'
        Which variables to log transform.
    main: str
        Plot title
    xlab: str
        x-axis label
    ylab: str
        y-axis label
    asp: str | float
        The y/x aspect ratio.
    kwargs: dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values wrapped in I(), e.g. "I('red')" or "I(4)", are not
    # treated as mappings

    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in the I() function
        """
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    I_env = EvalEnvironment([{'I': I}])

    for ae in six.viewkeys(kwargs) & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms
    if is_string(geom):
        geom = [geom]
    elif isinstance(geom, tuple):
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurrences of 'auto' in lst with str2
        """
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar, and geom_histogram
            # otherwise. But we need to evaluate the mapping to find
            # out the dtype
            env = environment.with_outer_namespace({'factor': pd.Categorical})

            if isinstance(aesthetics['x'], six.string_types):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')

        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_aesthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError("Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn("Could not determine the type of faceting, "
             "therefore no faceting.")
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params
        recognized = (
            six.viewkeys(kwargs) &
            (six.viewkeys(geom_klass.DEFAULT_PARAMS) | geom_klass.aesthetics()
             | six.viewkeys(stat_klass.DEFAULT_PARAMS)
             | stat_klass.aesthetics()))
        recognized = recognized - six.viewkeys(aesthetics)
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & six.viewkeys(kwargs):
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if main:
        p += ggtitle(main)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
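
# --- Usage sketch (not part of the original snippet) ---
# This appears to be plotnine's qplot; a minimal, hedged example. The data
# below is generated purely for illustration.
import numpy as np

x = np.random.uniform(size=100)
y = 2 * x + np.random.normal(scale=0.1, size=100)

# Both x and y are given, so geom='auto' resolves to 'point'.
p = qplot(x, y, main='quick scatter', xlab='x', ylab='2x + noise')
p.save('quick_scatter.png')  # or simply display `p` in a notebook
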
Example No. 5
def is_2d(x):
    return is_list_like(x) or is_slice(x)
Example No. 6
    def reorder_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Reorder categories as specified in new_categories.

        `new_categories` needs to include all old categories and no new
        category items.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, optional
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        inplace : bool, default False
           Whether or not to reorder the categories inplace or return a copy of
           this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        cat : Series or None
            Categorical with reordered categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the new categories do not contain all old category items or any
            new ones

        See Also
        --------
        rename_categories : Rename categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.reorder_categories(['c', 'b', 'a'], ordered=True)  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['c' < 'b' < 'a']
        """
        if inplace:
            warnings.warn(
                "The `inplace` parameter in reorder_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".
                format(new_categories))
        elif len(set(new_categories)) != len(set(self.categories)) or any(
                cat not in self.categories for cat in new_categories):
            raise ValueError(
                "items in new_categories are not the same as in old categories"
            )

        if ordered is None:
            ordered = self.ordered

        if new_categories == list(self.categories) and ordered == self.ordered:
            if inplace:
                return None
            else:
                return self._data.copy()
        else:
            dtype = CategoricalDtype(categories=new_categories,
                                     ordered=ordered)
            psser = _to_cat(self._data).astype(dtype)

            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
Example No. 7
    def compute_aesthetics(self, plot):
        """
        Return a dataframe where the columns match the
        aesthetic mappings.

        Transformations like 'factor(cyl)' and other
        expression evaluations are done here.
        """
        data = self.data
        aesthetics = self.layer_mapping(plot.mapping)

        env = EvalEnvironment.capture(eval_env=plot.environment)
        env = env.with_outer_namespace(AES_INNER_NAMESPACE)

        # Using `type` preserves the subclass of pd.DataFrame
        evaled = type(data)(index=data.index)

        # Override grouping if set in layer.
        if 'group' in self.geom.aes_params:
            evaled['group'] = self.geom.aes_params['group']
            if 'group' in aesthetics:
                del aesthetics['group']

        # If a column name is not in the data, it is evaluated/transformed
        # in the environment of the call to ggplot
        for ae, col in aesthetics.items():
            if isinstance(col, str):
                if col in data:
                    evaled[ae] = data[col]
                else:
                    try:
                        new_val = env.eval(col, inner_namespace=data)
                    except Exception as e:
                        raise PlotnineError(
                            _TPL_EVAL_FAIL.format(ae, col, str(e)))

                    try:
                        evaled[ae] = new_val
                    except Exception as e:
                        raise PlotnineError(
                            _TPL_BAD_EVAL_TYPE.format(ae, col,
                                                      str(type(new_val)),
                                                      str(e)))
            elif pdtypes.is_list_like(col):
                n = len(col)
                if len(data) and n != len(data) and n != 1:
                    raise PlotnineError(
                        "Aesthetics must either be length one, " +
                        "or the same length as the data")
                # An empty dataframe does not admit a scalar value
                elif len(evaled) and n == 1:
                    col = col[0]
                evaled[ae] = col
            elif is_known_scalar(col):
                if not len(evaled):
                    col = [col]
                evaled[ae] = col
            else:
                msg = "Do not know how to deal with aesthetic '{}'"
                raise PlotnineError(msg.format(ae))

        evaled_aes = aes(**dict((col, col) for col in evaled))
        plot.scales.add_defaults(evaled, evaled_aes)

        if len(data) == 0 and len(evaled) > 0:
            # No data, and vectors supplied to aesthetics
            evaled['PANEL'] = 1
        else:
            evaled['PANEL'] = data['PANEL']

        self.data = add_group(evaled)
Example No. 8
def verify_column_type(df: pd.DataFrame) -> None:
    """Verify that the columns have the right type."""
    if not is_string_dtype(df['text']):
        raise ValueError('The "text" column should be of string type')
    # A Series is itself always list-like, so check each element instead.
    elif not df['labels'].map(is_list_like).all():
        raise ValueError('The "labels" column should be of list type')
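
# --- Usage sketch (not part of the original snippet) ---
# A hedged example of the validator above; the 'text' and 'labels' column
# names come from the function, the contents are made up.
import pandas as pd

df = pd.DataFrame({
    "text": ["first sample", "second sample"],
    "labels": [["a"], ["a", "b"]],
})
verify_column_type(df)  # passes silently; raises ValueError otherwise
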
Example No. 9
    def __init__(self, sdf: spark.DataFrame,
                 index_map: Optional[List[IndexMap]] = None,
                 scol: Optional[spark.Column] = None,
                 data_columns: Optional[List[str]] = None,
                 column_index: Optional[List[Tuple[str, ...]]] = None,
                 column_index_names: Optional[List[str]] = None) -> None:
        """
        Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
        index fields and names.

        :param sdf: Spark DataFrame to be managed.
        :param index_map: list of string pairs
                           Each pair holds the index field name which exists in Spark fields,
                           and the index name.
        :param scol: Spark Column to be managed.
        :param data_columns: list of strings
                              Field names to appear as columns. If scol is not None, this
                              argument is ignored, otherwise if this is None, calculated from sdf.
        :param column_index: list of tuples with the same length
                              The multi-level values in the tuples.
        :param column_index_names: Names for each of the index levels.
        """
        assert isinstance(sdf, spark.DataFrame)
        if index_map is None:
            # Here a Koalas DataFrame is created directly from a Spark DataFrame.
            assert "__index_level_0__" not in sdf.schema.names, \
                "Default index column should not appear in columns of the Spark DataFrame"

            # Create default index.
            index_map = [('__index_level_0__', None)]
            sdf = _InternalFrame.attach_default_index(sdf)

        assert index_map is not None
        assert all(isinstance(index_field, str)
                   and (index_name is None or (isinstance(index_name, tuple)
                                               and all(isinstance(name, str)
                                                       for name in index_name)))
                   for index_field, index_name in index_map), index_map
        assert scol is None or isinstance(scol, spark.Column)
        assert data_columns is None or all(isinstance(col, str) for col in data_columns)

        self._sdf = sdf  # type: spark.DataFrame
        self._index_map = index_map  # type: List[IndexMap]
        self._scol = scol  # type: Optional[spark.Column]
        if scol is not None:
            self._data_columns = sdf.select(scol).columns
        elif data_columns is None:
            index_columns = set(index_column for index_column, _ in self._index_map)
            self._data_columns = [column for column in sdf.columns if column not in index_columns]
        else:
            self._data_columns = data_columns

        if scol is not None:
            assert column_index is not None and len(column_index) == 1, column_index
            assert all(idx is None or (isinstance(idx, tuple) and len(idx) > 0)
                       for idx in column_index), column_index
            self._column_index = column_index
        elif column_index is None:
            self._column_index = [(col,) for col in self._data_columns]
        else:
            assert len(column_index) == len(self._data_columns)
            assert all(isinstance(i, tuple) for i in column_index), column_index
            assert len(set(len(i) for i in column_index)) <= 1, column_index
            self._column_index = column_index

        if column_index_names is not None and not is_list_like(column_index_names):
            raise ValueError('Column_index_names should be list-like or None for a MultiIndex')

        if isinstance(column_index_names, list):
            if all(name is None for name in column_index_names):
                self._column_index_names = None
            else:
                self._column_index_names = column_index_names
        else:
            self._column_index_names = column_index_names
Example No. 10
def evaluate(aesthetics, data, env):
    """
    Evaluate aesthetics

    Parameters
    ----------
    aesthetics : dict-like
        Aesthetics to evaluate. They must be of the form {name: expr}
    data : pd.DataFrame
        Dataframe whose columns are (or may be) variables in the aesthetic
        expressions, i.e. it acts as a namespace of variables.
    env : ~patsy.eval.EvalEnvironment
        Environment in which the aesthetics are evaluated

    Returns
    -------
    evaled : pd.DataFrame
        Dataframe of the form {name: result}, where each column is the
        result from evaluating an expression.

    Examples
    --------
    >>> import patsy
    >>> var1 = 2
    >>> env = patsy.eval.EvalEnvironment.capture()
    >>> df = pd.DataFrame({'x': range(1, 6)})
    >>> aesthetics = {'y': 'x**var1'}
    >>> evaluate(aesthetics, df, env)
        y
    0   1
    1   4
    2   9
    3  16
    4  25
    """
    env = env.with_outer_namespace(AES_INNER_NAMESPACE)

    # Using `type` preserves the subclass of pd.DataFrame
    evaled = type(data)(index=data.index)

    # If a column name is not in the data, it is evaluated/transformed
    # in the environment of the call to ggplot
    for ae, col in aesthetics.items():
        if isinstance(col, str):
            if col in data:
                evaled[ae] = data[col]
            else:
                try:
                    new_val = env.eval(col, inner_namespace=data)
                except Exception as e:
                    raise PlotnineError(_TPL_EVAL_FAIL.format(ae, col, str(e)))

                try:
                    evaled[ae] = new_val
                except Exception as e:
                    raise PlotnineError(
                        _TPL_BAD_EVAL_TYPE.format(ae, col, str(type(new_val)),
                                                  str(e)))
        elif pdtypes.is_list_like(col):
            n = len(col)
            if len(data) and n != len(data) and n != 1:
                raise PlotnineError("Aesthetics must either be length one, " +
                                    "or the same length as the data")
            # An empty dataframe does not admit a scalar value
            elif len(evaled) and n == 1:
                col = col[0]
            evaled[ae] = col
        elif is_known_scalar(col):
            if not len(evaled):
                col = [col]
            evaled[ae] = col
        else:
            msg = "Do not know how to deal with aesthetic '{}'"
            raise PlotnineError(msg.format(ae))

    return evaled
Example No. 11
    def __init__(
        self,
        spark_frame: spark.DataFrame,
        index_map: Optional[Dict[str, Optional[Tuple[str, ...]]]],
        column_labels: Optional[List[Tuple[str, ...]]] = None,
        data_spark_columns: Optional[List[spark.Column]] = None,
        column_label_names: Optional[List[str]] = None,
        spark_column: Optional[spark.Column] = None,
    ) -> None:
        """
        Create a new internal immutable DataFrame to manage Spark DataFrame, column fields and
        index fields and names.

        :param spark_frame: Spark DataFrame to be managed.
        :param index_map: dictionary of string pairs
                           Each pair holds the index field name which exists in Spark fields,
                           and the index name.
        :param column_labels: list of tuples with the same length
                              The multi-level values in the tuples.
        :param data_spark_columns: list of Spark Column
                                   Spark Columns to appear as columns. If spark_column is not None,
                                   this argument is ignored, otherwise if this is None, calculated
                                   from spark_frame.
        :param column_label_names: Names for each of the index levels.
        :param spark_column: Spark Column to be managed.

        See the examples below for what each parameter means.

        >>> column_labels = pd.MultiIndex.from_tuples(
        ...     [('a', 'x'), ('a', 'y'), ('b', 'z')], names=["column_labels_a", "column_labels_b"])
        >>> row_index = pd.MultiIndex.from_tuples(
        ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
        ...     names=["row_index_a", "row_index_b"])
        >>> kdf = ks.DataFrame(
        ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
        >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
        >>> kdf  # doctest: +NORMALIZE_WHITESPACE
        column_labels_a                  a  b
        column_labels_b                  y  z
        row_index_a row_index_b (a, x)
        foo         bar         1       2  3
                                4       5  6
        zoo         bar         7       8  9

        >>> internal = kdf[('a', 'y')]._internal

        >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
        +-----------------+-----------------+------+------+------+...
        |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
        +-----------------+-----------------+------+------+------+...
        |              foo|              bar|     1|     2|     3|...
        |              foo|              bar|     4|     5|     6|...
        |              zoo|              bar|     7|     8|     9|...
        +-----------------+-----------------+------+------+------+...

        >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
        OrderedDict([('__index_level_0__', ('row_index_a',)),
         ('__index_level_1__', ('row_index_b',)),
         ('(a, x)', ('a', 'x'))])

        >>> internal._column_labels
        [('a', 'y')]

        >>> internal._data_spark_columns
        [Column<b'(a, y)'>]

        >>> list(internal._column_label_names)
        ['column_labels_a', 'column_labels_b']

        >>> internal.spark_column
        Column<b'(a, y)'>
        """

        assert isinstance(spark_frame, spark.DataFrame)
        assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

        if index_map is None:
            assert not any(
                SPARK_INDEX_NAME_PATTERN.match(name)
                for name in spark_frame.columns
            ), ("Index columns should not appear in columns of the Spark DataFrame. Avoid "
                "index column names [%s]." % SPARK_INDEX_NAME_PATTERN)

            # Create default index.
            spark_frame = _InternalFrame.attach_default_index(spark_frame)
            index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

        if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
            spark_frame = spark_frame.withColumn(
                NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id())

        assert isinstance(index_map, OrderedDict), index_map
        assert all(
            isinstance(index_field, str) and (
                index_name is None or (isinstance(index_name, tuple) and all(
                    isinstance(name, str) for name in index_name)))
            for index_field, index_name in index_map.items()), index_map
        assert spark_column is None or isinstance(spark_column, spark.Column)
        assert data_spark_columns is None or all(
            isinstance(scol, spark.Column) for scol in data_spark_columns)

        self._sdf = spark_frame  # type: spark.DataFrame
        self._index_map = index_map  # type: Dict[str, Optional[Tuple[str, ...]]]
        self._spark_column = spark_column  # type: Optional[spark.Column]
        if spark_column is not None:
            self._data_spark_columns = [spark_column]
        elif data_spark_columns is None:
            index_columns = set(index_column
                                for index_column in self._index_map)
            self._data_spark_columns = [
                scol_for(spark_frame, col) for col in spark_frame.columns
                if col not in index_columns and col not in HIDDEN_COLUMNS
            ]
        else:
            self._data_spark_columns = data_spark_columns

        if spark_column is not None:
            assert column_labels is not None and len(
                column_labels) == 1, column_labels
            assert all(
                label is None or (isinstance(label, tuple) and len(label) > 0)
                for label in column_labels), column_labels
            self._column_labels = column_labels
        elif column_labels is None:
            self._column_labels = [(spark_frame.select(scol).columns[0], )
                                   for scol in self._data_spark_columns]
        else:
            assert len(column_labels) == len(self._data_spark_columns), (
                len(column_labels),
                len(self._data_spark_columns),
            )
            assert all(isinstance(i, tuple)
                       for i in column_labels), column_labels
            assert len(set(len(i) for i in column_labels)) <= 1, column_labels
            self._column_labels = column_labels

        if column_label_names is not None and not is_list_like(
                column_label_names):
            raise ValueError(
                "Column_index_names should be list-like or None for a MultiIndex"
            )

        if isinstance(column_label_names, list):
            if all(name is None for name in column_label_names):
                self._column_label_names = None
            else:
                self._column_label_names = column_label_names
        else:
            self._column_label_names = column_label_names
Example No. 12
def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
    """
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, and ``aggfunc`` must be all scalar.
    ``values`` can be scalar or list-like.

    Parameters
    ----------
    df : DataFrame
    index : scalar
        column to be the index
    columns : scalar
        column to be the columns
    values : scalar or list(scalar)
        column(s) to aggregate
    aggfunc : {'mean', 'sum', 'count'}, default 'mean'

    Returns
    -------
    table : DataFrame

    See Also
    --------
    pandas.DataFrame.pivot_table
    """

    if not is_scalar(index) or index is None:
        raise ValueError("'index' must be the name of an existing column")
    if not is_scalar(columns) or columns is None:
        raise ValueError("'columns' must be the name of an existing column")
    if not is_categorical_dtype(df[columns]):
        raise ValueError("'columns' must be category dtype")
    if not has_known_categories(df[columns]):
        raise ValueError("'columns' must have known categories. Please use "
                         "`df[columns].cat.as_known()` beforehand to ensure "
                         "known categories")
    if not (is_list_like(values) and all([is_scalar(v) for v in values])
            or is_scalar(values)):
        raise ValueError(
            "'values' must refer to an existing column or columns")
    if not is_scalar(aggfunc) or aggfunc not in ("mean", "sum", "count"):
        raise ValueError("aggfunc must be either 'mean', 'sum' or 'count'")

    # _emulate can't work for empty data
    # the result must have CategoricalIndex columns

    columns_contents = pd.CategoricalIndex(df[columns].cat.categories,
                                           name=columns)
    if is_scalar(values):
        new_columns = columns_contents
    else:
        new_columns = pd.MultiIndex.from_product(
            (sorted(values), columns_contents), names=[None, columns])

    meta = pd.DataFrame(columns=new_columns,
                        dtype=np.float64,
                        index=pd.Index(df._meta[index]))

    kwargs = {"index": index, "columns": columns, "values": values}

    if aggfunc in ["sum", "mean"]:
        pv_sum = apply_concat_apply(
            [df],
            chunk=methods.pivot_sum,
            aggregate=methods.pivot_agg,
            meta=meta,
            token="pivot_table_sum",
            chunk_kwargs=kwargs,
        )

    if aggfunc in ["count", "mean"]:
        pv_count = apply_concat_apply(
            [df],
            chunk=methods.pivot_count,
            aggregate=methods.pivot_agg,
            meta=meta,
            token="pivot_table_count",
            chunk_kwargs=kwargs,
        )

    if aggfunc == "sum":
        return pv_sum
    elif aggfunc == "count":
        return pv_count
    elif aggfunc == "mean":
        return pv_sum / pv_count
    else:
        raise ValueError
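
# --- Usage sketch (not part of the original snippet) ---
# This looks like dask.dataframe's pivot_table; a hedged example, assuming
# dask is installed. Note that `columns` must be categorical with known
# categories, as enforced by the checks above.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "idx": [1, 1, 2, 2],
    "grp": pd.Categorical(["a", "b", "a", "b"]),
    "val": [1.0, 2.0, 3.0, 4.0],
})
ddf = dd.from_pandas(pdf, npartitions=2)

table = dd.pivot_table(ddf, index="idx", columns="grp",
                       values="val", aggfunc="mean")
print(table.compute())
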
Example No. 13
    def intersection(
            self, other: Union[DataFrame, Series, Index,
                               List]) -> "MultiIndex":
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : MultiIndex

        Examples
        --------
        >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        if isinstance(other, Series) or not is_list_like(other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        elif isinstance(other, DataFrame):
            raise ValueError("Index data must be 1-dimensional")
        elif isinstance(other, MultiIndex):
            spark_frame_other = other.to_frame().to_spark()
            keep_name = self.names == other.names
        elif isinstance(other, Index):
            # Always returns an empty MultiIndex if `other` is Index.
            return cast(MultiIndex, self.to_frame().head(0).index)
        elif not all(isinstance(item, tuple) for item in other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        else:
            other = MultiIndex.from_tuples(list(other))
            spark_frame_other = cast(MultiIndex, other).to_frame().to_spark()
            keep_name = True

        index_fields = self._index_fields_for_union_like(
            other, func_name="intersection")

        default_name = [
            SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)
        ]  # type: List
        spark_frame_self = self.to_frame(name=default_name).to_spark()
        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
        if keep_name:
            index_names = self._internal.index_names
        else:
            index_names = None

        internal = InternalFrame(
            spark_frame=spark_frame_intersected,
            index_spark_columns=[
                scol_for(spark_frame_intersected, col) for col in default_name
            ],
            index_names=index_names,
            index_fields=index_fields,
        )
        return cast(MultiIndex, DataFrame(internal).index)
Example No. 14
    def bar(self,
            subset=None,
            axis=0,
            color='#d65f5f',
            width=100,
            align='left'):
        """
        Color the background ``color`` proportional to the values in each column.
        Excludes non-numeric data by default.

        .. versionadded:: 0.17.1

        Parameters
        ----------
        subset: IndexSlice, default None
            a valid slice for ``data`` to limit the style application to
        axis: int
        color: str or 2-tuple/list
            If a str is passed, the color is the same for both
            negative and positive numbers. If 2-tuple/list is used, the
            first element is the color_negative and the second is the
            color_positive (eg: ['#d65f5f', '#5fba7d'])
        width: float
            A number between 0 and 100. The largest value will cover ``width``
            percent of the cell's width
        align : {'left', 'zero', 'mid'}, default 'left'
            - 'left' : the min value starts at the left of the cell
            - 'zero' : a value of zero is located at the center of the cell
            - 'mid' : the center of the cell is at (max-min)/2, or
              if values are all negative (positive) the zero is aligned
              at the right (left) of the cell

              .. versionadded:: 0.20.0

        Returns
        -------
        self : Styler
        """
        subset = _maybe_numeric_slice(self.data, subset)
        subset = _non_reducing_slice(subset)

        base = 'width: 10em; height: 80%;'

        if not (is_list_like(color)):
            color = [color, color]
        elif len(color) == 1:
            color = [color[0], color[0]]
        elif len(color) > 2:
            msg = ("Must pass `color` as string or a list-like"
                   " of length 2: [`color_negative`, `color_positive`]\n"
                   "(eg: color=['#d65f5f', '#5fba7d'])")
            raise ValueError(msg)

        if align == 'left':
            self.apply(self._bar_left,
                       subset=subset,
                       axis=axis,
                       color=color,
                       width=width,
                       base=base)
        elif align == 'zero':
            self.apply(self._bar_center_zero,
                       subset=subset,
                       axis=axis,
                       color=color,
                       width=width,
                       base=base)
        elif align == 'mid':
            self.apply(self._bar_center_mid,
                       subset=subset,
                       axis=axis,
                       color=color,
                       width=width,
                       base=base)
        else:
            msg = ("`align` must be one of {'left', 'zero', 'mid'}")
            raise ValueError(msg)

        return self
Example No. 15
    def remove_categories(self,
                          removals: Union[pd.Index, Any, List],
                          inplace: bool = False) -> Optional["ps.Series"]:
        """
        Remove the specified categories.

        `removals` must be included in the old categories. Values which were in
        the removed categories will be set to NaN.

        Parameters
        ----------
        removals : category or list of categories
           The categories which should be removed.
        inplace : bool, default False
           Whether or not to remove the categories inplace or return a copy of
           this categorical with removed categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series or None
            Categorical with removed categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the removals are not contained in the categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.remove_categories('b')  # doctest: +SKIP
        0      a
        1    NaN
        2    NaN
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['a', 'c']
        """
        if inplace:
            warnings.warn(
                "The `inplace` parameter in remove_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        categories: List[Any]
        if is_list_like(removals):
            categories = [cat for cat in removals if cat is not None]
        elif removals is None:
            categories = []
        else:
            categories = [removals]

        if any(cat not in self.categories for cat in categories):
            raise ValueError(
                "removals must all be in old categories: {{{cats}}}".format(
                    cats=", ".join(
                        set(
                            str(cat) for cat in categories
                            if cat not in self.categories))))

        if len(categories) == 0:
            if inplace:
                return None
            else:
                return self._data.copy()
        else:
            dtype = CategoricalDtype(
                [cat for cat in self.categories if cat not in categories],
                ordered=self.ordered)
            psser = self._data.astype(dtype)

            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
Example No. 16
    def _select_rows(self, rows_sel):
        from databricks.koalas.indexes import MultiIndex
        from databricks.koalas.series import Series

        if isinstance(rows_sel, Series):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            assert len(self._internal.index_spark_column_names) > 0
            if rows_sel.step is not None:
                LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                return None, None, None
            elif len(self._internal.index_spark_column_names) == 1:
                sdf = self._internal.spark_frame
                index = self._kdf_or_kser.index
                index_column = index.to_series()
                index_data_type = index_column.spark_type
                start = rows_sel.start
                stop = rows_sel.stop

                # Look up the '__natural_order__' values for start and stop
                # so that the natural row order is preserved.
                start_and_stop = (sdf.select(
                    index_column._scol, NATURAL_ORDER_COLUMN_NAME
                ).where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                ).collect())

                start = [row[1] for row in start_and_stop if row[0] == start]
                start = start[0] if len(start) > 0 else None

                stop = [row[1] for row in start_and_stop if row[0] == stop]
                stop = stop[-1] if len(stop) > 0 else None

                cond = []
                if start is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                            LongType()))
                if stop is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                            LongType()))

                # if index order is not monotonic increasing or decreasing
                # and specified values don't exist in index, raise KeyError
                if (start is None and rows_sel.start is not None) or (
                        stop is None and rows_sel.stop is not None):

                    inc = index_column.is_monotonic_increasing
                    if inc is False:
                        dec = index_column.is_monotonic_decreasing

                    if start is None and rows_sel.start is not None:
                        start = rows_sel.start
                        if inc is not False:
                            cond.append(index_column._scol >= F.lit(
                                start).cast(index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol <= F.lit(
                                start).cast(index_data_type))
                        else:
                            raise KeyError(rows_sel.start)
                    if stop is None and rows_sel.stop is not None:
                        stop = rows_sel.stop
                        if inc is not False:
                            cond.append(index_column._scol <= F.lit(stop).cast(
                                index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol >= F.lit(stop).cast(
                                index_data_type))
                        else:
                            raise KeyError(rows_sel.stop)

                return reduce(lambda x, y: x & y, cond), None, None
            else:
                index = self._kdf_or_kser.index
                index_data_type = [
                    f.dataType for f in index.to_series().spark_type
                ]

                start = rows_sel.start
                if start is not None:
                    if not isinstance(start, tuple):
                        start = (start, )
                    if len(start) == 0:
                        start = None
                stop = rows_sel.stop
                if stop is not None:
                    if not isinstance(stop, tuple):
                        stop = (stop, )
                    if len(stop) == 0:
                        stop = None

                depth = max(
                    len(start) if start is not None else 0,
                    len(stop) if stop is not None else 0)
                if depth == 0:
                    return None, None, None
                elif (depth > len(self._internal.index_map)
                      or not index.droplevel(
                          list(range(len(self._internal.index_map))
                               [depth:])).is_monotonic):
                    raise KeyError(
                        "Key length ({}) was greater than MultiIndex sort depth"
                        .format(depth))

                conds = []
                if start is not None:
                    cond = F.lit(True)
                    for scol, value, dt in list(
                            zip(self._internal.index_spark_columns, start,
                                index_data_type))[::-1]:
                        compare = MultiIndex._comparator_for_monotonic_increasing(
                            dt)
                        cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)),
                                      cond).otherwise(
                                          compare(scol,
                                                  F.lit(value).cast(dt),
                                                  spark.Column.__gt__))
                    conds.append(cond)
                if stop is not None:
                    cond = F.lit(True)
                    for scol, value, dt in list(
                            zip(self._internal.index_spark_columns, stop,
                                index_data_type))[::-1]:
                        compare = MultiIndex._comparator_for_monotonic_increasing(
                            dt)
                        cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)),
                                      cond).otherwise(
                                          compare(scol,
                                                  F.lit(value).cast(dt),
                                                  spark.Column.__lt__))
                    conds.append(cond)

                return reduce(lambda x, y: x & y, conds), None, None
        elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
            rows_sel = list(rows_sel)
            if len(rows_sel) == 0:
                return F.lit(False), None, None
            elif len(self._internal.index_spark_column_names) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return (
                        index_column._scol == F.lit(
                            rows_sel[0]).cast(index_data_type),
                        None,
                        None,
                    )
                else:
                    return (
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]),
                        None,
                        None,
                    )
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        else:
            if not isinstance(rows_sel, tuple):
                rows_sel = (rows_sel, )
            if len(rows_sel) > len(self._internal.index_map):
                raise SparkPandasIndexingError("Too many indexers")

            rows = [
                scol == value for scol, value in zip(
                    self._internal.index_spark_columns, rows_sel)
            ]
            return (
                reduce(lambda x, y: x & y, rows),
                None,
                len(self._internal.index_map) - len(rows_sel),
            )
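
# --- Usage sketch (not part of the original snippet) ---
# _select_rows backs the .loc indexer of Koalas objects; a hedged sketch of
# the kinds of lookups it resolves, assuming databricks.koalas is available.
import databricks.koalas as ks

kdf = ks.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40])

by_slice = kdf.loc[20:30]      # label slice -> the slice branch above
by_list = kdf.loc[[10, 40]]    # list of labels -> the is_list_like branch
by_bool = kdf.loc[kdf.x > 2]   # boolean Series -> the Series branch
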
Example No. 17
    def rename_categories(self,
                          new_categories: Union[list, dict, Callable],
                          inplace: bool = False) -> Optional["ps.Series"]:
        """
        Rename categories.

        Parameters
        ----------
        new_categories : list-like, dict-like or callable

            New categories which will replace old categories.

            * list-like: all items must be unique and the number of items in
              the new categories must match the existing number of categories.

            * dict-like: specifies a mapping from
              old categories to new. Categories not contained in the mapping
              are passed through and extra categories in the mapping are
              ignored.

            * callable : a callable that is called on all items in the old
              categories and whose return values comprise the new categories.

        inplace : bool, default False
            Whether or not to rename the categories inplace or return a copy of
            this categorical with renamed categories.

            .. deprecated:: 3.2.0

        Returns
        -------
        cat : Series or None
            Categorical with renamed categories or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If new categories are list-like and do not have the same number of
            items as the current categories or do not validate as categories

        See Also
        --------
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(["a", "a", "b"], dtype="category")
        >>> s.cat.rename_categories([0, 1])  # doctest: +SKIP
        0    0
        1    0
        2    1
        dtype: category
        Categories (2, int64): [0, 1]

        For dict-like ``new_categories``, extra keys are ignored and
        categories not in the dictionary are passed through

        >>> s.cat.rename_categories({'a': 'A', 'c': 'C'})  # doctest: +SKIP
        0    A
        1    A
        2    b
        dtype: category
        Categories (2, object): ['A', 'b']

        You may also provide a callable to create the new categories

        >>> s.cat.rename_categories(lambda x: x.upper())  # doctest: +SKIP
        0    A
        1    A
        2    B
        dtype: category
        Categories (2, object): ['A', 'B']
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in rename_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if is_dict_like(new_categories):
            categories = [
                cast(dict, new_categories).get(item, item)
                for item in self.categories
            ]
        elif callable(new_categories):
            categories = [new_categories(item) for item in self.categories]
        elif is_list_like(new_categories):
            if len(self.categories) != len(new_categories):
                raise ValueError(
                    "new categories need to have the same number of items as the old categories!"
                )
            categories = cast(list, new_categories)
        else:
            raise TypeError(
                "new_categories must be list-like, dict-like or callable.")

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            self._data.spark.column,
            field=self._data._internal.data_fields[0].copy(
                dtype=CategoricalDtype(categories=categories,
                                       ordered=self.ordered)),
        )

        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            return DataFrame(internal)._psser_for(
                self._data._column_label).copy()
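
A quick sanity check of the contract this method mirrors, using only plain pandas (not the Spark-backed code above); the three accepted forms of ``new_categories`` behave as documented:

import pandas as pd

s = pd.Series(["a", "a", "b"], dtype="category")

# list-like: must match the number of existing categories
print(s.cat.rename_categories([0, 1]).cat.categories.tolist())                 # [0, 1]

# dict-like: unknown keys are ignored, unmapped categories pass through
print(s.cat.rename_categories({"a": "A", "c": "C"}).cat.categories.tolist())   # ['A', 'b']

# callable: applied to every old category
print(s.cat.rename_categories(str.upper).cat.categories.tolist())              # ['A', 'B']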
Example No. 18
    def _select_rows(self, rows_sel):
        from databricks.koalas.series import Series

        if isinstance(rows_sel, Series):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            assert len(self._internal.index_spark_column_names) > 0
            if rows_sel.step is not None:
                LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                return None, None, None
            elif len(self._internal.index_spark_column_names) == 1:
                sdf = self._internal.spark_frame
                index = self._kdf_or_kser.index
                index_column = index.to_series()
                index_data_type = index_column.spark_type
                start = rows_sel.start
                stop = rows_sel.stop

                # look up the '__natural_order__' values corresponding to
                # start and stop, to preserve the natural row order.
                start_and_stop = (sdf.select(
                    index_column._scol, NATURAL_ORDER_COLUMN_NAME
                ).where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                ).collect())

                start = [row[1] for row in start_and_stop if row[0] == start]
                start = start[0] if len(start) > 0 else None

                stop = [row[1] for row in start_and_stop if row[0] == stop]
                stop = stop[-1] if len(stop) > 0 else None

                cond = []
                if start is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                            LongType()))
                if stop is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                            LongType()))

                # if index order is not monotonic increasing or decreasing
                # and specified values don't exist in index, raise KeyError
                if (start is None and rows_sel.start is not None) or (
                        stop is None and rows_sel.stop is not None):
                    inc, dec = (sdf.select(
                        index_column._is_monotonic()._scol.alias(
                            "__increasing__"),
                        index_column._is_monotonic_decreasing()._scol.alias(
                            "__decreasing__"),
                    ).select(
                        F.min(F.coalesce("__increasing__", F.lit(True))),
                        F.min(F.coalesce("__decreasing__", F.lit(True))),
                    ).first())
                    if start is None and rows_sel.start is not None:
                        start = rows_sel.start
                        if inc is not False:
                            cond.append(index_column._scol >= F.lit(
                                start).cast(index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol <= F.lit(
                                start).cast(index_data_type))
                        else:
                            raise KeyError(rows_sel.start)
                    if stop is None and rows_sel.stop is not None:
                        stop = rows_sel.stop
                        if inc is not False:
                            cond.append(index_column._scol <= F.lit(stop).cast(
                                index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol >= F.lit(stop).cast(
                                index_data_type))
                        else:
                            raise KeyError(rows_sel.stop)

                if len(cond) > 0:
                    return reduce(lambda x, y: x & y, cond), None, None
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
            rows_sel = list(rows_sel)
            if len(rows_sel) == 0:
                return F.lit(False), None, None
            elif len(self._internal.index_spark_column_names) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return (
                        index_column._scol == F.lit(
                            rows_sel[0]).cast(index_data_type),
                        None,
                        None,
                    )
                else:
                    return (
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]),
                        None,
                        None,
                    )
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        else:
            if not isinstance(rows_sel, tuple):
                rows_sel = (rows_sel, )
            if len(rows_sel) > len(self._internal.index_map):
                raise SparkPandasIndexingError("Too many indexers")

            rows = [
                scol == value for scol, value in zip(
                    self._internal.index_spark_columns, rows_sel)
            ]
            return (
                reduce(lambda x, y: x & y, rows),
                None,
                len(self._internal.index_map) - len(rows_sel),
            )
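
For orientation, the conditions built above reproduce plain pandas ``.loc`` row selection on Spark columns. Below is a minimal pandas-only sketch of the three selector kinds handled here (boolean Series, label list, label slice); it is not the Spark code path itself:

import pandas as pd

s = pd.Series([10, 20, 30, 40], index=["a", "b", "c", "d"])

print(s.loc[s > 15].index.tolist())   # boolean mask    -> ['b', 'c', 'd']
print(s.loc[["b", "d"]].tolist())     # label list      -> [20, 40]
print(s.loc["b":"c"].tolist())        # inclusive slice -> [20, 30]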
Example No. 19
    def set_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        rename: bool = False,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which will result in
        unused categories) or remove old categories (which results in values
        set to NaN). If `rename==True`, the categories will simply be renamed
        (fewer or more items than in the old categories will result in values
        set to NaN or in unused categories, respectively).

        This method can be used to perform more than one action of adding,
        removing, and reordering simultaneously and is therefore faster than
        performing the individual steps via the more specialised methods.

        On the other hand this method does not do checks (e.g., whether the
        old categories are included in the new categories on a reorder), which
        can result in surprising changes, for example when using special string
        dtypes, which do not consider an S1 string equal to a single-char
        Python string.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, default False
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        rename : bool, default False
           Whether or not the new_categories should be considered as a rename
           of the old categories or as reordered categories.
        inplace : bool, default False
           Whether or not to reorder the categories in-place or return a copy
           of this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series with reordered categories or None if inplace.

        Raises
        ------
        ValueError
            If new_categories does not validate as categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
        0    NaN
        1      b
        2      b
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['b', 'c']

        >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1, 2, 3]

        >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1 < 2 < 3]
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in set_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".
                format(new_categories))

        if ordered is None:
            ordered = self.ordered

        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
        scol = self._data.spark.column

        if rename:
            new_scol = (F.when(
                scol >= len(new_categories),
                SF.lit(-1).cast(
                    self._data.spark.data_type)).otherwise(scol).alias(
                        self._data._internal.data_spark_column_names[0]))

            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                new_scol,
                field=self._data._internal.data_fields[0].copy(
                    dtype=new_dtype),
            )

            if inplace:
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return DataFrame(internal)._psser_for(
                    self._data._column_label).copy()
        else:
            psser = self._data.astype(new_dtype)
            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser
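
As a hedged, pandas-only illustration of the behavior emulated above (assuming a pandas version that still accepts the ``rename`` keyword):

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")

# default: values missing from the new categories become NaN
print(s.cat.set_categories(["b", "c"]).isna().sum())           # 1 (the single 'a')

# rename=True: positions are kept and the labels are simply replaced
print(s.cat.set_categories([1, 2, 3], rename=True).tolist())   # [1, 2, 2, 3, 3, 3]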
Example No. 20
    def from_frame(df, names=None) -> "MultiIndex":
        """
        Make a MultiIndex from a DataFrame.

        Parameters
        ----------
        df : DataFrame
            DataFrame to be converted to MultiIndex.
        names : list-like, optional
            If no names are provided, use the column names, or tuple of column
            names if the columns form a MultiIndex. If a sequence, overwrite
            names with the given sequence.

        Returns
        -------
        MultiIndex
            The MultiIndex representation of the given DataFrame.

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
                                  of iterables.

        Examples
        --------
        >>> df = ks.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
        ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
        ...                   columns=['a', 'b'])
        >>> df  # doctest: +SKIP
              a       b
        0    HI    Temp
        1    HI  Precip
        2    NJ    Temp
        3    NJ  Precip

        >>> ks.MultiIndex.from_frame(df)  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['a', 'b'])

        Using explicit names, instead of the column names

        >>> ks.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['state', 'observation'])
        """
        if not isinstance(df, DataFrame):
            raise TypeError("Input must be a DataFrame")
        sdf = df.to_spark()

        if names is None:
            names = df._internal.column_labels
        elif not is_list_like(names):
            raise ValueError("Names should be list-like for a MultiIndex")
        else:
            names = [name if is_name_like_tuple(name) else (name,) for name in names]

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in sdf.columns],
            index_names=names,
        )
        return cast(MultiIndex, DataFrame(internal).index)
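
The Koalas version above builds an InternalFrame directly; the plain pandas equivalent (assuming only pandas is installed) gives the same logical result and is handy for comparing outputs:

import pandas as pd

df = pd.DataFrame([["HI", "Temp"], ["HI", "Precip"],
                   ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"])

mi = pd.MultiIndex.from_frame(df, names=["state", "observation"])
print(list(mi.names))   # ['state', 'observation']
print(mi[0])            # ('HI', 'Temp')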
Example No. 21
def is_boolean_array(x):
    return is_list_like(x) and all(map(is_bool, x))
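
A self-contained version with the imports the one-liner assumes (``is_list_like`` and ``is_bool`` from ``pandas.api.types``), plus a few checks:

from pandas.api.types import is_bool, is_list_like


def is_boolean_array(x):
    # True only for list-like inputs whose every element is a bool
    return is_list_like(x) and all(map(is_bool, x))


print(is_boolean_array([True, False, True]))   # True
print(is_boolean_array([True, 0, False]))      # False (0 is an int, not a bool)
print(is_boolean_array(True))                  # False (a scalar is not list-like)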
Example No. 22
    def to_frame(self, index=True, name=None) -> DataFrame:
        """
        Create a DataFrame with the levels of the MultiIndex as columns.
        Column ordering is determined by the DataFrame constructor with data as
        a dict.

        Parameters
        ----------
        index : boolean, default True
            Set the index of the returned DataFrame as the original MultiIndex.
        name : list / sequence of strings, optional
            The passed names should substitute index level names.

        Returns
        -------
        DataFrame : a DataFrame containing the original MultiIndex data.

        See Also
        --------
        DataFrame

        Examples
        --------
        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> idx = ks.MultiIndex.from_tuples(tuples, names=('number', 'color'))
        >>> idx  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        >>> idx.to_frame()  # doctest: +NORMALIZE_WHITESPACE
                      number color
        number color
        1      red         1   red
               blue        1  blue
        2      red         2   red
               blue        2  blue

        By default, the original Index is reused. To enforce a new Index:

        >>> idx.to_frame(index=False)
           number color
        0       1   red
        1       1  blue
        2       2   red
        3       2  blue

        To override the name of the resulting column, specify `name`:

        >>> idx.to_frame(name=['n', 'c'])  # doctest: +NORMALIZE_WHITESPACE
                      n     c
        number color
        1      red    1   red
               blue   1  blue
        2      red    2   red
               blue   2  blue
        """
        if name is None:
            name = [
                name if name is not None else (i,)
                for i, name in enumerate(self._internal.index_names)
            ]
        elif is_list_like(name):
            if len(name) != self._internal.index_level:
                raise ValueError("'name' should have same length as number of levels on index.")
            name = [n if is_name_like_tuple(n) else (n,) for n in name]
        else:
            raise TypeError("'name' must be a list / sequence of column names.")

        return self._to_frame(index=index, names=name)
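
A hedged pandas-only check of the two keyword behaviors documented above, ``index=`` and ``name=``:

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")],
    names=("number", "color"))

# index=False produces a fresh RangeIndex instead of reusing the MultiIndex
print(idx.to_frame(index=False).index.tolist())         # [0, 1, 2, 3]

# name= overrides the column names of the resulting DataFrame
print(idx.to_frame(name=["n", "c"]).columns.tolist())   # ['n', 'c']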
Example No. 23
    def bar(
        self,
        subset=None,
        axis=0,
        color="#d65f5f",
        width=100,
        align="left",
        vmin=None,
        vmax=None,
    ):
        """
        Draw bar chart in the cell backgrounds.

        Parameters
        ----------
        subset : IndexSlice, optional
            A valid slice for `data` to limit the style application to.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Apply to each column (``axis=0`` or ``'index'``), to each row
            (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
            with ``axis=None``.
        color : str or 2-tuple/list
            If a str is passed, the color is the same for both
            negative and positive numbers. If 2-tuple/list is used, the
            first element is the color_negative and the second is the
            color_positive (eg: ['#d65f5f', '#5fba7d']).
        width : float, default 100
            A number between 0 and 100. The largest value will cover `width`
            percent of the cell's width.
        align : {'left', 'zero', 'mid'}, default 'left'
            How to align the bars with the cells.

            - 'left' : the min value starts at the left of the cell.
            - 'zero' : a value of zero is located at the center of the cell.
            - 'mid' : the center of the cell is at (max-min)/2, or
              if values are all negative (positive) the zero is aligned
              at the right (left) of the cell.
        vmin : float, optional
            Minimum bar value, defining the left hand limit
            of the bar drawing range, lower values are clipped to `vmin`.
            When None (default): the minimum value of the data will be used.

            .. versionadded:: 0.24.0

        vmax : float, optional
            Maximum bar value, defining the right hand limit
            of the bar drawing range, higher values are clipped to `vmax`.
            When None (default): the maximum value of the data will be used.

            .. versionadded:: 0.24.0

        Returns
        -------
        self : Styler
        """
        if align not in ("left", "zero", "mid"):
            raise ValueError("`align` must be one of {'left', 'zero',' mid'}")

        if not (is_list_like(color)):
            color = [color, color]
        elif len(color) == 1:
            color = [color[0], color[0]]
        elif len(color) > 2:
            raise ValueError("`color` must be string or a list-like "
                             "of length 2: [`color_neg`, `color_pos`] "
                             "(eg: color=['#d65f5f', '#5fba7d'])")

        subset = _maybe_numeric_slice(self.data, subset)
        subset = _non_reducing_slice(subset)
        self.apply(
            self._bar,
            subset=subset,
            axis=axis,
            align=align,
            colors=color,
            width=width,
            vmin=vmin,
            vmax=vmax,
        )

        return self
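
A minimal usage sketch (assuming pandas with Jinja2 installed, since Styler renders HTML); the two-element ``color`` list colors negative and positive values separately:

import pandas as pd

df = pd.DataFrame({"profit": [3.0, -1.5, 2.2], "volume": [10, 40, 25]})

# first color for negative values, second for positive values
styler = df.style.bar(color=["#d65f5f", "#5fba7d"], align="mid", width=90)
html = styler.to_html() if hasattr(styler, "to_html") else styler.render()
print(len(html) > 0)   # True; the bars are emitted as inline CSS in the HTML table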
Example No. 24
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.indexes import Index
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".iloc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Index):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                pass
            elif (rows_sel.start is not None) or (rows_sel.step is not None):
                raiseNotImplemented("Cannot use start or step with Spark.")
            elif not isinstance(rows_sel.stop, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".
                    format(rows_sel.stop, type(rows_sel.stop)))
            elif rows_sel.stop >= 0:
                sdf = sdf.limit(rows_sel.stop)
            else:
                sdf = sdf.limit(sdf.count() + rows_sel.stop)
        else:
            raiseNotImplemented(
                ".iloc requires numeric slice or conditional boolean Index, "
                "got {}".format(rows_sel))

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            columns = [cols_sel._scol]
        elif isinstance(cols_sel, int):
            columns = [self._kdf._internal.data_scols[cols_sel]]
        elif cols_sel is None or cols_sel == slice(None):
            columns = self._kdf._internal.data_scols
        elif isinstance(cols_sel, slice):
            if all(s is None or isinstance(s, int)
                   for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
                columns = self._kdf._internal.data_scols[cols_sel]
            else:
                not_none = cols_sel.start if cols_sel.start is not None \
                    else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
                raise TypeError(
                    'cannot do slice indexing with these indexers {} of {}'.
                    format(not_none, type(not_none)))
        elif is_list_like(cols_sel):
            if all(isinstance(s, int) for s in cols_sel):
                columns = [
                    self._kdf._internal.scol_for(col)
                    for col in self._kdf.columns[cols_sel]
                ]
            else:
                raise TypeError('cannot perform reduce with flexible type')
        else:
            raise ValueError(
                "Location based indexing can only have [integer, integer slice, "
                "listlike of integers, boolean array] types, got {}".format(
                    cols_sel))

        try:
            sdf = sdf.select(self._kdf._internal.index_scols + columns)
            index_columns = self._kdf._internal.index_columns
            data_columns = [
                column for column in sdf.columns if column not in index_columns
            ]
            internal = _InternalFrame(sdf=sdf,
                                      data_columns=data_columns,
                                      index_map=self._kdf._internal.index_map)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))

        column_index = self._kdf._internal.column_index
        if cols_sel is not None:
            if isinstance(cols_sel, (Series, int)):
                column_index = None
            else:
                column_index = \
                    pd.MultiIndex.from_tuples(self._kdf._internal.column_index)[cols_sel].tolist()

        kdf = DataFrame(kdf._internal.copy(column_index=column_index))
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
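
The slice branch above maps ``.iloc[:stop]`` onto ``limit()``: a positive stop becomes ``limit(stop)`` and a negative stop becomes ``limit(count + stop)``. A pandas-only sketch of the row counts it is trying to match:

import pandas as pd

df = pd.DataFrame({"x": range(5)})

print(len(df.iloc[:3]))    # 3 -> expressed on Spark as sdf.limit(3)
print(len(df.iloc[:-2]))   # 3 -> expressed on Spark as sdf.limit(5 - 2)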
Example No. 25
def _df_filter(ranger, lasso, header=0, skiprows=None, names=None,
               skip_footer=0, index_col=None, has_index_names=None,
               parse_cols=None, parse_dates=False, date_parser=None,
               na_values=None, thousands=None, convert_float=True,
               verbose=False, squeeze=False, **kwds):
    """
    Converts captured values table as pandas DataFrame

    Doc below copied from :func:`pandas.io.read_excel()`:

    header : int, list of ints, default 0
        Row (0-indexed) to use for the column labels of the parsed
        DataFrame. If a list of integers is passed those row positions will
        be combined into a ``MultiIndex``
    skiprows : list-like
        Rows to skip at the beginning (0-indexed)
    skip_footer : int, default 0
        Rows at the end to skip (0-indexed)
    index_col : int, list of ints, default None
        Column (0-indexed) to use as the row labels of the DataFrame.
        Pass None if there is no such column.  If a list is passed,
        those columns will be combined into a ``MultiIndex``
    names : array-like, default None
        List of column names to use. If file contains no header row,
        then you should explicitly pass header=None
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the Excel cell content, and return the transformed
        content.
    parse_cols : int or list, default None
        * If None then parse all columns,
        * If int then indicates last column to be parsed
        * If list of ints then indicates list of column numbers to be parsed
        * If string then indicates comma separated list of column names and
          column ranges (e.g. "A:E" or "A,C,E:F")
    squeeze : boolean, default False
        If the parsed data only contains one column then return a Series
    na_values : list-like, default None
        List of additional strings to recognize as NA/NaN
    thousands : str, default None
        Thousands separator for parsing string columns to numeric.  Note that
        this parameter is only necessary for columns stored as TEXT in Excel,
        any numeric columns will automatically be parsed, regardless of display
        format.
    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to
    verbose : boolean, default False
        Indicate number of NA values placed in non-numeric columns
    engine: string, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None or xlrd
    convert_float : boolean, default True
        convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
        data will be read in as floats: Excel stores all numbers as floats
        internally
    has_index_names : boolean, default None
        DEPRECATED: for version 0.17+ index names will be automatically
        inferred based on index_col.  To read Excel output from 0.16.2 and
        prior that had saved index names, use True.
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.18.1
    #    https://github.com/pydata/pandas/releases/tag/v0.18.1

    skipfooter = kwds.pop('skipfooter', None)
    if skipfooter is not None:
        skip_footer = skipfooter

    _validate_header_arg(header)
    if has_index_names is not None:

        log.warning("\nThe has_index_names argument is deprecated; index names "
                    "will be automatically inferred based on index_col.\n"
                    "This argument is still necessary if reading Excel output "
                    "from 0.16.2 or prior with index names.")

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")
    if parse_dates:
        raise NotImplementedError("parse_dates keyword of read_excel "
                                  "is not implemented")

    if date_parser is not None:
        raise NotImplementedError("date_parser keyword of read_excel "
                                  "is not implemented")

    if not data:
        return pd.DataFrame()

    if pdtypes.is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None:
        if pdtypes.is_list_like(header):
            header_names = []
            control_row = [True for _ in data[0]]
            for row in header:
                if pdtypes.is_integer(skiprows):
                    row += skiprows
                try:
                    data[row], control_row = pdexcel._fill_mi_header(data[row], control_row)
                except TypeError:
                    ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                    #  https://github.com/pandas-dev/pandas/issues/12453
                    #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                    data[row] = pdexcel._fill_mi_header(data[row])

                header_name, data[row] = pdexcel._pop_header_name(
                    data[row], index_col)
                header_names.append(header_name)
        else:
            data[header] = pdexcel._trim_excel_header(data[header])

    if pdtypes.is_list_like(index_col):
        # forward fill values for MultiIndex index
        if not pdtypes.is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        for col in index_col:
            last = data[offset][col]
            for row in range(offset + 1, len(data)):
                if data[row][col] == '' or data[row][col] is None:
                    data[row][col] = last
                else:
                    last = data[row][col]

    if pdtypes.is_list_like(header) and len(header) > 1:
        has_index_names = True

    # Pandas expects '' instead of `None`!
    data = [['' if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data, header=header, index_col=index_col,
                                      has_index_names=has_index_names,
                                      na_values=na_values,
                                      thousands=thousands,
                                      parse_dates=parse_dates,
                                      date_parser=date_parser,
                                      skiprows=skiprows,
                                      skip_footer=skip_footer,
                                      squeeze=squeeze,
                                      **kwds)

        output = parser.read()
        if names is not None:
            output.columns = names
        if not squeeze or isinstance(output, pd.DataFrame):
            output.columns = output.columns.set_names(header_names)
    except pdiocom.EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso
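
The MultiIndex branch above forward-fills blank index cells left behind by merged Excel cells. A standalone sketch of just that loop on a toy table (no header row, so the offset is 0):

data = [
    ["HI", "Temp",   70],
    ["",   "Precip",  2],
    ["NJ", "Temp",   65],
    [None, "Precip",  3],
]

index_col, offset = [0], 0
for col in index_col:
    last = data[offset][col]
    for row in range(offset + 1, len(data)):
        if data[row][col] == '' or data[row][col] is None:
            data[row][col] = last
        else:
            last = data[row][col]

print([r[0] for r in data])   # ['HI', 'HI', 'NJ', 'NJ']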
Example No. 26
 def from_1darray_quantity(cls, quantity):
     if not is_list_like(quantity.magnitude):
         raise TypeError("quantity's magnitude is not list like")
     return cls(quantity.magnitude, quantity.units)
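
The classmethod assumes a pint-style quantity exposing ``.magnitude`` and ``.units``. A small sketch of such an input, assuming pint is installed:

import pint

ureg = pint.UnitRegistry()
q = ureg.Quantity([1.0, 2.0, 3.0], "meter")

print(q.magnitude.tolist())   # [1.0, 2.0, 3.0] -- list-like, so it would be accepted
print(str(q.units))           # 'meter'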
Example No. 27
def qplot(x=None, y=None, data=None, facets=None, margins=False,
          geom='auto', xlim=None, ylim=None, log='', main=None,
          xlab=None, ylab=None, asp=None, **kwargs):
    """
    Quick plot

    Parameters
    ----------
    x : str | array_like
        x aesthetic
    y : str | array_like
        y aesthetic
    data : dataframe
        Data frame to use (optional). If not specified,
        will create one, extracting arrays from the
        current environment.
    geom : str | list
        *geom(s)* to do the drawing. If ``auto``, defaults
        to 'point' if ``x`` and ``y`` are specified or
        'histogram' if only ``x`` is specified.
    xlim : tuple
        x-axis limits
    ylim : tuple
        y-axis limits
    log : str in ``{'x', 'y', 'xy'}``
        Which variables to log transform.
    main : str
        Plot title
    xlab : str
        x-axis label
    ylab : str
        y-axis label
    asp : str | float
        The y/x aspect ratio.
    **kwargs : dict
        Arguments passed on to the geom.

    Returns
    -------
    p: ggplot
        ggplot object
    """
    # Extract all recognizable aesthetic mappings from the parameters
    # String values, e.g. "I('red')" and "I(4)", are not treated as mappings

    environment = EvalEnvironment.capture(1)
    aesthetics = {} if x is None else {'x': x}
    if y is not None:
        aesthetics['y'] = y

    def is_mapping(value):
        """
        Return True if value is not enclosed in I() function
        """
        with suppress(AttributeError):
            return not (value.startswith('I(') and value.endswith(')'))
        return True

    def I(value):
        return value

    I_env = EvalEnvironment([{'I': I}])

    for ae in six.viewkeys(kwargs) & all_aesthetics:
        value = kwargs[ae]
        if is_mapping(value):
            aesthetics[ae] = value
        else:
            kwargs[ae] = I_env.eval(value)

    # List of geoms
    if is_string(geom):
        geom = [geom]
    elif isinstance(geom, tuple):
        geom = list(geom)

    if data is None:
        data = pd.DataFrame()

    # Work out plot data, and modify aesthetics, if necessary
    def replace_auto(lst, str2):
        """
        Replace all occurrences of 'auto' in lst with str2
        """
        for i, value in enumerate(lst):
            if value == 'auto':
                lst[i] = str2
        return lst

    if 'auto' in geom:
        if 'sample' in aesthetics:
            replace_auto(geom, 'qq')
        elif y is None:
            # If x is discrete we choose geom_bar, and
            # geom_histogram otherwise. But we need to
            # evaluate the mapping to find out the dtype.
            env = environment.with_outer_namespace(
                {'factor': pd.Categorical})

            if isinstance(aesthetics['x'], six.string_types):
                try:
                    x = env.eval(aesthetics['x'], inner_namespace=data)
                except Exception:
                    msg = "Could not evaluate aesthetic 'x={}'"
                    raise PlotnineError(msg.format(aesthetics['x']))
            elif not hasattr(aesthetics['x'], 'dtype'):
                x = np.asarray(aesthetics['x'])

            if array_kind.discrete(x):
                replace_auto(geom, 'bar')
            else:
                replace_auto(geom, 'histogram')

        else:
            if x is None:
                if pdtypes.is_list_like(aesthetics['y']):
                    aesthetics['x'] = range(len(aesthetics['y']))
                    xlab = 'range(len(y))'
                    ylab = 'y'
                else:
                    # We could solve the issue in layer.compute_aesthetics
                    # but it is not worth the extra complexity
                    raise PlotnineError(
                        "Cannot infer how long x should be.")
            replace_auto(geom, 'point')

    p = ggplot(aes(**aesthetics), data=data, environment=environment)

    def get_facet_type(facets):
        with suppress(PlotnineError):
            parse_grid_facets(facets)
            return 'grid'

        with suppress(PlotnineError):
            parse_wrap_facets(facets)
            return 'wrap'

        warn("Could not determine the type of faceting, "
             "therefore no faceting.")
        return 'null'

    if facets:
        facet_type = get_facet_type(facets)
        if facet_type == 'grid':
            p += facet_grid(facets, margins=margins)
        elif facet_type == 'wrap':
            p += facet_wrap(facets)
        else:
            p += facet_null()

    # Add geoms
    for g in geom:
        geom_name = 'geom_{}'.format(g)
        geom_klass = Registry[geom_name]
        stat_name = 'stat_{}'.format(geom_klass.DEFAULT_PARAMS['stat'])
        stat_klass = Registry[stat_name]
        # find params
        recognized = (six.viewkeys(kwargs) &
                      (six.viewkeys(geom_klass.DEFAULT_PARAMS) |
                       geom_klass.aesthetics() |
                       six.viewkeys(stat_klass.DEFAULT_PARAMS) |
                       stat_klass.aesthetics()))
        recognized = recognized - six.viewkeys(aesthetics)
        params = {ae: kwargs[ae] for ae in recognized}
        p += geom_klass(**params)

    # pd.Series objects have name attributes. In a dataframe, the
    # series have the name of the column.
    labels = {}
    for ae in scaled_aesthetics & six.viewkeys(kwargs):
        with suppress(AttributeError):
            labels[ae] = kwargs[ae].name

    with suppress(AttributeError):
        labels['x'] = xlab if xlab is not None else x.name

    with suppress(AttributeError):
        labels['y'] = ylab if ylab is not None else y.name

    if main is not None:
        labels['title'] = main

    if 'x' in log:
        p += scale_x_log10()

    if 'y' in log:
        p += scale_y_log10()

    if labels:
        p += labs(**labels)

    if asp:
        p += theme(aspect_ratio=asp)

    return p
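
The I() convention used above lets callers pass literal values such as "I('red')" that must not be treated as aesthetic mappings. A standalone sketch of that check, outside the qplot machinery:

def is_mapping(value):
    # True unless the value is wrapped in I(...), marking it a literal
    try:
        return not (value.startswith("I(") and value.endswith(")"))
    except AttributeError:
        return True  # non-strings (arrays, numbers) are treated as mappings


print(is_mapping("wt"))          # True  -> mapped to a data column
print(is_mapping("I('red')"))    # False -> literal, evaluated as-is
print(is_mapping([1, 2, 3]))     # True  -> array data is a mapping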
Example No. 28
    def add_categories(self,
                       new_categories: Union[pd.Index, Any, List],
                       inplace: bool = False) -> Optional["ps.Series"]:
        """
        Add new categories.

        `new_categories` will be included at the last/highest place in the
        categories and will be unused directly after this call.

        Parameters
        ----------
        new_categories : category or list-like of category
           The new categories to be included.
        inplace : bool, default False
           Whether or not to add the categories inplace or return a copy of
           this categorical with added categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series or None
            Categorical with new categories added or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If the new categories include old categories or do not validate as
            categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.add_categories('x')  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (4, object): ['a', 'b', 'c', 'x']
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in add_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        categories: List[Any]
        if is_list_like(new_categories):
            categories = list(new_categories)
        else:
            categories = [new_categories]

        if any(cat in self.categories for cat in categories):
            raise ValueError(
                "new categories must not include old categories: {{{cats}}}".
                format(cats=", ".join(
                    set(
                        str(cat) for cat in categories
                        if cat in self.categories))))

        internal = self._data._psdf._internal.with_new_spark_column(
            self._data._column_label,
            self._data.spark.column,
            field=self._data._internal.data_fields[0].copy(
                dtype=CategoricalDtype(list(self.categories) + categories,
                                       ordered=self.ordered)),
        )
        if inplace:
            self._data._psdf._update_internal_frame(internal)
            return None
        else:
            return DataFrame(internal)._psser_for(
                self._data._column_label).copy()
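
A hedged pandas-only check of the contract: new categories are appended at the end, and categories that already exist are rejected:

import pandas as pd

s = pd.Series(list("abbccc"), dtype="category")

print(s.cat.add_categories("x").cat.categories.tolist())   # ['a', 'b', 'c', 'x']

try:
    s.cat.add_categories(["b"])        # 'b' is already a category
except ValueError as exc:
    print(type(exc).__name__)          # ValueError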
Example No. 29
    def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
            align='left'):
        """
        Color the background ``color`` proportional to the values in each
        column.
        Excludes non-numeric data by default.

        Parameters
        ----------
        subset: IndexSlice, default None
            a valid slice for ``data`` to limit the style application to
        axis: int
        color: str or 2-tuple/list
            If a str is passed, the color is the same for both
            negative and positive numbers. If 2-tuple/list is used, the
            first element is the color_negative and the second is the
            color_positive (eg: ['#d65f5f', '#5fba7d'])
        width: float
            A number between 0 and 100. The largest value will cover ``width``
            percent of the cell's width
        align : {'left', 'zero', 'mid'}, default 'left'
            - 'left' : the min value starts at the left of the cell
            - 'zero' : a value of zero is located at the center of the cell
            - 'mid' : the center of the cell is at (max-min)/2, or
              if values are all negative (positive) the zero is aligned
              at the right (left) of the cell

              .. versionadded:: 0.20.0

        Returns
        -------
        self : Styler
        """
        subset = _maybe_numeric_slice(self.data, subset)
        subset = _non_reducing_slice(subset)

        base = 'width: 10em; height: 80%;'

        if not(is_list_like(color)):
            color = [color, color]
        elif len(color) == 1:
            color = [color[0], color[0]]
        elif len(color) > 2:
            msg = ("Must pass `color` as string or a list-like"
                   " of length 2: [`color_negative`, `color_positive`]\n"
                   "(eg: color=['#d65f5f', '#5fba7d'])")
            raise ValueError(msg)

        if align == 'left':
            self.apply(self._bar_left, subset=subset, axis=axis, color=color,
                       width=width, base=base)
        elif align == 'zero':
            self.apply(self._bar_center_zero, subset=subset, axis=axis,
                       color=color, width=width, base=base)
        elif align == 'mid':
            self.apply(self._bar_center_mid, subset=subset, axis=axis,
                       color=color, width=width, base=base)
        else:
            msg = ("`align` must be one of {'left', 'zero', 'mid'}")
            raise ValueError(msg)

        return self
Example No. 30
    def _parse_excel(self,
                     sheetname=0,
                     header=0,
                     skiprows=None,
                     names=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     convert_float=True,
                     true_values=None,
                     false_values=None,
                     verbose=False,
                     dtype=None,
                     squeeze=False,
                     **kwds):

        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        _validate_header_arg(header)
        if has_index_names is not None:
            warn(
                "\nThe has_index_names argument is deprecated; index names "
                "will be automatically inferred based on index_col.\n"
                "This argmument is still necessary if reading Excel output "
                "from 0.16.2 or prior with index names.",
                FutureWarning,
                stacklevel=3)

        if 'chunksize' in kwds:
            raise NotImplementedError("chunksize keyword of read_excel "
                                      "is not implemented")

        if parse_dates is True and index_col is None:
            warn("The 'parse_dates=True' keyword of read_excel was provided"
                 " without an 'index_col' keyword value.")

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
               appropriate object"""

            if cell_typ == XL_CELL_DATE:

                if xlrd_0_9_3:
                    # Use the newer xlrd datetime handling.
                    try:
                        cell_contents = \
                            xldate.xldate_as_datetime(cell_contents,
                                                      epoch1904)
                    except OverflowError:
                        return cell_contents
                    # Excel doesn't distinguish between dates and time,
                    # so we treat dates on the epoch as times only.
                    # Also, Excel supports 1900 and 1904 epochs.
                    year = (cell_contents.timetuple())[0:3]
                    if ((not epoch1904 and year == (1899, 12, 31))
                            or (epoch1904 and year == (1904, 1, 1))):
                        cell_contents = time(cell_contents.hour,
                                             cell_contents.minute,
                                             cell_contents.second,
                                             cell_contents.microsecond)
                else:
                    # Use the xlrd <= 0.9.2 date handling.
                    try:
                        dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                    except xldate.XLDateTooLarge:
                        return cell_contents

                    if dt[0] < MINYEAR:
                        cell_contents = time(*dt[3:])
                    else:
                        cell_contents = datetime(*dt)

            elif cell_typ == XL_CELL_ERROR:
                cell_contents = np.nan
            elif cell_typ == XL_CELL_BOOLEAN:
                cell_contents = bool(cell_contents)
            elif convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                val = int(cell_contents)
                if val == cell_contents:
                    cell_contents = val
            return cell_contents

        ret_dict = False
        if isinstance(sheetname, list):
            sheets = sheetname
            ret_dict = True
        elif sheetname is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheetname]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())
        output = OrderedDict()

        import xlrd
        from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        # Keep sheetname to maintain backwards compatibility.
        for asheetname in sheets:
            if verbose:
                print("Reading sheet %s" % asheetname)
            if isinstance(asheetname, compat.string_types):
                sheet = self.book.sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.book.sheet_by_index(asheetname)

            data = []
            should_parse = {}

            if sheet.nrows > 5000:
                raise Exception(
                    "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload"
                )
            elif kwds.get('MaxTest'):
                continue

            for i in range(sheet.nrows):

                row = []
                for j, (value, typ) in enumerate(
                        zip(sheet.row_values(i), sheet.row_types(i))):
                    if parse_cols is not None and j not in should_parse:
                        should_parse[j] = self._should_parse(j, parse_cols)

                    if parse_cols is None or should_parse[j]:
                        row.append(_parse_cell(value, typ))
                data.append(row)
#            output[asheetname] = data
            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None:
                if is_list_like(header):
                    header_names = []
                    control_row = [True for x in data[0]]
                    for row in header:
                        if is_integer(skiprows):
                            row += skiprows

                        data[row], control_row = _fill_mi_header(
                            data[row], control_row)
                        header_name, data[row] = _pop_header_name(
                            data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # forward fill values for MultiIndex index
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

            if is_list_like(header) and len(header) > 1:
                has_index_names = True

            if kwds.get('parsed'):
                try:
                    parser = TextParser(data,
                                        header=header,
                                        index_col=index_col,
                                        has_index_names=has_index_names,
                                        na_values=na_values,
                                        thousands=thousands,
                                        parse_dates=parse_dates,
                                        date_parser=date_parser,
                                        true_values=true_values,
                                        false_values=false_values,
                                        skiprows=skiprows,
                                        skipfooter=skip_footer,
                                        squeeze=squeeze,
                                        dtype=dtype,
                                        **kwds)
                    output[asheetname] = parser.read()
                    if names is not None:
                        output[asheetname].columns = names
                    if not squeeze or isinstance(output[asheetname],
                                                 DataFrame):
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                except EmptyDataError:
                    # No Data, return an empty DataFrame
                    output[asheetname] = DataFrame()
            else:
                output[asheetname] = data

        if ret_dict or kwds.get('MaxTest'):
            return output
        else:
            return output[asheetname]
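
The ``convert_float`` branch of ``_parse_cell`` above relies on Excel storing every number as a float; integral floats are cast back to int. A standalone sketch of just that conversion:

def convert_integral_float(cell_contents):
    # mirror of the convert_float branch: 1.0 -> 1, 1.5 stays a float
    val = int(cell_contents)
    if val == cell_contents:
        return val
    return cell_contents


print(convert_integral_float(1.0))   # 1
print(convert_integral_float(1.5))   # 1.5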