Code example #1
    def get_ind(sdf, ind):
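        # 'sdf' is a Spark DataFrame; 'ind' is None, an integer number of
        # evaluation points, or an explicit sequence of points.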
        def calc_min_max():
            if len(sdf.columns) > 1:
                min_col = F.least(*map(F.min, sdf))
                max_col = F.greatest(*map(F.max, sdf))
            else:
                min_col = F.min(sdf.columns[-1])
                max_col = F.max(sdf.columns[-1])
            return sdf.select(min_col, max_col).first()

        if ind is None:
            min_val, max_val = calc_min_max()
            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                1000,
            )
        elif is_integer(ind):
            min_val, max_val = calc_min_max()
            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                ind,
            )
        return ind
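
For reference, a NumPy-only sketch of the grid that get_ind returns: 1000 (or ind) evenly spaced evaluation points covering the data range, padded by half that range on each side. The Spark min/max aggregation is replaced by plain Python here purely for illustration.

    import numpy as np

    # Toy stand-in for the Spark min/max aggregation in calc_min_max().
    values = [2.0, 3.5, 7.0]
    min_val, max_val = min(values), max(values)
    sample_range = max_val - min_val          # 5.0

    # Same padding rule as get_ind: half the range on each side, 1000 points.
    ind = np.linspace(min_val - 0.5 * sample_range,
                      max_val + 0.5 * sample_range,
                      1000)
    print(ind[0], ind[-1])                    # -0.5 9.5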
Code example #2
    def _get_ind(self, y):
        # 'y' is a Spark DataFrame that selects one column.
        if self.ind is None:
            min_val, max_val = y.select(F.min(y.columns[-1]),
                                        F.max(y.columns[-1])).first()

            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                1000,
            )
        elif is_integer(self.ind):
            min_val, max_val = y.select(F.min(y.columns[-1]),
                                        F.max(y.columns[-1])).first()

            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                self.ind,
            )
        else:
            ind = self.ind
        return ind
Code example #3
File: common.py Project: wkerzendorf/pandas
def ensure_python_int(value: int | np.integer) -> int:
    """
    Ensure that a value is a python int.

    Parameters
    ----------
    value: int or numpy.integer

    Returns
    -------
    int

    Raises
    ------
    TypeError: if the value isn't an int or can't be converted to one.
    """
    if not (is_integer(value) or is_float(value)):
        if not is_scalar(value):
            raise TypeError(
                f"Value needs to be a scalar value, was type {type(value).__name__}"
            )
        raise TypeError(f"Wrong type {type(value)} for value {value}")
    try:
        new_value = int(value)
        assert new_value == value
    except (TypeError, ValueError, AssertionError) as err:
        raise TypeError(f"Wrong type {type(value)} for value {value}") from err
    return new_value
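
A brief usage sketch of ensure_python_int (assuming the definition above and its pandas.api.types helpers are in scope): NumPy integers and lossless floats are normalized to a plain int, while anything else raises TypeError.

    import numpy as np

    print(ensure_python_int(np.int64(5)))   # 5, as a plain Python int
    print(ensure_python_int(5.0))           # 5: floats pass when int(value) == value
    try:
        ensure_python_int(5.5)              # fails the int(value) == value check
    except TypeError as err:
        print(err)                          # Wrong type <class 'float'> for value 5.5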
Code example #4
File: core.py Project: yangrong688/spark
    def prepare_hist_data(data, bins):
        data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
        if is_integer(bins):
            # computes boundaries for the column
            bins = HistogramPlotBase.get_bins(data._to_spark(), bins)

        return numeric_data, bins
Code example #5
File: plot.py Project: takitsuba/koalas
    def _args_adjust(self):
        if is_integer(self.bins):
            summary = KoalasHistPlotSummary(self.data, self.data.name)
            # computes boundaries for the column
            self.bins = summary.get_bins(self.bins)

        if is_list_like(self.bottom):
            self.bottom = np.array(self.bottom)
Code example #6
    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                elif self.header is None:
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names (they will
                # get removed anyway) to the expected length.
                self.names = list(
                    range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        # error: Incompatible types in assignment (expression has type
        # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
        # Timedelta, Any]]], Index]", variable has type "Index")  [assignment]
        frame.columns, frame = self._do_date_conversions(  # type: ignore[assignment]
            frame.columns, frame)
        if self.index_col is not None:
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    self.index_col[i] = frame.columns[item]
                else:
                    # String case
                    if item not in frame.columns:
                        raise ValueError(f"Index {item} invalid")
            frame.set_index(self.index_col, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.kwds.get("dtype") is not None:
            try:
                frame = frame.astype(self.kwds.get("dtype"))
            except TypeError as e:
                # GH#44901 reraise to keep api consistent
                raise ValueError(e)
        return frame
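
To make the index_col handling above concrete, here is a small standalone sketch (hypothetical frame and column names, not project code) of how integer positions are resolved to column labels before set_index is called.

    import pandas as pd
    from pandas.api.types import is_integer

    frame = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    index_col = [0, "c"]  # a positional index mixed with a label

    for i, item in enumerate(index_col):
        if is_integer(item):
            index_col[i] = frame.columns[item]   # 0 -> "a"
        elif item not in frame.columns:
            raise ValueError(f"Index {item} invalid")

    frame.set_index(index_col, drop=True, inplace=True)
    print(list(frame.index.names))               # ['a', 'c']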
Code example #7
    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                elif self.header is None:
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names (they will
                # get removed anyway) to the expected length.
                self.names = list(
                    range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
        if self.index_col is not None:
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    self.index_col[i] = frame.columns[item]
                else:
                    # String case
                    if item not in frame.columns:
                        raise ValueError(f"Index {item} invalid")
            frame.set_index(self.index_col, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.kwds.get("dtype") is not None:
            frame = frame.astype(self.kwds.get("dtype"))
        return frame
Code example #8
    def prepare_hist_data(data, bins):
        # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
        from pyspark.pandas.series import Series

        if isinstance(data, Series):
            data = data.to_frame()

        numeric_data = data.select_dtypes(include=[
            "byte", "decimal", "integer", "float", "long", "double",
            np.datetime64
        ])

        # no empty frames or series allowed
        if len(numeric_data.columns) == 0:
            raise TypeError("Empty {0!r}: no numeric data to "
                            "plot".format(numeric_data.__class__.__name__))

        if is_integer(bins):
            # computes boundaries for the column
            bins = HistogramPlotBase.get_bins(data.to_spark(), bins)

        return numeric_data, bins
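
For orientation, a NumPy-only sketch of what the integer bins branch expands to. It assumes HistogramPlotBase.get_bins yields bins + 1 evenly spaced edges between the column minimum and maximum (the real implementation runs that min/max aggregation on Spark), so treat it as an illustration rather than the project's exact code.

    import numpy as np

    # Toy stand-in for the Spark min/max aggregation inside get_bins.
    col = np.array([1.0, 4.0, 4.5, 9.0])
    col_min, col_max = col.min(), col.max()

    bins = 4  # integer bin count, the case caught by is_integer(bins) above
    edges = np.linspace(col_min, col_max, bins + 1)
    print(edges)  # [1. 3. 5. 7. 9.]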
Code example #9
    def _compute_plot_data(self):
        # TODO: this logic is same with KdePlot. Might have to deduplicate it.
        from databricks.koalas.series import Series

        data = self.data
        if isinstance(data, Series):
            data = data.to_frame()

        numeric_data = data.select_dtypes(
            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
        )

        # no empty frames or series allowed
        if len(numeric_data.columns) == 0:
            raise TypeError(
                "Empty {0!r}: no numeric data to plot".format(numeric_data.__class__.__name__)
            )

        if is_integer(self.bins):
            # computes boundaries for the column
            self.bins = self._get_bins(data.to_spark(), self.bins)

        self.data = numeric_data
Code example #10
File: core.py Project: liuq4360/koalas
    def get_ind(sdf, ind):
        # 'sdf' is a Spark DataFrame that selects one column.

        if ind is None:
            min_val, max_val = sdf.select(F.min(sdf.columns[-1]),
                                          F.max(sdf.columns[-1])).first()

            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                1000,
            )
        elif is_integer(ind):
            min_val, max_val = sdf.select(F.min(sdf.columns[-1]),
                                          F.max(sdf.columns[-1])).first()

            sample_range = max_val - min_val
            ind = np.linspace(
                min_val - 0.5 * sample_range,
                max_val + 0.5 * sample_range,
                ind,
            )
        return ind
Code example #11
    def _args_adjust(self):
        from databricks.koalas.series import Series

        data = self.data
        if isinstance(data, Series):
            data = data.to_frame()

        numeric_data = data.select_dtypes(include=['byte', 'decimal', 'integer', 'float',
                                                   'long', 'double', np.datetime64])

        is_empty = not len(numeric_data.columns)

        # no empty frames or series allowed
        if is_empty:
            raise TypeError('Empty {0!r}: no numeric data to '
                            'plot'.format(numeric_data.__class__.__name__))

        if is_integer(self.bins):
            summary = KoalasHistPlotSummary(self.data, self.data.name)
            # computes boundaries for the column
            self.bins = summary.get_bins(self.bins)

        if is_list_like(self.bottom):
            self.bottom = np.array(self.bottom)
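
Finally, a minimal illustration of the is_integer check that every snippet above branches on (publicly exposed as pandas.api.types.is_integer): Python and NumPy integers pass, while floats and strings do not.

    import numpy as np
    from pandas.api.types import is_integer

    print(is_integer(3))            # True
    print(is_integer(np.int32(3)))  # True
    print(is_integer(3.0))          # False: float, even if integer-valued
    print(is_integer("3"))          # False: strings never qualify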