def hysteresis_class(x: pd.Series, y: pd.Series, x_fixed: pd.Series):
    """
    Determine the hysteresis class of the loop traced by ``x`` and ``y``.

    Arguments:
        x: First series of the loop; normalized before use.
        y: Second series of the loop; normalized before use.
        x_fixed: Fixed x positions at which rising and falling y values
            are evaluated.

    Returns:
        A tuple ``(diff_area, h, hyst_class)``. If ``x`` or ``y`` contains
        NaN, a message is printed and the tuple is an all-NaN Series indexed
        like ``x_fixed`` followed by two NaN scalars.
    """
    has_missing = x.isna().any() or y.isna().any()
    if has_missing:
        print(
            "x or y contain nan, cannot determine hysteresis class, returning nan"
        )
        return pd.Series(np.nan, index=x_fixed.index), np.nan, np.nan

    x_scaled = normalize(x)
    y_scaled = normalize(y)

    rise_high, rise_low, fall_high, fall_low = find_independent_indices(
        x_fixed, x_scaled)
    check_for_error(fall_low, rise_low)

    y_rise, y_fall = y_for_x_fixed(
        rise_high, rise_low, fall_high, fall_low, y_scaled, x_scaled, x_fixed)

    # Only the per-position area difference and the index h are used below.
    _, _, diff_area, h = area_hysteresis_index(x_fixed, y_rise, y_fall)

    # A non-finite index is reported as 0.
    if not np.isfinite(h):
        h = 0

    hyst_class = find_hysteresis_class(
        x, y, min(diff_area), max(diff_area), h)
    return diff_area, h, hyst_class
Example #2
0
def parse_integer(
    x: pd.Series, bareNumber: bool = True
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as integers.

    Arguments:
        x: Strings.
        bareNumber: Whether the numbers are bare, or padded with non-numeric text.

    Returns:
        Either parsed integers (nullable ``Int64``) or a parsing error.
    """
    if OPTIONS.raise_first_invalid_integer:
        # Fast path: cast all non-missing values at once; the first value
        # that cannot be cast is reported through the error note.
        missing = x.isna()
        try:
            converted = x.where(missing, x[~missing].astype(int))
            return converted.astype("Int64")
        except ValueError as error:
            return ValueTypeError(fieldType="integer", note=str(error))

    parser = _parse_integer if bareNumber else _extract_integer
    parsed = x.apply(parser, convert_dtype=False)

    # A value that was present but came back missing failed to parse.
    unparsed = x.notna() & parsed.isna()
    if unparsed.any():
        return ValueTypeError(
            fieldType="integer", values=x[unparsed].unique().tolist()
        )
    return parsed.astype("Int64")
Example #3
0
    def test_isna(self):
        ser = Series([0, 5.4, 3, nan, -0.001])
        expected = Series([False, False, False, True, False])
        tm.assert_series_equal(ser.isna(), expected)

        ser = Series(["hi", "", nan])
        expected = Series([False, False, True])
        tm.assert_series_equal(ser.isna(), expected)
Example #4
0
    def test_isna(self):
        ser = Series([0, 5.4, 3, nan, -0.001])
        expected = Series([False, False, False, True, False])
        tm.assert_series_equal(ser.isna(), expected)

        ser = Series(["hi", "", nan])
        expected = Series([False, False, True])
        tm.assert_series_equal(ser.isna(), expected)
Example #5
0
 def get_max(s: pd.Series):
     """
     Mark the position(s) of the maximum of ``s`` with their index label.

     Returns a Series indexed like ``s`` whose entries are the index label
     where ``s`` equals its maximum and NaN everywhere else.  For string
     labels the result has object dtype; otherwise the label is obtained by
     multiplying a 1/NaN indicator with the index.  An empty input yields an
     empty float Series instead of raising ``IndexError``.
     """
     ps = s.index
     if s.empty:
         # ps[0] below would raise on an empty index.
         return pd.Series(np.nan, index=ps, dtype=float, name=s.name)

     is_max = s == s.max()
     if isinstance(ps[0], str):
         # Build the labels as an object Series up front rather than writing
         # strings into a float Series (incompatible-dtype assignment).
         labels = pd.Series(ps, index=ps, dtype=object, name=s.name)
         return labels.where(is_max)
     return is_max.astype(int).replace(0, np.nan) * ps
def test_info_categorical_column_just_works():
    """Smoke test: ``isna`` and ``info`` run on a categorical Series and a slice of it."""
    size = 2500
    letters = np.array(list("abcdefghij"))
    picks = np.random.randint(0, 10, size=size)
    cat = Series(letters.take(picks)).astype("category")

    # Only checked for not raising; the results are not inspected.
    cat.isna()
    cat.info(buf=StringIO())

    only_d = cat[cat == "d"]
    only_d.info(buf=StringIO())
    def from_diff(cls, in_series: pd.Series, out_series: pd.Series) -> "ErrorCount":
        """
        Build an ErrorCount from the values that became null between input and output.

        A row counts as an error when it is null in ``out_series`` but was not
        null in ``in_series``.  With no such rows an empty ``ErrorCount()`` is
        returned; otherwise the count reports the first failing row, its
        input value, and the number of failing rows.
        """
        newly_null = out_series.isna() & ~in_series.isna()
        failed_rows = out_series.index[newly_null]

        if failed_rows.empty:
            return ErrorCount()

        first_row = int(failed_rows[0])  # np.int64 => int
        return ErrorCount(
            in_series.name,
            first_row,
            in_series[first_row],  # always str
            len(failed_rows),
            1,
        )
    def from_diff(cls, in_series: pd.Series, out_series: pd.Series) -> "ErrorCount":
        """
        Summarize conversion failures between ``in_series`` and ``out_series``.

        A failure is a row that is null in the output although it was not
        null in the input.  Returns an empty ``ErrorCount()`` when there are
        none; otherwise reports the first failing row (as ``int``), its input
        value (as ``str``) and the number of failing rows.
        """
        became_null = out_series.isna() & ~in_series.isna()
        error_rows = out_series.index[became_null]

        if not error_rows.empty:
            first = error_rows[0]
            original_value = in_series[first]
            return ErrorCount(
                in_series.name, int(first), str(original_value), len(error_rows), 1
            )
        return ErrorCount()
Example #9
0
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'],
                        categories=['a', 'b', 'missing', 'd'],
                        ordered=ordered)
    val = Series(['d', 'a', 'b', 'a', 'd', 'b'])
    df = DataFrame({'label': label, 'val': val})

    # aggregate on the Categorical
    result = (df.groupby('label', observed=observed, sort=sort)['val']
                .aggregate('first'))

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype='object')
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = 'missing'
    if not all(label == aggr):
        msg = ('Labels and aggregation results not consistently sorted\n' +
               'for (ordered={}, observed={}, sort={})\n' +
               'Result:\n{}').format(ordered, observed, sort, result)
        assert False, msg
Example #10
0
def parse_datetime(
    x: pd.Series, format: str = "default"
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as datetimes.

    Because :class:`pd.Timestamp` is used, dates are limited to the range 1677 - 2262
    (https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations).

    Arguments:
        x: Strings.
        format: Either 'default' (ISO8601: `YYYY-MM-DDThh:mm:ssZ`), 'any' (guess),
            or a pattern compatible with :meth:`datetime.datetime.strptime`.

    Returns:
        Either parsed datetimes (as :class:`pd.Timestamp`) or a parsing error.
    """
    if format == "default":
        pattern = "%Y-%m-%dT%H:%M:%S%z"
    elif format == "any":
        pattern = None
    else:
        pattern = format

    # Without an explicit pattern, let pandas infer the format.
    guess_format = pattern is None
    parsed = pd.to_datetime(
        x, errors="coerce", format=pattern, infer_datetime_format=guess_format
    )

    # Present in the input but coerced to NaT: the value failed to parse.
    failed = x.notna() & parsed.isna()
    if failed.any():
        return ValueTypeError(
            fieldType="datetime",
            fieldFormat=format,
            values=x[failed].unique().tolist(),
        )
    return parsed
Example #11
0
def parse_boolean(
    x: pd.Series,
    trueValues: Iterable[str] = ("true", "True", "TRUE", "1"),
    falseValues: Iterable[str] = ("false", "False", "FALSE", "0"),
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as boolean.

    Arguments:
        x: Strings.
        trueValues: Strings representing `True`.
        falseValues: Strings representing `False`.

    Returns:
        Either parsed boolean (as :class:`pd.Int64Dtype`) or a parsing error.
    """
    is_true = x.isin(trueValues)
    is_false = x.isin(falseValues)
    is_missing = x.isna()

    # Every entry must be a true string, a false string, or missing.
    recognized = is_true | is_false | is_missing
    if not recognized.all():
        unknown = x[~recognized].unique().tolist()
        return ValueTypeError(fieldType="boolean", values=unknown)

    result = is_true.astype(int).astype("Int64")
    result[is_missing] = np.nan
    return result
Example #12
0
 def encode(self, xs: pd.Series):
     """
     Fill the missing values of ``xs``.

     When the 'keep_original' config entry is truthy, the missing-value mask
     is stored on ``self.mask`` first.  The fill value is the result of
     calling the Series method named by ``self.by`` when it is set, and
     ``self.value`` otherwise.
     """
     if get_config('keep_original'):
         self.mask = xs.isna()

     if self.by is None:
         fill_value = self.value
     else:
         fill_value = getattr(xs, self.by)()
     return xs.fillna(fill_value)
Example #13
0
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'],
                        categories=['a', 'b', 'missing', 'd'],
                        ordered=ordered)
    val = Series(['d', 'a', 'b', 'a', 'd', 'b'])
    df = DataFrame({'label': label, 'val': val})

    # aggregate on the Categorical
    result = (df.groupby('label', observed=observed, sort=sort)['val']
                .aggregate('first'))

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype='object')
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = 'missing'
    if not all(label == aggr):
        msg = ('Labels and aggregation results not consistently sorted\n' +
               'for (ordered={}, observed={}, sort={})\n' +
               'Result:\n{}').format(ordered, observed, sort, result)
        assert False, msg
Example #14
0
def sanitize_series(series: pd.Series) -> pd.Series:
    """
    Build a Series conforming to Workbench data types.

    The return value is a valid argument to `hash_pandas_object()` and can be
    written to a parquet file.

    Specific fixes:

    * Categorical with numeric categories: converted with `pd.to_numeric`.
    * Categorical with non-string categories: categories cast to str.
    * Categorical: unused categories removed.
    * Any other non-numeric, non-datetime64 dtype: non-NA values cast to str.
    """
    if hasattr(series, 'cat'):
        categories = series.cat.categories
        if pd.api.types.is_numeric_dtype(categories):
            # Un-categorize: make array of int/float
            return pd.to_numeric(series)

        holds_strings = (categories.dtype == object
                         and pd.api.types.infer_dtype(categories) == 'string')
        if not holds_strings:
            # Cast non-Strings to String
            series = series.cat.rename_categories(categories.astype(str))
        return series.cat.remove_unused_categories()

    dtype = series.dtype
    if is_numeric_dtype(dtype) or is_datetime64_dtype(dtype):
        return series

    # Everything else: stringify, then restore the original missing entries.
    as_text = series.astype(str)
    as_text[series.isna()] = np.nan
    return as_text
Example #15
0
def test_getitem_boolean_empty():
    # An all-False (empty) boolean mask keeps index name and dtype.
    empty = Series([], dtype=np.int64)
    empty.index.name = 'index_name'
    filtered = empty[empty.isna()]
    assert filtered.index.name == 'index_name'
    assert filtered.dtype == np.int64

    # GH5877
    # indexing with empty series
    letters = Series(['A', 'B'])
    assert_series_equal(letters[Series(['C'], dtype=object)],
                        Series(np.nan, index=['C'], dtype=object))

    letters = Series(['A', 'B'])
    assert_series_equal(letters[Series([], dtype=object)],
                        Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    msg = (r"Unalignable boolean Series provided as indexer \(index of"
           r" the boolean Series and of the indexed object do not match")
    for bad_mask in (Series([], dtype=bool), Series([True], dtype=bool)):
        with pytest.raises(IndexingError, match=msg):
            letters[bad_mask]
Example #16
0
def parse_geopoint(
    x: pd.Series, format: Literal["default", "array", "object"] = "default"
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as geopoints.

    Arguments:
        x: Strings.
        format: Either 'default' ('<lon>,<lat>' or '<lon>, <lat>'),
            'array' ('[<lon>, <lat>]', whitespace-insensitive), or
            'object' ('{"lon": <lon>, "lat": <lat>}' or '{"lat": <lat>, "lon": <lon>}',
            whitespace-insensitive),
            where `<lon>` and `<lat>` are any values accepted by :class:`float`.

    Returns:
        Either parsed geopoints (as :class:`tuple`: lon, lat) or a parsing error.
    """
    # Only non-missing entries are handed to the extractor.
    present = x[x.notna()]
    extractors = {
        "default": _extract_geopoint_default,
        "array": _extract_geopoint_array,
        "object": _extract_geopoint_object,
    }
    parsed = present.apply(extractors[format])

    failed = parsed.isna()
    if failed.any():
        return ValueTypeError(
            fieldType="geopoint",
            fieldFormat=format,
            values=present[failed].unique().tolist(),
        )
    # Restore the original shape; entries missing in the input stay missing.
    return parsed.reindex_like(x)
Example #17
0
def validate_y(y: pd.Series) -> None:
    """
    Validate the response variable before it is used for modelling.

    Args
    ----
      y: pd.Series
          Response variable.

    Raises
    ------
      ValueError: if every value in `y` is null.
                  if `y` has fewer than 3 (three) non-null values.
                  if `y` is constant, i.e. its population standard deviation
                    (nulls skipped) is zero.
    """
    missing = y.isna()
    if missing.all():
        raise ValueError('Input response cannot have just Null values.')

    observed_count = (~missing).values.sum()
    if observed_count < 3:
        raise ValueError('Input response must have more than 3 non-null '
                         'points at least.')

    spread = y.std(skipna=True, ddof=0)
    if spread == 0:
        raise ValueError('Input response cannot be constant.')
Example #18
0
 def _predict_core(self, s: pd.Series) -> pd.Series:
     predicted = (s > (self.high if
                       (self.high is not None) else float("inf"))) | (
                           s < (self.low if
                                (self.low is not None) else -float("inf")))
     predicted[s.isna()] = np.nan
     return predicted
Example #19
0
def test_nanops_independent_of_mask_param(operation):
    # GH22764
    s = Series([1, 2, np.nan, 3, np.nan, 4])
    mask = s.isna()
    median_expected = operation(s)
    median_result = operation(s, mask=mask)
    assert median_expected == median_result
Example #20
0
def test_getitem_boolean_empty():
    # An all-False (empty) boolean mask keeps index name and dtype.
    empty = Series([], dtype=np.int64)
    empty.index.name = 'index_name'
    empty = empty[empty.isna()]
    assert empty.index.name == 'index_name'
    assert empty.dtype == np.int64

    # GH5877
    # indexing with empty series
    s = Series(['A', 'B'])
    assert_series_equal(s[Series(['C'], dtype=object)],
                        Series(np.nan, index=['C'], dtype=object))

    s = Series(['A', 'B'])
    assert_series_equal(s[Series([], dtype=object)],
                        Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    with pytest.raises(IndexingError):
        s[Series([], dtype=bool)]

    with pytest.raises(IndexingError):
        s[Series([True], dtype=bool)]
Example #21
0
def _get_pdf(series: pd.Series):
    s = series.loc[series.notna()]
    return pd.concat([
        series.loc[series.isna()],
        pd.Series(stats.norm.pdf(s), index=s.index)
    ],
                     verify_integrity=True)
Example #22
0
    def test_getitem_boolean_empty(self):
        # An all-False (empty) boolean mask keeps index name and dtype.
        empty = Series([], dtype=np.int64)
        empty.index.name = "index_name"
        filtered = empty[empty.isna()]
        assert filtered.index.name == "index_name"
        assert filtered.dtype == np.int64

        # GH#5877
        # indexing with empty series
        letters = Series(["A", "B"])
        tm.assert_series_equal(
            letters[Series([], dtype=object)],
            Series(dtype=object, index=Index([], dtype="int64")),
        )

        # invalid because of the boolean indexer
        # that's empty or not-aligned
        msg = (
            r"Unalignable boolean Series provided as indexer \(index of "
            r"the boolean Series and of the indexed object do not match"
        )
        for bad_mask in (Series([], dtype=bool), Series([True], dtype=bool)):
            with pytest.raises(IndexingError, match=msg):
                letters[bad_mask]
Example #23
0
 def test_isna(self):
     # GH 13737
     s = Series(
         [pd.Period('2011-01', freq='M'),
          pd.Period('NaT', freq='M')])
     tm.assert_series_equal(s.isna(), Series([False, True]))
     tm.assert_series_equal(s.notna(), Series([True, False]))
Example #24
0
def test_getitem_boolean_empty():
    # An all-False (empty) boolean mask keeps index name and dtype.
    ser = Series([], dtype=np.int64)
    ser.index.name = 'index_name'
    ser = ser[ser.isna()]
    assert ser.index.name == 'index_name'
    assert ser.dtype == np.int64

    # GH5877
    # indexing with empty series
    ser = Series(['A', 'B'])
    assert_series_equal(ser[Series(['C'], dtype=object)],
                        Series(np.nan, index=['C'], dtype=object))

    ser = Series(['A', 'B'])
    assert_series_equal(ser[Series([], dtype=object)],
                        Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    for values in ([], [True]):
        with pytest.raises(IndexingError):
            ser[Series(values, dtype=bool)]
Example #25
0
    def create_model(self, prior: R.SdPrior, data: pd.Series):
        """
        Build the state space model for `data` and attach its samplers.

        The model is stored on `self._model` and the input series on
        `self._original_series`.

        Args:
          prior: an R.SdPrior object describing the prior distribution on the
            residual variance parameter.  If None, a prior is built from the
            standard deviation of `data` (see below).
          data:  The time series of observations as a Pandas Series.

        Returns:
          A boom.StateSpaceModel object.
        """
        boom_data = boom.Vector(data.values)
        # Mask that is True wherever `data` is not missing.
        is_observed = ~data.isna()
        self._model = boom.StateSpaceModel(boom_data, is_observed)

        if prior is None:
            # Default prior: guess sigma as the standard deviation of the
            # data and cap it at 1.2 times that value.
            sdy = np.std(data)
            prior = R.SdPrior(sigma_guess=sdy, upper_limit=sdy * 1.2)

        # Sampler for the observation model, built from the prior's sample
        # size and sigma guess and limited by the prior's upper limit.
        boom_prior = boom.ChisqModel(prior.sample_size, prior.sigma_guess)
        observation_model_sampler = boom.ZeroMeanGaussianConjSampler(
            self._model.observation_model,
            boom_prior)
        observation_model_sampler.set_sigma_upper_limit(
            prior.upper_limit)
        self._model.observation_model.set_method(observation_model_sampler)

        # Sampler for the model as a whole, using the global random generator.
        sampler = boom.StateSpacePosteriorSampler(
            self._model, boom.GlobalRng.rng)
        self._model.set_method(sampler)

        self._original_series = data

        return self._model
Example #26
0
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(
        ["d", "a", "b", "a", "d", "b"],
        categories=["a", "b", "missing", "d"],
        ordered=ordered,
    )
    val = Series(["d", "a", "b", "a", "d", "b"])
    df = DataFrame({"label": label, "val": val})

    # aggregate on the Categorical
    result = df.groupby("label", observed=observed,
                        sort=sort)["val"].aggregate("first")

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype="object")
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = "missing"
    if not all(label == aggr):
        msg = ("Labels and aggregation results not consistently sorted\n"
               f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
               f"Result:\n{result}")
        assert False, msg
Example #27
0
def process_labels(labels: pd.Series, delim="|"):
    """
    Normalize a label column.

    Missing labels are replaced by the string "None".  For text labels, if
    any label contains ``delim``, every label is cut down to the part before
    its first ``delim``.  ``delim`` is always treated as a literal string,
    never as a regular expression.

    Anything that is not a ``pd.Series`` (including ``None``) is returned
    unchanged.  The input Series itself is never modified.
    """
    if not isinstance(labels, pd.Series):
        return labels

    if labels.isna().any():
        # fillna returns a new Series, so the caller's data is left alone.
        labels = labels.fillna("None")

    is_text = labels.dtype == "object" or isinstance(labels.dtype, pd.StringDtype)
    # regex=False: the default "|" as a pattern would match every string,
    # and delimiters such as "(" are not even valid patterns.
    if is_text and labels.str.contains(delim, regex=False).any():
        # partition splits on the literal separator; column 0 is the head.
        labels = labels.str.partition(delim)[0].astype(str)
    return labels
Example #28
0
 def test_isna(self):
     # GH 13737
     s = Series(
         [pd.Period("2011-01", freq="M"),
          pd.Period("NaT", freq="M")])
     tm.assert_series_equal(s.isna(), Series([False, True]))
     tm.assert_series_equal(s.notna(), Series([True, False]))
Example #29
0
def test_getitem_boolean_empty():
    # An all-False (empty) boolean mask keeps index name and dtype.
    ser = Series([], dtype=np.int64)
    ser.index.name = 'index_name'
    ser = ser[ser.isna()]
    assert ser.index.name == 'index_name'
    assert ser.dtype == np.int64

    # GH5877
    # indexing with empty series
    ser = Series(['A', 'B'])
    by_label = ser[Series(['C'], dtype=object)]
    assert_series_equal(by_label, Series(np.nan, index=['C'], dtype=object))

    ser = Series(['A', 'B'])
    by_nothing = ser[Series([], dtype=object)]
    assert_series_equal(by_nothing,
                        Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    msg = (r"Unalignable boolean Series provided as indexer \(index of"
           r" the boolean Series and of the indexed object do not match")
    for values in ([], [True]):
        with pytest.raises(IndexingError, match=msg):
            ser[Series(values, dtype=bool)]
Example #30
0
    def get_errors(self, series: pd.Series, column: 'column.Column'):
        """
        Return one ValidationWarning per entry of ``series`` that fails validation.

        An entry fails when ``self.validate`` is False for it.  If the column
        allows empty values, empty entries are never reported: NaN for
        numeric series, zero-length strings otherwise.
        """
        failing = ~self.validate(series)

        if column.allow_empty:
            # Only non-empty entries can fail.
            if np.issubdtype(series.dtype, np.number):
                non_empty = ~series.isna()
            else:
                non_empty = series.str.len() > 0
            failing = non_empty & failing

        # The index label is reported as the row; it is probably a row number.
        return [
            ValidationWarning(message=self.message,
                              value=series[label],
                              row=label,
                              column=series.name)
            for label in series.index[failing]
        ]
def disorder_mapping(X: Series,
                     y: Series,
                     bad_y: Any = 1,
                     null_value: List = None,
                     flag: int = 0) -> Dict:
    """
    Encode an unordered (categorical) variable into bin indices.

    :param X: variable data
    :param y: label data
    :param bad_y: label value that marks a bad sample
    :param null_value: values to be treated as missing
    :param flag: whether to take `null_value` into account (1) or only NaN
    :return: dict mapping each unique remaining value of X to a bin index
    """
    # Drop missing rows from both X and y.
    # NOTE(review): with flag == 1 and the default null_value=None,
    # X.isin(None) is presumably an error — confirm callers always pass a list.
    if flag == 1:
        mask = (X.isin(null_value) | X.isna())
        X = X[~mask]
        y = y[~mask]
    else:
        mask = X.isna()
        X = X[~mask]
        y = y[~mask]
    # Overall counts of bad (B) and remaining (G) samples.
    B = (y == bad_y).sum()
    G = y.size - B
    unique_value = X.unique()
    # Row i of `mask` marks the samples whose value equals unique_value[i].
    mask = (unique_value.reshape(-1, 1) == X.values)
    mask_bad = mask & (y.values == bad_y)
    # Per-value counts of bad (b) and remaining (g) samples.
    b = mask_bad.sum(axis=1)
    g = mask.sum(axis=1) - b
    # presumably weight-of-evidence per unique value — verify against woe_single_all
    woe_value = np.around(woe_single_all(B, G, b, g), 6)
    # NOTE(review): argsort returns the indices that would sort woe_value, not
    # the rank of each element; confirm that pairing it positionally with
    # unique_value is the intended ordering.
    woe_value_sort = np.argsort(woe_value)
    X = X.map(dict(zip(unique_value, woe_value_sort)))
    # Fit a small tree on the encoded variable; its split thresholds become
    # the bin edges.
    tree = DecisionTreeClassifier(max_leaf_nodes=6,
                                  min_samples_leaf=max(int(X.size * 0.05), 50))
    tree.fit(X.values.reshape(-1, 1), y)
    # Bin edges: -inf, the sorted thresholds of nodes splitting on feature 0, +inf.
    threshold = [-inf]
    threshold.extend(
        np.sort(tree.tree_.threshold[tree.tree_.feature == 0]).tolist())
    threshold.append(inf)
    # Integer bin index of every encoded value.
    index = pd.cut(woe_value_sort,
                   threshold,
                   right=True,
                   include_lowest=True,
                   labels=False)
    res = dict(zip(unique_value.tolist(), index.tolist()))

    return res
Example #32
0
def slim_col(
    x: pd.Series,
    nan_replacements: Tuple[int, str] = (-99, "blank"),
    inplace: bool = False,
) -> pd.Series:
    """
    Shrink a column to a smaller dtype.

    Numeric columns holding only integral values are cast to the narrowest
    integer type selected by the range checks below; other numeric columns
    keep their dtype.  String, categorical and object columns are cast to
    'category'.  NaN values are replaced first.  The dtype change is logged.

    :param x: data column
    :param nan_replacements: column nan replacements, including numeric nan and category nan values, i.e. -99 or 'blank'
    :param inplace: whether the NaN replacement is done in place on `x` (default is False)
    :return: slimmed column
    """
    num_nan, cat_nan = nan_replacements
    origin_type = x.dtype.name
    x_has_nan = x.isna().values.any()  # bool: True when the series contains NaN
    if is_numeric_dtype(x):
        if x_has_nan:
            if inplace:
                x.fillna(value=num_nan, inplace=inplace)
            else:
                x = x.fillna(value=num_nan, inplace=inplace)
        # NaNs found above have already been replaced by num_nan at this point
        as_int = x.fillna(0).astype(np.int64)
        # The column is integral when casting to int64 changes no value.
        is_int = np.allclose(x, as_int)
        mn, mx = x.min(), x.max()
        if is_int:
            # All bounds are strict, so a value equal to a type's limit
            # selects the next wider type.
            # NOTE(review): astype returns a new Series, so with inplace=True
            # only the NaN fill reaches the caller's object — confirm intended.
            if mn >= 0:
                if mx < 255:
                    x = x.astype(np.uint8)
                elif mx < 65535:
                    x = x.astype(np.uint16)
                elif mx < 4294967295:
                    x = x.astype(np.uint32)
                else:
                    x = x.astype(np.uint64)
            else:
                if -128 < mn and mx < 127:
                    x = x.astype(np.int8)
                elif -32768 < mn and mx < 32767:
                    x = x.astype(np.int16)
                elif -2147483648 < mn and mx < 2147483647:
                    x = x.astype(np.int32)
                elif -9223372036854775808 < mn and mx < 9223372036854775807:
                    x = x.astype(np.int64)
                else:
                    raise OverflowError("Integer overflow encountered!")

    # Categorical-like column: fill NaN with cat_nan ('blank' by default) and set the dtype to category; converting a string variable to a categorical variable will save memory
    elif is_string_dtype(x) or is_categorical_dtype(x) or is_object_dtype(x):
        if x_has_nan:
            if inplace:
                x.fillna(value=cat_nan, inplace=inplace)
            else:
                x = x.fillna(value=cat_nan, inplace=inplace)
        x = x.astype("category")

    logger.info("Column {} data type changes from `{}` to `{}`.".format(
        repr(x.name), origin_type, x.dtype.name))
    return x
 def __init__(self, column: pd.Series):
     self.data = column
     self.name = str(column.name)
     self.type = self.get_type()
     self.count = column.size
     self.count_distinct = column.nunique(dropna=False)
     self.count_null = column.isna().sum()
     self.max_groups_allowed = 20  # for group by operations
Example #34
0
def get_shift_digit(s: pd.Series) -> pd.Series:
    """
    Return a column holding the change of each value from the previous one.

    The column must be cleaned of NaN before calling this function; the
    first entry of the result is NaN because it has no predecessor.
    """
    assert s.notna().all(), "Необходимо очистить колонку от NaN"

    # Change towards the next row, moved down one row so that every entry
    # holds the change from the previous row.
    change_to_next = s.shift(-1) - s
    return change_to_next.shift(1)
Example #35
0
 def test_isna_for_inf(self):
     s = Series(['a', np.inf, np.nan, 1.0])
     with pd.option_context('mode.use_inf_as_na', True):
         r = s.isna()
         dr = s.dropna()
     e = Series([False, True, True, False])
     de = Series(['a', 1.0], index=[0, 3])
     tm.assert_series_equal(r, e)
     tm.assert_series_equal(dr, de)
Example #36
0
    def test_constructor_dtype_str_na_values(self, string_dtype):
        # https://github.com/pandas-dev/pandas/issues/21083
        ser = Series(['x', None], dtype=string_dtype)
        result = ser.isna()
        expected = Series([False, True])
        tm.assert_series_equal(result, expected)
        assert ser.iloc[1] is None

        ser = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(ser.iloc[1])
Example #37
0
    def test_bool_operators_with_nas(self, bool_op):
        # boolean &, |, ^ should work with object arrays and propagate NAs
        ser = Series(bdate_range('1/1/2000', periods=10), dtype=object)
        ser[::2] = np.nan

        mask = ser.isna()
        filled = ser.fillna(ser[0])

        result = bool_op(ser < ser[9], ser > ser[3])

        expected = bool_op(filled < filled[9], filled > filled[3])
        expected[mask] = False
        assert_series_equal(result, expected)
Example #38
0
    def test_isnull_for_inf_deprecated(self):
        # gh-17115
        s = Series(['a', np.inf, np.nan, 1.0])
        with tm.assert_produces_warning(DeprecationWarning,
                                        check_stacklevel=False):
            pd.set_option('mode.use_inf_as_null', True)
            r = s.isna()
            dr = s.dropna()
            pd.reset_option('mode.use_inf_as_null')

        e = Series([False, True, True, False])
        de = Series(['a', 1.0], index=[0, 3])
        tm.assert_series_equal(r, e)
        tm.assert_series_equal(dr, de)
Example #39
0
 def test_isna(self):
     ser = Series([0, 5.4, 3, nan, -0.001])
     np.array_equal(ser.isna(),
                    Series([False, False, False, True, False]).values)
     ser = Series(["hi", "", nan])
     np.array_equal(ser.isna(), Series([False, False, True]).values)
Example #40
0
 def test_isna(self):
     # GH 13737
     s = Series([pd.Period('2011-01', freq='M'),
                 pd.Period('NaT', freq='M')])
     tm.assert_series_equal(s.isna(), Series([False, True]))
     tm.assert_series_equal(s.notna(), Series([True, False]))