def hysteresis_class(x: pd.Series, y: pd.Series, x_fixed: pd.Series):
    """
    Determine the hysteresis class for the ``x`` / ``y`` pair.

    The inputs are normalized and passed through the module helpers
    (``find_independent_indices``, ``y_for_x_fixed``,
    ``area_hysteresis_index``) before ``find_hysteresis_class`` is called
    with the smallest and largest area difference and the index ``h``.

    Returns:
        A tuple ``(diff_area, h, hyst_class)``. When ``x`` or ``y`` contains
        NaN a message is printed and the tuple holds an all-NaN Series
        indexed like ``x_fixed`` followed by two NaN scalars.
    """
    has_missing = x.isna().any() or y.isna().any()
    if has_missing:
        print(
            "x or y contain nan, cannot determine hysteresis class, returning nan"
        )
        return pd.Series(np.nan, index=x_fixed.index), np.nan, np.nan

    x_norm = normalize(x)
    y_norm = normalize(y)

    rise_hi, rise_lo, fall_hi, fall_lo = find_independent_indices(
        x_fixed, x_norm)
    check_for_error(fall_lo, rise_lo)

    y_rise, y_fall = y_for_x_fixed(
        rise_hi, rise_lo, fall_hi, fall_lo, y_norm, x_norm, x_fixed)
    _, _, diff_area, h = area_hysteresis_index(x_fixed, y_rise, y_fall)

    smallest = min(diff_area)
    largest = max(diff_area)
    # A non-finite index is replaced by 0 before classification.
    if not np.isfinite(h):
        h = 0

    hyst_class = find_hysteresis_class(x, y, smallest, largest, h)
    return diff_area, h, hyst_class
def parse_integer(
    x: pd.Series, bareNumber: bool = True
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as integers.

    Arguments:
        x: Strings.
        bareNumber: When true each value is parsed with `_parse_integer`;
          otherwise `_extract_integer` is used (numbers padded with
          non-numeric text).

    Returns:
        The parsed values as `Int64`, or a :class:`ValueTypeError` describing
        the values that could not be parsed.
    """
    if OPTIONS.raise_first_invalid_integer:
        # Bulk conversion: only the first failure is reported, via the
        # ValueError message.
        missing = x.isna()
        try:
            return x.where(missing, x[~missing].astype(int)).astype("Int64")
        except ValueError as e:
            return ValueTypeError(fieldType="integer", note=str(e))

    parser = _parse_integer if bareNumber else _extract_integer
    parsed = x.apply(parser, convert_dtype=False)

    # A value is invalid when it was present in the input but parsed to NA.
    failed = parsed.isna() & ~x.isna()
    if not failed.any():
        return parsed.astype("Int64")
    return ValueTypeError(
        fieldType="integer", values=x[failed].unique().tolist()
    )
def test_isna(self):
    """``Series.isna`` flags NaN in both float and object data."""
    cases = [
        ([0, 5.4, 3, nan, -0.001], [False, False, False, True, False]),
        (["hi", "", nan], [False, False, True]),
    ]
    for values, flags in cases:
        tm.assert_series_equal(Series(values).isna(), Series(flags))
def get_max(s: pd.Series):
    """
    Mark the position(s) of the maximum of ``s`` with their index label.

    Args:
        s: Series to inspect.

    Returns:
        A Series aligned with ``s`` that holds the index label wherever the
        value equals ``s.max()`` and NaN everywhere else. For string labels
        the result has object dtype; otherwise the labels are multiplied into
        a numeric mask, as before. An empty input yields an empty result
        instead of raising ``IndexError``.
    """
    labels = s.index
    is_max = s == s.max()

    if len(labels) > 0 and isinstance(labels[0], str):
        # Build the object-dtype result directly instead of assigning strings
        # into a float series, which relies on deprecated silent upcasting.
        by_label = pd.Series(labels, index=labels, dtype=object, name=s.name)
        return by_label.where(is_max.to_numpy())

    return is_max.astype(int).replace(0, np.nan) * labels
def test_info_categorical_column_just_works():
    """Smoke test: ``info`` on a categorical Series and on a filtered slice."""
    n = 2500
    letters = np.array(list("abcdefghij"))
    picks = np.random.randint(0, 10, size=n)
    s = Series(letters.take(picks)).astype("category")
    s.isna()
    s.info(buf=StringIO())

    only_d = s[s == "d"]
    only_d.info(buf=StringIO())
def from_diff(cls, in_series: pd.Series, out_series: pd.Series) -> "ErrorCount":
    """
    Build an ``ErrorCount`` from values that became NA during a conversion.

    A row counts as an error when it is NA in ``out_series`` but was not NA
    in ``in_series``. With no such rows an empty ``ErrorCount()`` is
    returned; otherwise the result carries the column name, the first failing
    row, the input value at that row, the number of failing rows and ``1``.
    """
    was_present = ~in_series.isna()
    now_missing = out_series.isna()
    failed_rows = now_missing.index[now_missing & was_present]
    if failed_rows.empty:
        return ErrorCount()

    first_row = int(failed_rows[0])  # np.int64 => int
    first_value = in_series[first_row]  # always str
    return ErrorCount(
        in_series.name, first_row, first_value, len(failed_rows), 1
    )
def from_diff(cls, in_series: pd.Series, out_series: pd.Series) -> "ErrorCount":
    """
    Build an ``ErrorCount`` from values that became NA during a conversion.

    A row counts as an error when it is NA in ``out_series`` but was not NA
    in ``in_series``. With no such rows an empty ``ErrorCount()`` is
    returned; otherwise the result carries the column name, the first failing
    row as ``int``, the input value at that row as ``str``, the number of
    failing rows and ``1``.
    """
    was_present = ~in_series.isna()
    now_missing = out_series.isna()
    error_rows = now_missing.index[now_missing & was_present]
    if error_rows.empty:
        return ErrorCount()

    first = error_rows[0]
    first_value = in_series[first]
    return ErrorCount(
        in_series.name, int(first), str(first_value), len(error_rows), 1
    )
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # The categorical column has one unobserved category ('missing'); the
    # value column repeats the same labels as plain strings.
    values = ['d', 'a', 'b', 'a', 'd', 'b']
    label = Categorical(values,
                        categories=['a', 'b', 'missing', 'd'],
                        ordered=ordered)
    df = DataFrame({'label': label, 'val': Series(values)})

    # Aggregate on the Categorical.
    grouped = df.groupby('label', observed=observed, sort=sort)
    result = grouped['val'].aggregate('first')

    # With consistent ordering every index label equals its aggregated value,
    # except that with observed=False the 'missing' label aggregates to NA.
    label = Series(result.index.array, dtype='object')
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = 'missing'

    if all(label == aggr):
        return
    msg = ('Labels and aggregation results not consistently sorted\n' +
           'for (ordered={}, observed={}, sort={})\n' +
           'Result:\n{}').format(ordered, observed, sort, result)
    assert False, msg
def parse_datetime(
    x: pd.Series, format: str = "default"
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as datetimes.

    Because :class:`pd.Timestamp` is used, dates are limited to the range
    1677 - 2262
    (https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations).

    Arguments:
        x: Strings.
        format: Either 'default' (ISO8601: `YYYY-MM-DDThh:mm:ssZ`), 'any'
          (guess), or a pattern compatible with
          :meth:`datetime.datetime.strptime`.

    Returns:
        Either parsed datetimes (as :class:`pd.Timestamp`) or a parsing error.
    """
    named_patterns = {"default": "%Y-%m-%dT%H:%M:%S%z", "any": None}
    # Anything that is not a named format is used as the pattern itself.
    pattern = named_patterns.get(format, format)
    guess = pattern is None

    parsed = pd.to_datetime(
        x, errors="coerce", format=pattern, infer_datetime_format=guess
    )

    # Present in the input but coerced to NaT means the value did not parse.
    failed = parsed.isna() & ~x.isna()
    if not failed.any():
        return parsed
    return ValueTypeError(
        fieldType="datetime",
        fieldFormat=format,
        values=x[failed].unique().tolist(),
    )
def parse_boolean(
    x: pd.Series,
    trueValues: Iterable[str] = ("true", "True", "TRUE", "1"),
    falseValues: Iterable[str] = ("false", "False", "FALSE", "0"),
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as boolean.

    Arguments:
        x: Strings.
        trueValues: Strings representing `True`.
        falseValues: Strings representing `False`.

    Returns:
        Either parsed boolean (as :class:`pd.Int64Dtype`, 1 for true, 0 for
        false, NA for missing input) or a parsing error listing the distinct
        values that are neither true, false nor missing.
    """
    is_true = x.isin(trueValues)
    is_false = x.isin(falseValues)
    is_na = x.isna()

    invalid = ~(is_true | is_false | is_na)
    if invalid.any():
        invalids = x[invalid].unique().tolist()
        return ValueTypeError(fieldType="boolean", values=invalids)

    # Start from the "true" mask (false values become 0), then restore NA
    # for inputs that were missing.
    parsed = is_true.astype(int).astype("Int64")
    parsed[is_na] = np.nan
    return parsed
def encode(self, xs: pd.Series):
    """
    Fill the missing values of ``xs``.

    When the ``keep_original`` config flag is set, the NA mask of ``xs`` is
    stored on ``self.mask`` first. The fill value is the result of calling
    the Series method named by ``self.by`` (with no arguments) when
    ``self.by`` is set, otherwise ``self.value``.
    """
    if get_config('keep_original'):
        self.mask = xs.isna()

    fill = self.value if self.by is None else getattr(xs, self.by)()
    return xs.fillna(fill)
def sanitize_series(series: pd.Series) -> pd.Series:
    """
    Build a Series conforming to Workbench data types.

    The return value is a valid argument to `hash_pandas_object()` and can be
    written to a parquet file.

    Specific fixes:

    * Categorical with numeric categories: converted with `pd.to_numeric`.
    * Categorical with non-string categories: categories cast to str.
    * Categorical: unused categories removed.
    * Numeric and datetime64 series: returned as-is.
    * Anything else: non-NA values converted to str, NA kept as NaN.
    """
    if not hasattr(series, 'cat'):
        if is_numeric_dtype(series.dtype) or is_datetime64_dtype(series.dtype):
            return series
        # Convert all non-NA values to str, then put the NAs back.
        as_text = series.astype(str)
        as_text[series.isna()] = np.nan
        return as_text

    categories = series.cat.categories
    if pd.api.types.is_numeric_dtype(categories):
        # Un-categorize: make array of int/float.
        return pd.to_numeric(series)

    all_strings = (
        categories.dtype == object
        and pd.api.types.infer_dtype(categories) == 'string'
    )
    if not all_strings:
        # Cast non-string categories to str.
        series = series.cat.rename_categories(categories.astype(str))
    return series.cat.remove_unused_categories()
def test_getitem_boolean_empty():
    # Boolean-mask indexing of an empty Series keeps index name and dtype.
    empty = Series([], dtype=np.int64)
    empty.index.name = 'index_name'
    empty = empty[empty.isna()]
    assert empty.index.name == 'index_name'
    assert empty.dtype == np.int64

    # GH5877
    # indexing with an object Series of labels
    letters = Series(['A', 'B'])
    result = letters[Series(['C'], dtype=object)]
    assert_series_equal(result, Series(np.nan, index=['C'], dtype=object))

    # indexing with an empty object Series
    result = letters[Series([], dtype=object)]
    assert_series_equal(
        result, Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    msg = (r"Unalignable boolean Series provided as indexer \(index of"
           r" the boolean Series and of the indexed object do not match")
    for bad_indexer in (Series([], dtype=bool), Series([True], dtype=bool)):
        with pytest.raises(IndexingError, match=msg):
            letters[bad_indexer]
def parse_geopoint(
    x: pd.Series, format: Literal["default", "array", "object"] = "default"
) -> Union[pd.Series, ValueTypeError]:
    """
    Parse strings as geopoints.

    Arguments:
        x: Strings.
        format: Either 'default' ('<lon>,<lat>' or '<lon>, <lat>'),
          'array' ('[<lon>, <lat>]', whitespace-insensitive), or
          'object' ('{"lon": <lon>, "lat": <lat>}' or
          '{"lat": <lat>, "lon": <lon>}', whitespace-insensitive),
          where `<lon>` and `<lat>` are any values accepted by :class:`float`.

    Returns:
        Either parsed geopoints (as :class:`tuple`: lon, lat), reindexed like
        `x` so missing inputs stay missing, or a parsing error listing the
        distinct values that could not be parsed.

    Raises:
        KeyError: If `format` is not one of the supported names.
    """
    present = x[~x.isna()]
    extractors = {
        "default": _extract_geopoint_default,
        "array": _extract_geopoint_array,
        "object": _extract_geopoint_object,
    }
    # Only non-missing inputs are parsed; an NA result marks a failed parse.
    parsed = present.apply(extractors[format])
    invalid = parsed.isna()
    if invalid.any():
        invalids = present[invalid].unique().tolist()
        return ValueTypeError(
            fieldType="geopoint", fieldFormat=format, values=invalids
        )
    return parsed.reindex_like(x)
def validate_y(y: pd.Series) -> None:
    """
    Validates if input response variable is correct and doesn't contain
    invalid input.

    Args
    ----
      y: pd.Series
          Response variable sent in input data in first column.

    Raises
    ------
      ValueError: if all values in `y` are Null.
                  if there are fewer than 3 (three) non-null values in `y`
                      (in this case a model cannot even be trained).
                  if `y` is constant (there is little point in making
                      predictions when the series does not change during
                      training).
    """
    if np.all(y.isna()):
        raise ValueError('Input response cannot have just Null values.')
    # Exactly 3 non-null points are accepted, so the message says "at least".
    if y.notna().values.sum() < 3:
        raise ValueError('Input response must have at least 3 non-null '
                         'points.')
    if y.std(skipna=True, ddof=0) == 0:
        raise ValueError('Input response cannot be constant.')
def _predict_core(self, s: pd.Series) -> pd.Series: predicted = (s > (self.high if (self.high is not None) else float("inf"))) | ( s < (self.low if (self.low is not None) else -float("inf"))) predicted[s.isna()] = np.nan return predicted
def test_nanops_independent_of_mask_param(operation):
    # GH22764
    # Passing a precomputed NA mask must not change the operation's result.
    ser = Series([1, 2, np.nan, 3, np.nan, 4])
    without_mask = operation(ser)
    with_mask = operation(ser, mask=ser.isna())
    assert without_mask == with_mask
def test_getitem_boolean_empty():
    # Boolean-mask indexing of an empty Series keeps index name and dtype.
    empty = Series([], dtype=np.int64)
    empty.index.name = 'index_name'
    empty = empty[empty.isna()]
    assert empty.index.name == 'index_name'
    assert empty.dtype == np.int64

    # GH5877
    # indexing with an object Series of labels
    letters = Series(['A', 'B'])
    result = letters[Series(['C'], dtype=object)]
    assert_series_equal(result, Series(np.nan, index=['C'], dtype=object))

    # indexing with an empty object Series
    result = letters[Series([], dtype=object)]
    assert_series_equal(
        result, Series(dtype=object, index=Index([], dtype='int64')))

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    with pytest.raises(IndexingError):
        letters[Series([], dtype=bool)]

    with pytest.raises(IndexingError):
        letters[Series([True], dtype=bool)]
def _get_pdf(series: pd.Series): s = series.loc[series.notna()] return pd.concat([ series.loc[series.isna()], pd.Series(stats.norm.pdf(s), index=s.index) ], verify_integrity=True)
def test_getitem_boolean_empty(self):
    # Boolean-mask indexing of an empty Series keeps index name and dtype.
    empty = Series([], dtype=np.int64)
    empty.index.name = "index_name"
    empty = empty[empty.isna()]
    assert empty.index.name == "index_name"
    assert empty.dtype == np.int64

    # GH#5877
    # indexing with empty series
    letters = Series(["A", "B"])
    result = letters[Series([], dtype=object)]
    tm.assert_series_equal(
        result, Series(dtype=object, index=Index([], dtype="int64"))
    )

    # invalid because of the boolean indexer
    # that's empty or not-aligned
    msg = (
        r"Unalignable boolean Series provided as indexer \(index of "
        r"the boolean Series and of the indexed object do not match"
    )
    for bad_indexer in (Series([], dtype=bool), Series([True], dtype=bool)):
        with pytest.raises(IndexingError, match=msg):
            letters[bad_indexer]
def test_isna(self):
    # GH 13737
    # A NaT period is reported as missing by isna / present by notna.
    periods = [pd.Period('2011-01', freq='M'), pd.Period('NaT', freq='M')]
    ser = Series(periods)
    tm.assert_series_equal(ser.isna(), Series([False, True]))
    tm.assert_series_equal(ser.notna(), Series([True, False]))
def create_model(self, prior: R.SdPrior, data: pd.Series):
    """
    Build the state space model for ``data`` and attach its samplers.

    Args:
      prior: an R.SdPrior object describing the prior distribution on the
        residual variance parameter.  If None, a prior is built from the
        standard deviation of ``data`` (see below).
      data: The time series of observations as a Pandas Series.

    Returns:
      A boom.StateSpaceModel object.  The same object is stored on
      ``self._model``, and ``data`` is stored on ``self._original_series``.
    """
    boom_data = boom.Vector(data.values)
    # Entries that are NaN in `data` are marked as not observed.
    is_observed = ~data.isna()
    self._model = boom.StateSpaceModel(boom_data, is_observed)

    if prior is None:
        # Default prior: guess sigma from the data, and cap it at 1.2 times
        # that guess.
        sdy = np.std(data)
        prior = R.SdPrior(sigma_guess=sdy, upper_limit=sdy * 1.2)

    boom_prior = boom.ChisqModel(prior.sample_size, prior.sigma_guess)
    # Sampler for the observation model, limited by the prior's upper limit.
    observation_model_sampler = boom.ZeroMeanGaussianConjSampler(
        self._model.observation_model, boom_prior)
    observation_model_sampler.set_sigma_upper_limit(
        prior.upper_limit)
    self._model.observation_model.set_method(observation_model_sampler)

    # Sampler for the model as a whole, using the global random generator.
    sampler = boom.StateSpacePosteriorSampler(
        self._model, boom.GlobalRng.rng)
    self._model.set_method(sampler)

    self._original_series = data

    return self._model
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # The categorical column has one unobserved category ('missing'); the
    # value column repeats the same labels as plain strings.
    values = ["d", "a", "b", "a", "d", "b"]
    label = Categorical(
        values, categories=["a", "b", "missing", "d"], ordered=ordered
    )
    df = DataFrame({"label": label, "val": Series(values)})

    # Aggregate on the Categorical.
    grouped = df.groupby("label", observed=observed, sort=sort)
    result = grouped["val"].aggregate("first")

    # With consistent ordering every index label equals its aggregated value,
    # except that with observed=False the 'missing' label aggregates to NA.
    label = Series(result.index.array, dtype="object")
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = "missing"

    if all(label == aggr):
        return
    msg = ("Labels and aggregation results not consistently sorted\n"
           f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
           f"Result:\n{result}")
    assert False, msg
def process_labels(labels: pd.Series, delim="|"):
    """
    Normalize a label column to a single label per row.

    Args:
        labels: Series of labels. Any other value (including ``None``) is
            returned unchanged.
        delim: Literal delimiter separating multiple labels in one cell.

    Returns:
        A Series in which missing labels are replaced by the string
        ``"None"`` and, for object-dtype input containing ``delim``, only the
        text before the first ``delim`` is kept (as ``str``). The input
        Series is not modified.
    """
    if not isinstance(labels, pd.Series):
        return labels

    if labels.isna().any():
        # Work on a filled copy so the caller's Series is left untouched.
        labels = labels.fillna("None")

    # regex=False: the delimiter is literal text. As a regex, "|" matches
    # every string and characters such as "(" raise a pattern error.
    if labels.dtype == "object" and labels.str.contains(delim, regex=False).any():
        labels = labels.str.split(delim, expand=True)[0].astype(str)
    return labels
def test_isna(self):
    # GH 13737
    # isna and notna on a period Series must be exact complements.
    ser = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")])
    missing = Series([False, True])
    tm.assert_series_equal(ser.isna(), missing)
    tm.assert_series_equal(ser.notna(), ~missing)
def get_errors(self, series: pd.Series, column: 'column.Column'):
    """
    Collect a ``ValidationWarning`` for every entry that fails validation.

    ``self.validate(series)`` supplies a per-entry pass mask. When
    ``column.allow_empty`` is set, empty entries are never reported: NaN for
    numeric dtypes, zero-length strings otherwise.

    Returns:
        A list of ``ValidationWarning`` objects, one per failing entry, each
        carrying ``self.message``, the failing value, its index label (which
        is probably a row number) and the series name.
    """
    failed = ~self.validate(series)

    if column.allow_empty:
        # Only non-empty entries can fail.
        if np.issubdtype(series.dtype, np.number):
            non_empty = ~series.isna()
        else:
            non_empty = series.str.len() > 0
        failed = non_empty & failed

    return [
        ValidationWarning(message=self.message,
                          value=series[label],
                          row=label,
                          column=series.name)
        for label in series.index[failed]
    ]
def disorder_mapping(X: Series,
                     y: Series,
                     bad_y: Any = 1,
                     null_value: List = None,
                     flag: int = 0) -> Dict:
    """
    Encode an unordered (categorical) variable.

    Each distinct value of ``X`` is mapped to a bin index obtained by
    ordering the values by the result of ``woe_single_all`` and cutting that
    order at the split thresholds of a small decision tree fitted against
    ``y``.

    :param X: variable data
    :param y: label data
    :param bad_y: label value that marks a bad sample
    :param null_value: values treated as missing (only used when flag == 1)
    :param flag: whether to take ``null_value`` into account; when 1, rows
        whose value is in ``null_value`` are dropped in addition to NaN rows
    :return: dict mapping each distinct value of ``X`` to its bin index
    """
    # Drop missing rows from both X and y.
    missing = X.isna()
    if flag == 1:
        missing = X.isin(null_value) | missing
    keep = ~missing
    X = X[keep]
    y = y[keep]

    # Overall bad / good counts.
    total_bad = (y == bad_y).sum()
    total_good = y.size - total_bad

    # Per-value bad / good counts from a (n_unique, n_rows) membership matrix.
    categories = X.unique()
    membership = categories.reshape(-1, 1) == X.values
    bad_per_value = (membership & (y.values == bad_y)).sum(axis=1)
    good_per_value = membership.sum(axis=1) - bad_per_value

    woe_value = np.around(
        woe_single_all(total_bad, total_good, bad_per_value, good_per_value),
        6)
    woe_order = np.argsort(woe_value)
    X = X.map(dict(zip(categories, woe_order)))

    tree = DecisionTreeClassifier(
        max_leaf_nodes=6,
        min_samples_leaf=max(int(X.size * 0.05), 50))
    tree.fit(X.values.reshape(-1, 1), y)

    # Bin edges: the tree's split thresholds, open-ended on both sides.
    split_points = np.sort(
        tree.tree_.threshold[tree.tree_.feature == 0]).tolist()
    threshold = [-inf, *split_points, inf]

    index = pd.cut(woe_order,
                   threshold,
                   right=True,
                   include_lowest=True,
                   labels=False)
    return dict(zip(categories.tolist(), index.tolist()))
def slim_col(
    x: pd.Series,
    nan_replacements: Tuple[int, str] = (-99, "blank"),
    inplace: bool = False,
) -> pd.Series:
    """
    Shrink a single column to a smaller dtype.

    Numeric columns holding only whole numbers are cast to the narrowest
    integer type whose limits strictly contain the column's min and max (a
    value equal to a type's limit selects the next wider type). String,
    categorical and object columns are cast to ``category``. NaN values are
    replaced first, using the numeric or the categorical replacement.

    :param x: data column
    :param nan_replacements: column nan replacements, including numeric nan
        and category nan values, i.e. -99 or 'blank'
    :param inplace: whether NaN replacement is applied to ``x`` in place
        (default is False)
    :return: slimmed column
    :raises OverflowError: if no signed integer type fits a whole-number
        column that contains negative values
    """
    num_nan, cat_nan = nan_replacements
    origin_type = x.dtype.name
    x_has_nan = x.isna().values.any()  # True when the series contains NaN

    def _fill(series, value):
        # Replace NaN either in place or on a copy, depending on `inplace`.
        filled = series.fillna(value=value, inplace=inplace)
        return series if inplace else filled

    if is_numeric_dtype(x):
        if x_has_nan:
            x = _fill(x, num_nan)
        # Any NaN present has already been replaced by num_nan above.
        as_int = x.fillna(0).astype(np.int64)
        is_int = np.allclose(x, as_int)
        mn, mx = x.min(), x.max()
        if is_int:
            if mn >= 0:
                unsigned_limits = (
                    (255, np.uint8),
                    (65535, np.uint16),
                    (4294967295, np.uint32),
                )
                for upper, dtype in unsigned_limits:
                    if mx < upper:
                        x = x.astype(dtype)
                        break
                else:
                    x = x.astype(np.uint64)
            else:
                signed_limits = (
                    (-128, 127, np.int8),
                    (-32768, 32767, np.int16),
                    (-2147483648, 2147483647, np.int32),
                    (-9223372036854775808, 9223372036854775807, np.int64),
                )
                for lower, upper, dtype in signed_limits:
                    if lower < mn and mx < upper:
                        x = x.astype(dtype)
                        break
                else:
                    raise OverflowError("Integer overflow encountered!")
    # Categorical-like data: fill NaN with the category replacement and store
    # as `category`; converting a string variable to a categorical variable
    # will save memory.
    elif is_string_dtype(x) or is_categorical_dtype(x) or is_object_dtype(x):
        if x_has_nan:
            x = _fill(x, cat_nan)
        x = x.astype("category")

    logger.info("Column {} data type changes from `{}` to `{}`.".format(
        repr(x.name), origin_type, x.dtype.name))
    return x
def __init__(self, column: pd.Series):
    """
    Wrap ``column`` and record summary statistics about it.

    Args:
        column: The data column to describe.
    """
    self.data = column
    # The column name is always stored as a string.
    self.name = str(column.name)
    # NOTE(review): get_type() is defined elsewhere; presumably it reads
    # self.data, so it is called after the assignments above -- confirm.
    self.type = self.get_type()
    self.count = column.size
    # NaN counts as its own distinct value here (dropna=False).
    self.count_distinct = column.nunique(dropna=False)
    self.count_null = column.isna().sum()
    self.max_groups_allowed = 20  # for group by operations
def get_shift_digit(s: pd.Series) -> pd.Series:
    """
    Return a column holding the change of each value.

    Each entry is the difference between that value and the previous one;
    the first entry is NaN. The Close columns must be cleaned of NaN before
    calling this function: an ``AssertionError`` is raised otherwise.
    """
    assert not s.isna().any(), "Необходимо очистить колонку от NaN"
    step_to_next = s.shift(-1) - s
    return step_to_next.shift(1)
def test_isna_for_inf(self):
    # With mode.use_inf_as_na enabled, inf is treated like NaN by both
    # isna and dropna.
    ser = Series(['a', np.inf, np.nan, 1.0])
    with pd.option_context('mode.use_inf_as_na', True):
        flags = ser.isna()
        kept = ser.dropna()

    tm.assert_series_equal(flags, Series([False, True, True, False]))
    tm.assert_series_equal(kept, Series(['a', 1.0], index=[0, 3]))
def test_constructor_dtype_str_na_values(self, string_dtype):
    # https://github.com/pandas-dev/pandas/issues/21083
    # None stays None (and is reported as NA) under a string dtype.
    with_none = Series(['x', None], dtype=string_dtype)
    tm.assert_series_equal(with_none.isna(), Series([False, True]))
    assert with_none.iloc[1] is None

    # NaN stays NaN under a string dtype.
    with_nan = Series(['x', np.nan], dtype=string_dtype)
    assert np.isnan(with_nan.iloc[1])
def test_bool_operators_with_nas(self, bool_op):
    # boolean &, |, ^ should work with object arrays and propagate NAs
    dates = Series(bdate_range('1/1/2000', periods=10), dtype=object)
    dates[::2] = np.nan

    na_mask = dates.isna()
    filled = dates.fillna(dates[0])

    result = bool_op(dates < dates[9], dates > dates[3])

    # Same comparison on the filled data, with the NA positions forced False.
    expected = bool_op(filled < filled[9], filled > filled[3])
    expected[na_mask] = False
    assert_series_equal(result, expected)
def test_isnull_for_inf_deprecated(self):
    # gh-17115
    # The old option name still works but emits a DeprecationWarning.
    ser = Series(['a', np.inf, np.nan, 1.0])
    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        pd.set_option('mode.use_inf_as_null', True)
        flags = ser.isna()
        kept = ser.dropna()
        pd.reset_option('mode.use_inf_as_null')

    tm.assert_series_equal(flags, Series([False, True, True, False]))
    tm.assert_series_equal(kept, Series(['a', 1.0], index=[0, 3]))
def test_isna(self):
    """``Series.isna`` flags NaN in both float and object data."""
    # The comparison results were previously discarded, so the test could
    # never fail; they are asserted now.
    ser = Series([0, 5.4, 3, nan, -0.001])
    assert np.array_equal(
        ser.isna(), Series([False, False, False, True, False]).values
    )

    ser = Series(["hi", "", nan])
    assert np.array_equal(ser.isna(), Series([False, False, True]).values)
def test_isna(self):
    # GH 13737
    # A NaT period is reported as missing by isna / present by notna.
    ser = Series([pd.Period('2011-01', freq='M'),
                  pd.Period('NaT', freq='M')])
    checks = (('isna', [False, True]), ('notna', [True, False]))
    for method, flags in checks:
        tm.assert_series_equal(getattr(ser, method)(), Series(flags))