def test_fillna_consistency(self):
    """Filling a tz-naive datetime Series must upcast consistently (GH 16402).

    Covers ``fillna`` and ``where`` with a tz-aware Timestamp (object-dtype
    result) as well as ``fillna`` and item assignment with a non-datetime
    value.
    """
    s = Series([Timestamp('20130101'), pd.NaT])

    # fillna with a tz aware to a tz-naive, should result in object
    filled = s.fillna(Timestamp('20130101', tz='US/Eastern'))
    expected = Series(
        [Timestamp('20130101'), Timestamp('2013-01-01', tz='US/Eastern')],
        dtype='object')
    assert_series_equal(filled, expected)

    # where (we ignore the errors=); the same check is performed twice
    for _ in range(2):
        replaced = s.where([True, False],
                           Timestamp('20130101', tz='US/Eastern'),
                           errors='ignore')
        assert_series_equal(replaced, expected)

    # with a non-datetime
    expected = Series([Timestamp('20130101'), 'foo'])
    assert_series_equal(s.fillna('foo'), expected)

    # assignment
    assigned = s.copy()
    assigned[1] = 'foo'
    assert_series_equal(assigned, expected)
def test_mask():
    """``Series.mask`` must mirror ``Series.where`` on the inverted condition."""
    # compare with tested results in test_where
    s = Series(np.random.randn(5))
    positive = s > 0

    assert_series_equal(s.where(~positive, np.nan), s.mask(positive))
    assert_series_equal(s.where(~positive), s.mask(positive))
    assert_series_equal(s.where(~positive, -s), s.mask(positive, -s))

    # a condition covering only the first three labels
    flags = Series([True, False, False, True, False], index=s.index)
    negative = -(s.abs())
    head = flags[:3]

    assert_series_equal(negative.where(~head), negative.mask(head))
    assert_series_equal(negative.where(~head, -negative),
                        negative.mask(head, -negative))

    # invalid conditions are rejected
    with pytest.raises(ValueError):
        s.mask(1)
    short_cond = flags[:3].values
    replacement = -s
    with pytest.raises(ValueError):
        s.mask(short_cond, replacement)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    expected = Series([1, 2, np.nan, np.nan])
    assert_series_equal(ints.mask(ints > 2, np.nan), expected)
def test_where():
    """Exercise ``Series.where`` with a mask, a replacement and a partial mask."""
    s = Series(np.random.randn(5))
    is_positive = s > 0

    # Without `other`, dropping the NaNs matches plain boolean indexing.
    kept = s.where(is_positive).dropna()
    assert_series_equal(kept, s[is_positive])

    # Substituting the negated values yields the absolute values.
    assert_series_equal(s.where(is_positive, -s), s.abs())

    # The shape is preserved and a new object is returned.
    masked = s.where(is_positive)
    assert s.shape == masked.shape
    assert masked is not s

    # test alignment
    flags = Series([True, False, False, True, False], index=s.index)
    negative = -(s.abs())
    head = flags[:3]

    expected = negative[flags].reindex(negative.index[:3]).reindex(negative.index)
    assert_series_equal(negative.where(head), expected)

    expected = negative.abs()
    expected.iloc[0] = negative[0]
    assert_series_equal(negative.where(head, -negative), expected)
def test_where_raise_on_error_deprecation():
    """``raise_on_error`` must emit a FutureWarning for where and mask."""
    # gh-14968
    # deprecation of raise_on_error
    s = Series(np.random.randn(5))
    cond = s > 0

    for method in (s.where, s.mask):
        with tm.assert_produces_warning(FutureWarning):
            method(cond, raise_on_error=True)
def test_where_invalid_input(cond):
    """Non-boolean or wrongly shaped conditions must raise ValueError."""
    # see gh-15414: only boolean arrays accepted
    s = Series([1, 2, 3])

    cases = [
        (cond, "Boolean array expected for the condition"),
        ([True], "Array conditional must be same shape as self"),
    ]
    for bad_cond, msg in cases:
        with pytest.raises(ValueError, match=msg):
            s.where(bad_cond)
def test_where_inplace():
    """``where(..., inplace=True)`` must match the non-inplace result."""
    s = Series(np.random.randn(5))
    is_positive = s > 0

    # without a replacement value
    target = s.copy()
    target.where(is_positive, inplace=True)
    assert_series_equal(target.dropna(), s[is_positive])
    assert_series_equal(target, s.where(is_positive))

    # with a replacement value
    target = s.copy()
    target.where(is_positive, -s, inplace=True)
    assert_series_equal(target, s.where(is_positive, -s))
def test_where_array_like(klass):
    """``where`` accepts a boolean condition wrapped in container ``klass``."""
    # see gh-15414
    flags = [False, True, True]
    expected = Series([np.nan, 2, 3])

    result = Series([1, 2, 3]).where(klass(flags))
    assert_series_equal(result, expected)
def test_mask():
    """``Series.mask`` must mirror ``Series.where`` on the inverted condition."""
    # compare with tested results in test_where
    s = Series(np.random.randn(5))
    positive = s > 0

    assert_series_equal(s.where(~positive, np.nan), s.mask(positive))
    assert_series_equal(s.where(~positive), s.mask(positive))
    assert_series_equal(s.where(~positive, -s), s.mask(positive, -s))

    # a condition covering only the first three labels
    flags = Series([True, False, False, True, False], index=s.index)
    negative = -(s.abs())
    head = flags[:3]

    assert_series_equal(negative.where(~head), negative.mask(head))
    assert_series_equal(negative.where(~head, -negative),
                        negative.mask(head, -negative))

    # invalid conditions are rejected with a shape error
    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        s.mask(1)
    with pytest.raises(ValueError, match=msg):
        s.mask(flags[:3].values, -s)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    expected = Series([1, 2, np.nan, np.nan])
    assert_series_equal(ints.mask(ints > 2, np.nan), expected)

    # see gh-21891
    pair = Series([1, 2])
    exp = Series([np.nan, 2])
    tm.assert_series_equal(pair.mask([True, False]), exp)
def test_where_unsafe():
    """Boolean-mask assignment and ``where`` with dtype-changing values."""
    # see gh-9731
    s = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = s > 5
    expected = Series(list(range(6)) + floats, dtype="float64")
    s[upper] = floats
    assert_series_equal(s, expected)

    # see gh-3235
    s = Series(np.arange(10), dtype='int64')
    lower = s < 5
    s[lower] = list(range(2, 7))
    expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype='int64')
    assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype='int64')
    upper = s > 5
    s[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(s, expected)

    # the mask selects four entries, so five values must be rejected
    s = Series(np.arange(10))
    upper = s > 5
    for bad_values in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError):
            s[upper] = bad_values

    # dtype changes
    s = Series([1, 2, 3, 4])
    expected = Series([np.nan, np.nan, 3, 4])
    assert_series_equal(s.where(s > 2, np.nan), expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    assert isna(s[8])

    s = Series(range(10)).astype(float)
    s[s > 8] = None
    expected = Series(np.nan, index=[9])
    assert_series_equal(s[isna(s)], expected)
def test_where_timedelta_coerce():
    """Replacing every timedelta value with numbers must coerce the dtype."""
    s = Series([1, 2], dtype='timedelta64[ns]')
    none_kept = np.array([False, False])

    # list and scalar replacements, integer and float, give the same result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        assert_series_equal(s.where(none_kept, other), expected)

    # a NaN among the replacements leads to object dtype
    rs = s.where(none_kept, [10.0, np.nan])
    expected = Series([10, None], dtype='object')
    assert_series_equal(rs, expected)
def consecutive_wins_losses(self):
    '''
    Calculates the positive and negative runs in the trade series.

    Trades are ordered by their 'exit' column and classified by the sign of
    'base_return'. Returns a tuple (positive_runs, negative_runs) of Series
    holding the length of each consecutive run.
    '''
    def run_lengths(flags):
        # Pad the 1/0 indicator with zeros on both sides so that every run
        # has a rising edge (start) and a falling edge (end).
        padded = Series(hstack(([0], (flags * 1).values, [0])))
        step = padded.diff()
        starts = Series(padded.where(step > 0).dropna().index.tolist())
        ends = Series(padded.where(step < 0).dropna().index.tolist())
        return ends - starts

    trades = self.as_dataframe().sort_values(by='exit')
    outcome = sign(trades.base_return)
    return (run_lengths(outcome > 0), run_lengths(outcome < 0))
def test_where_unsafe():
    """Boolean-mask assignment and ``where`` with dtype-changing values."""
    # see gh-9731
    s = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = s > 5
    expected = Series(list(range(6)) + floats, dtype="float64")
    s[upper] = floats
    tm.assert_series_equal(s, expected)

    # see gh-3235
    s = Series(np.arange(10), dtype="int64")
    lower = s < 5
    s[lower] = range(2, 7)
    expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64")
    tm.assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype="int64")
    upper = s > 5
    s[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64")
    tm.assert_series_equal(s, expected)

    # the mask selects four entries, so five values must be rejected
    s = Series(np.arange(10))
    upper = s > 5
    msg = "cannot set using a list-like indexer with a different length than the value"
    for bad_values in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError, match=msg):
            s[upper] = bad_values

    # dtype changes
    s = Series([1, 2, 3, 4])
    expected = Series([np.nan, np.nan, 3, 4])
    tm.assert_series_equal(s.where(s > 2, np.nan), expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    assert isna(s[8])

    s = Series(range(10)).astype(float)
    s[s > 8] = None
    expected = Series(np.nan, index=[9])
    tm.assert_series_equal(s[isna(s)], expected)
def test_where_unsafe():
    """Boolean-mask assignment and ``where`` with dtype-changing values."""
    # see gh-9731
    s = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = s > 5
    expected = Series(list(range(6)) + floats, dtype="float64")
    s[upper] = floats
    assert_series_equal(s, expected)

    # see gh-3235
    s = Series(np.arange(10), dtype='int64')
    lower = s < 5
    s[lower] = list(range(2, 7))
    expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype='int64')
    assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype='int64')
    upper = s > 5
    s[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(s, expected)

    # the mask selects four entries, so five values must be rejected
    s = Series(np.arange(10))
    upper = s > 5
    msg = "cannot assign mismatch length to masked array"
    for bad_values in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError, match=msg):
            s[upper] = bad_values

    # dtype changes
    s = Series([1, 2, 3, 4])
    expected = Series([np.nan, np.nan, 3, 4])
    assert_series_equal(s.where(s > 2, np.nan), expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    assert isna(s[8])

    s = Series(range(10)).astype(float)
    s[s > 8] = None
    expected = Series(np.nan, index=[9])
    assert_series_equal(s[isna(s)], expected)
def test_where_datetime_conversion():
    """Replacing every datetime value with numbers must coerce the dtype."""
    s = Series(date_range("20130102", periods=2))
    none_kept = np.array([False, False])

    # list and scalar replacements, integer and float, give the same result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        tm.assert_series_equal(s.where(none_kept, other), expected)

    # a NaN among the replacements leads to object dtype
    rs = s.where(none_kept, [10.0, np.nan])
    expected = Series([10, None], dtype="object")
    tm.assert_series_equal(rs, expected)

    # GH 15701
    timestamps = [
        "2016-12-31 12:00:04+00:00",
        "2016-12-31 12:00:04.010000+00:00",
    ]
    aware = Series([Timestamp(stamp) for stamp in timestamps])
    rs = aware.where(Series([False, True]))
    expected = Series([pd.NaT, aware[1]])
    tm.assert_series_equal(rs, expected)
def test_where_datetime_conversion():
    """Replacing every datetime value with numbers must coerce the dtype."""
    s = Series(date_range('20130102', periods=2))
    none_kept = np.array([False, False])

    # list and scalar replacements, integer and float, give the same result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        assert_series_equal(s.where(none_kept, other), expected)

    # a NaN among the replacements leads to object dtype
    rs = s.where(none_kept, [10.0, np.nan])
    expected = Series([10, None], dtype='object')
    assert_series_equal(rs, expected)

    # GH 15701
    timestamps = ['2016-12-31 12:00:04+00:00',
                  '2016-12-31 12:00:04.010000+00:00']
    aware = Series([pd.Timestamp(stamp) for stamp in timestamps])
    rs = aware.where(Series([False, True]))
    expected = Series([pd.NaT, aware[1]])
    assert_series_equal(rs, expected)
def test_where_dt_tz_values(tz_naive_fixture):
    """``where`` between two datetime Series sharing the fixture's timezone."""
    def make_series(dates):
        # every Series in this test uses the timezone supplied by the fixture
        return Series(pd.DatetimeIndex(dates, tz=tz_naive_fixture))

    ser1 = make_series(["20150101", "20150102", "20150103"])
    ser2 = make_series(["20160514", "20160515", "20160516"])
    keep = Series([True, True, False])

    result = ser1.where(keep, ser2)

    exp = make_series(["20150101", "20150102", "20160516"])
    tm.assert_series_equal(exp, result)
def test_where_with_numeric_data_and_other(data, other):
    """Sparse ``where`` with a replacement must agree with the dense result."""
    # GH 17386
    threshold = 1.5

    sparse = SparseSeries(data)
    result = sparse.where(sparse > threshold, other)

    # reference result computed on the dense equivalent
    dense = Series(data)
    dense_expected = dense.where(dense > threshold, other)
    sparse_expected = SparseSeries(dense_expected, fill_value=other)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def _drop_duplicated_marker(self, marker_column: pd.Series,
                            start: bool = True):
    """Modify marker column to keep only first start marker or last end marker.

    Values other than ``self.marker_start`` / ``self.marker_end`` are treated
    as noise when looking for the neighbouring marker. Dropped markers are
    replaced by NaN; all other values are passed through.

    Parameters
    ----------
    marker_column: pd.Series
        Values for which duplicated markers will be removed.
    start: bool, optional
        Indicate which duplicates should be dropped. If True, only first
        start marker is kept. If False, only last end marker is kept.

    Returns
    -------
    dropped: pd.Series
    """
    valid_values = [self.marker_start, self.marker_end]
    denoised = marker_column.where(marker_column.isin(valid_values))

    if start:
        # look backwards: the closest preceding marker decides
        fill = denoised.ffill()
        # use the configured marker value instead of a hard-coded 1 so the
        # method also works when the markers are not 1 and 2
        marker = self.marker_start
        shift = 1
    else:
        # look forwards: the closest following marker decides
        fill = denoised.bfill()
        marker = self.marker_end
        shift = -1

    # Marker seen at the neighbouring position, kept only where the current
    # position itself lies in a stretch of the marker being de-duplicated.
    neighbour = fill.shift(shift)
    neighbour_same_kind = neighbour.where(fill.eq(marker))

    # A marker whose neighbour is already the same marker is a duplicate.
    mask_drop = (neighbour_same_kind == marker_column)
    dropped = marker_column.where(~mask_drop)
    return dropped
def test_where_with_numeric_data(data):
    """Sparse ``where`` without a replacement must agree with the dense result."""
    # GH 17386
    threshold = 1.5

    sparse = SparseSeries(data)
    result = sparse.where(sparse > threshold)

    # reference result computed on the dense equivalent
    dense = Series(data)
    dense_expected = dense.where(dense > threshold)
    sparse_expected = SparseSeries(dense_expected)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments must raise."""
    s = Series(np.random.randn(5))
    positive = s > 0

    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(positive[:3].values, -s)

    # GH 2745
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    tm.assert_series_equal(pair, Series([0, 2]))

    # failures
    length_msg = "cannot set using a list-like indexer with a different length than the value"
    for bad_values in ([0, 2, 3], []):
        with pytest.raises(ValueError, match=length_msg):
            pair[[True, False]] = bad_values
def test_where_with_bool_data_and_other(other):
    """Sparse boolean ``where`` with a replacement must agree with dense."""
    # GH 17386
    values = [False, False, True, True, False, False]
    wanted = True

    sparse = SparseSeries(values)
    result = sparse.where(sparse == wanted, other)

    # reference result computed on the dense equivalent
    dense = Series(values)
    dense_expected = dense.where(dense == wanted, other)
    sparse_expected = SparseSeries(dense_expected, fill_value=other)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_where_with_bool_data():
    """Sparse boolean ``where`` without a replacement must agree with dense."""
    # GH 17386
    values = [False, False, True, True, False, False]
    wanted = True

    sparse = SparseSeries(values)
    result = sparse.where(sparse == wanted)

    # reference result computed on the dense equivalent
    dense = Series(values)
    dense_expected = dense.where(dense == wanted)
    sparse_expected = SparseSeries(dense_expected)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_fillna_consistency(self):
    """Filling a tz-naive datetime Series must upcast consistently (GH#16402).

    Covers ``fillna`` and ``where`` with a tz-aware Timestamp (object-dtype
    result) as well as ``fillna`` and item assignment with a non-datetime
    value.
    """
    ser = Series([Timestamp("20130101"), NaT])

    # fillna with a tz aware to a tz-naive, should result in object
    filled = ser.fillna(Timestamp("20130101", tz="US/Eastern"))
    expected = Series(
        [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")],
        dtype="object",
    )
    tm.assert_series_equal(filled, expected)

    # where (we ignore the errors=); the same check is performed twice and
    # each call has to emit the FutureWarning
    msg = "The 'errors' keyword in "
    for _ in range(2):
        with tm.assert_produces_warning(FutureWarning, match=msg):
            replaced = ser.where(
                [True, False],
                Timestamp("20130101", tz="US/Eastern"),
                errors="ignore",
            )
        tm.assert_series_equal(replaced, expected)

    # with a non-datetime
    expected = Series([Timestamp("20130101"), "foo"])
    tm.assert_series_equal(ser.fillna("foo"), expected)

    # assignment
    assigned = ser.copy()
    assigned[1] = "foo"
    tm.assert_series_equal(assigned, expected)
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments must raise."""
    s = Series(np.random.randn(5))
    positive = s > 0

    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(positive[:3].values, -s)

    # GH 2745
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    assert_series_equal(pair, Series([0, 2]))

    # failures
    too_long_msg = "cannot assign mismatch length to masked array"
    empty_msg = ("NumPy boolean array indexing assignment cannot assign 0 input"
                 " values to the 1 output values where the mask is true")
    failures = [([0, 2, 3], too_long_msg), ([], empty_msg)]
    for bad_values, msg in failures:
        with pytest.raises(ValueError, match=msg):
            pair[[True, False]] = bad_values
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments must raise."""
    s = Series(np.random.randn(5))
    positive = s > 0

    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(positive[:3].values, -s)

    # GH 2745
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    tm.assert_series_equal(pair, Series([0, 2]))

    # failures
    too_long_msg = "cannot assign mismatch length to masked array"
    empty_msg = ("NumPy boolean array indexing assignment cannot assign 0 input "
                 "values to the 1 output values where the mask is true")
    failures = [([0, 2, 3], too_long_msg), ([], empty_msg)]
    for bad_values, msg in failures:
        with pytest.raises(ValueError, match=msg):
            pair[[True, False]] = bad_values
def test_where_numeric_with_string():
    """String replacements in an integer Series give an object-dtype result."""
    # GH 9280
    s = Series([1, 2, 3])

    # scalar, list and ndarray replacements must all behave the same way
    replacements = ["X", ["X", "Y", "Z"], np.array(["X", "Y", "Z"])]
    for other in replacements:
        w = s.where(s > 1, other)

        # only the first entry fails the condition and gets replaced
        assert not is_integer(w[0])
        assert is_integer(w[1])
        assert is_integer(w[2])
        assert isinstance(w[0], str)
        assert w.dtype == "object"
def imputation_normal_distribution(log_intensities: pd.Series,
                                   mean_shift=IMPUTATION_MEAN_SHIFT,
                                   std_shrinkage=IMPUTATION_STD_SHRINKAGE):
    """Impute missing log-transformed intensity values of DDA run.

    Missing entries are replaced by independent draws from a normal
    distribution whose mean is shifted downwards and whose standard deviation
    is rescaled relative to the observed values. NumPy's global random state
    is re-seeded with ``RANDOM_SEED`` on every call.

    Parameters
    ----------
    log_intensities: pd.Series
        Series of normally distributed values. Here usually log-transformed
        protein intensities.
    mean_shift: integer, float
        Shift the mean of the log_intensities by factors of their standard
        deviation to the negative.
    std_shrinkage: float
        Value greater than zero by which to shrink (or inflate) the
        standard deviation of the log_intensities.

    Returns
    -------
    pd.Series
        Copy of ``log_intensities`` in which missing values are imputed.

    Raises
    ------
    ValueError
        If the data cannot be converted to a Series, if ``mean_shift`` is
        negative or if ``std_shrinkage`` is not positive.
    """
    np.random.seed(RANDOM_SEED)
    if not isinstance(log_intensities, pd.Series):
        try:
            # array-like, Iterable, dict, or scalar value?
            log_intensities = pd.Series(log_intensities)
            logger.warning("Series created of Iterable.")
        except Exception as e:
            # chain the cause so the original traceback is not lost
            raise ValueError(
                "Plese provided data which is a pandas.Series or an Iterable",
                e) from e
    if mean_shift < 0:
        raise ValueError(
            "Please specify a positive float as the std.-dev. is non-negative."
        )
    if std_shrinkage <= 0:
        raise ValueError(
            "Please specify a positive float as shrinkage factor for std.-dev."
        )
    if std_shrinkage >= 1:
        logger.warning("Standard Deviation will increase for imputed values.")

    mean = log_intensities.mean()
    std = log_intensities.std()

    mean_shifted = mean - (std * mean_shift)
    std_shrinked = std * std_shrinkage

    # Draw one value per position. Without `size` a single scalar is drawn and
    # every missing entry would receive the very same number, which defeats
    # the purpose of sampling with a standard deviation. A plain ndarray is
    # used so that no index alignment takes place.
    draws = np.random.normal(mean_shifted, std_shrinked,
                             size=len(log_intensities))
    return log_intensities.where(log_intensities.notna(), draws)
def encode_X(self, X: pd.Series):
    """Turn a Series of texts into padded token-id sequences.

    Returns None (after logging an error) when no tokenizer is set;
    otherwise the result of ``sequence.pad_sequences`` with
    ``maxlen=self.max_text_len``.
    """
    log = self.logger
    if self.tokenizer is None:
        log.error(
            "Please initial the embedding by Word_Embedding().init_embedding_layer first"
        )
        return None

    # TODO: fix me, this shouldn't happen!!
    # missing entries are replaced by empty strings before tokenizing
    texts = X.where((pd.notnull(X)), '')
    log.info("X.head={}".format(texts.head(5)))
    log.info("X.shape={}".format(texts.shape))
    log.info("X.values.shape={}".format(texts.values.shape))

    flat_texts = texts.values.ravel()
    token_ids = self.tokenizer.texts_to_sequences(flat_texts)
    log.info("sequance X {}".format(token_ids))

    padded = sequence.pad_sequences(token_ids, maxlen=self.max_text_len)
    log.info("padding X {}".format(padded))
    return padded
def invalid_type_vector(series: pd.Series, csv_schema, type_field=None):
    """
    Null out (set to NaN) every value with a valid datatype in a vectorized
    operation, so that only the invalid values remain. Useful for building
    error reports.

    Args:
        series: the pd.Series to act on, often provided as a column slice of a
            DataFrame.
        csv_schema: the schema containing the details about the columns for
            this file
        type_field: the column name this series represents. Used to lookup
            type information from the schema, and should be provided if the
            `series.name` is not populated on the given `series`.

    Returns:
        A new Series with only invalid datatypes remaining from the one
        provided.
    """
    # mask requested from the helper with match_invalid=True
    invalid_mask = valid_type_bool_vector(
        series, csv_schema, type_field, match_invalid=True)
    return series.where(invalid_mask)
def freqdiff_metrics(orig_freqs: pd.Series,
                     synth_freqs: pd.Series,
                     metrics: Optional[List[str]] = None,
                     ) -> pd.Series:
    '''Compute frequency mismatch metrics for two value frequency series.

    :param orig_freqs: Frequencies of values (or their intervals) in the
        original dataframe column.
    :param synth_freqs: Frequencies of values (or their intervals) in the
        matching synthesized column.
    :param metrics: Names of metrics to include. If None, all metrics are
        computed. For a list of metrics, see :func:`frequency_mismatch`.
    :returns: A Series with metric values, with their names in the index.
    '''
    delta = synth_freqs - orig_freqs
    abs_delta = abs(delta)

    # sums of squared frequencies
    simpson_orig = (orig_freqs ** 2).sum()
    simpson_synth = (synth_freqs ** 2).sum()

    # element-wise minimum of both frequency series
    overlap_total = orig_freqs.where(orig_freqs <= synth_freqs, synth_freqs).sum()

    # rank distance between the value orderings, scaled by the value count
    rank_distance = damerau_levenshtein(
        orig_freqs.sort_values().index.tolist(),
        synth_freqs.sort_values().index.tolist(),
    ) / len(orig_freqs.index)

    # sum of p * log(p) over the strictly positive frequencies only
    synth_positive = synth_freqs[synth_freqs > 0]
    orig_positive = orig_freqs[orig_freqs > 0]
    plogp_synth = (synth_positive * np.log(synth_positive)).sum()
    plogp_orig = (orig_positive * np.log(orig_positive)).sum()

    metric_series = pd.Series({
        'rtae': abs_delta.sum(),
        'overlap_coef': overlap_total,
        'rank_damerau': rank_distance,
        'morisita_overlap': (
            2 * (orig_freqs * synth_freqs).sum()
            / (simpson_orig + simpson_synth)
        ),
        'mae': abs_delta.mean(),
        'rmse': (delta ** 2).mean() ** .5,
        'jaccard_dist': 1 - overlap_total,
        'simpson_diff': simpson_synth - simpson_orig,
        'entropy_diff': plogp_synth - plogp_orig,
    })

    if metrics is not None:
        return metric_series[metrics]
    return metric_series
def test_where_datetimelike_noop(self, dtype):
    """No-op where/mask on datetime-like data must not raise or upcast."""
    # GH#45135, analogue to GH#44181 for Period don't raise on no-op
    # For td64/dt64/dt64tz we already don't raise, but also are
    #  checking that we don't unnecessarily upcast to object.
    ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype)
    df = ser.to_frame()

    # nothing is selected for replacement, so "foo" is never inserted
    nothing = np.array([False, False, False])
    nothing_2d = nothing.reshape(-1, 1)

    res = ser.where(~nothing, "foo")
    tm.assert_series_equal(res, ser)

    res2 = df.where(~nothing_2d, "foo")
    tm.assert_frame_equal(res2, df)

    res3 = ser.mask(nothing, "foo")
    tm.assert_series_equal(res3, ser)

    res4 = df.mask(nothing_2d, "foo")
    tm.assert_frame_equal(res4, df)
def daily_growth_rate(series: pd.Series, **kwargs):
    """Plot the daily growth rate (in percent) of a rolling 7-period mean.

    Rolling means below THRESHOLD are ignored. Nothing is plotted when no
    growth rate can be computed. Always returns None.
    """
    PERIOD = 7
    THRESHOLD = 10  # minimum cases per day on average

    # growth rate
    smoothed = series.rolling(PERIOD).mean()
    smoothed = smoothed.where(smoothed >= THRESHOLD, other=np.nan)  # ignore small data
    ratio = smoothed / smoothed.shift(PERIOD)
    k = np.log(ratio) / PERIOD * 100  # daily growth rate %
    if k.isna().all():
        return None

    _fig, ax = plt.subplots()
    line(ax, k, kwargs)
    ax.axhline(0, color='#999999', lw=0.5)

    # prepend the threshold note to any footer supplied by the caller
    kwargs['lfooter'] = (
        f'When daily new cases >= {THRESHOLD}; ' + kwargs.get('lfooter', ''))
    finalise_plot(ax, **kwargs)
    return None
def test_broadcast(size, mask, item, box):
    """A boxed single item must broadcast in setitem, ``where`` and ``mask``."""
    selection = np.resize(mask, size)
    data = np.arange(size, dtype=float)

    # Construct the expected series by taking the source
    # data or item based on the selection
    expected = Series(
        [item if use_item else value
         for value, use_item in zip(data, selection)])

    # boolean-mask assignment
    s = Series(data)
    s[selection] = box(item)
    assert_series_equal(s, expected)

    # where on the inverted selection
    s = Series(data)
    assert_series_equal(s.where(~selection, box(item)), expected)

    # mask on the selection itself
    s = Series(data)
    assert_series_equal(s.mask(selection, box(item)), expected)
def rolling_mean_excluding_self(series: pd.Series,
                                window: int = 15) -> pd.Series:
    """Calculate the rolling hollow_mean() for each element in a series.

    Note: negative items in the series will be treated as zero.
    Note: the rolling window is centered; the final items of the series are
          filled separately from the last `window` items.

    Arguments:
    - series - pandas Series - the series for which the hollow-mean will be
      calculated for each item.
    - window - the size of the rolling window used for calculating the
      hollow_mean()

    Returns: a series of means"""
    non_negative = series.where(series >= 0, other=0)
    means = non_negative.rolling(window, center=True).apply(hollow_mean)

    # Overwrite the tail positions using the last `window` items, passing the
    # position inside that window to hollow_mean().
    tail = non_negative[-window:]
    for n in range(int(window / 2) + 1, window):
        means.iloc[n - window] = hollow_mean(tail, n)
    return means
def keepTopN(column: pandas.Series,
             n: int,
             default: Optional[object] = None) -> pandas.Series:
    """
    Keeps the top n most popular values of a Series, while replacing the rest
    with `default`

    Args:
        column (pandas.Series): Series to operate on
        n (int): How many values to keep
        default (object, optional): Defaults to NaN. Value with which to
            replace remaining values

    Returns:
        pandas.Series: Series with the most popular n values
    """
    replacement = numpy.nan if default is None else default

    # value_counts() orders the values by descending frequency
    counts = column.value_counts()
    limit = len(counts) if n > len(counts) else n
    popular = list(counts.iloc[:limit].index)

    return column.where(column.isin(popular), other=replacement)
def convert_time_to_most_suitable_unit(arr):
    """Cast a datetime array to a datetime64 unit matching its time step.

    The median difference between consecutive time stamps is expressed as a
    fraction of each candidate unit: ns, s, m, h, D, M (30 days) and
    Y (12 such months). Units with a fraction below 1 are discarded. If no
    remaining fraction is below 1.05, only units with a fraction of at least
    2 are kept. The remaining unit with the smallest fraction is used.

    Parameters
    ----------
    arr : array-like of datetime64
        Time stamps to convert.

    Returns
    -------
    numpy.ndarray
        ``arr`` cast to ``datetime64[unit]``. The array is returned without
        a unit change when it contains NaT, holds fewer than two values, or
        when no unit qualifies.
    """
    # `nan` replaces the `NaN` alias, which no longer exists in NumPy 2.x
    from numpy import array, diff, isnat, nan, nanmedian
    from pandas import Series

    # test if dates
    arr = array(arr)
    if isnat(arr).any():
        return arr
    # a step size can only be derived from at least two time stamps
    if arr.size < 2:
        return arr

    # convert to datetime[ns] floats
    time = arr.astype("datetime64[ns]").astype(float)
    # get the difference between time steps
    delta_time = diff(time)

    # approximate the best unit (without losing info); every entry is the
    # factor relative to the previous unit, so the denominator accumulates
    time_denominators = dict(ns=1, s=1e9, m=60, h=60, D=24, M=30, Y=12)
    fractions = {}
    denominator = 1
    for unit, factor in time_denominators.items():
        denominator *= factor
        frac = nanmedian(delta_time / denominator)
        # only units that will not lose time are kept
        fractions[unit] = frac if frac >= 1 else nan
    # explicit float dtype: an empty Series without dtype may become object
    dt_as_frac_of_unit = Series(fractions, dtype=float)

    # if the difference is not near enough the unit, exclude it
    # e.g. 35 day interval will eliminate Month as a unit
    if not ((dt_as_frac_of_unit - 1) < 0.05).any():
        dt_as_frac_of_unit = dt_as_frac_of_unit.where(
            (dt_as_frac_of_unit - 1) >= 1)

    # no candidate left (e.g. constant or decreasing time stamps)
    if dt_as_frac_of_unit.isna().all():
        return arr

    unit = dt_as_frac_of_unit.idxmin()

    # convert time units to appropriate units
    # dtype: datetime64 must be attached to unit
    return arr.astype(f"datetime64[{unit}]")
def test_fillna_float_casting(self, dtype, fill_type, scalar):
    """Filling a float Series must keep its dtype for every fill path."""
    # GH-43424
    def fresh():
        # each scenario starts from an untouched Series
        return Series([np.nan, 1.2], dtype=dtype)

    fill_values = Series([2, 2], dtype=fill_type)
    if scalar:
        fill_values = fill_values.dtype.type(2)
    expected = Series([2.0, 1.2], dtype=dtype)

    # fillna
    ser = fresh()
    tm.assert_series_equal(ser.fillna(fill_values), expected)

    # boolean-mask assignment
    ser = fresh()
    mask = ser.isna().to_numpy()
    ser[mask] = fill_values
    tm.assert_series_equal(ser, expected)

    # in-place mask
    ser = fresh()
    ser.mask(mask, fill_values, inplace=True)
    tm.assert_series_equal(ser, expected)

    # where on the inverted mask
    ser = fresh()
    tm.assert_series_equal(ser.where(~mask, fill_values), expected)
def custom_series_function(ser: pd.Series,
                           within: int) -> pd.core.series.Series:
    """Return the values lying within `within` of any summary statistic.

    The statistics are those reported by ``ser.describe()`` apart from
    'count' and 'std'. For a numeric series these are:
      - the minimum value
      - the 1st quartile value
      - the second quartile value
      - the mean
      - the third quartile value
      - the maximum value

    So, for example, if the mean is 5.0 and within is 0.1, all values between
    4.9 and 5.1 inclusive are returned. The original order of the values is
    kept.

    :param ser: Series to perform operation on
    :param within: The value to calculate the range of number within
    """
    selected = None
    for label, value in ser.describe().items():
        if label == 'count' or label == 'std':
            continue
        # inclusive band around this statistic
        in_band = ser.between(value - within, value + within)
        selected = in_band if selected is None else (selected | in_band)

    return ser.where(selected).dropna()
def sec_kill(p_v: pd.Series, y0: int) -> pd.Series:
    """
    Given the velocity at our launch point, decide whether the opponent can be
    "sec-killed" (beaten outright) when they simply bounce the ball back.

    :param p_v: pd.Series, velocities at the launch point
    :param y0: int, coordinate of the launch point
    :return: pd.Series whose elements are of type bool

    >>> sec_kill(pd.Series([-1000, 400]), 10)
    0    False
    1    False
    dtype: bool
    """
    # Testing completed (note carried over from the original author).
    # Mirror-point coordinate. Y: pd.Series
    Y = y0 + STEP * p_v
    # Convert the mirror point Y into a real point, then determine the legal
    # velocity interval.
    y, count = mirror2real(Y)
    # Opponent's vertical velocity range. v_range: tuple of four Series
    v_range = ball_v_range(y)
    # The sign of the velocity is flipped wherever `count` is odd.
    op_v = p_v.where(count % 2 == 0, -p_v)

    in_first_band = (op_v >= v_range[3]) & (op_v <= v_range[2])
    in_second_band = (op_v >= v_range[1]) & (op_v <= v_range[0])

    # Returns True or False: True means a sec-kill, False means none.
    # `&`, `|` and `~` are the element-wise and/or/not operators; `and`, `or`
    # and `not` cannot be used on a Series. `~` is used for the negation
    # because unary minus is not a supported operation on boolean arrays in
    # NumPy. Mind the operator precedence: the parentheses are required.
    return ~(in_first_band | in_second_band)
def five_day_on_five_day(series: pd.Series, **kwargs):
    """Plot the ratio of the rolling 5-period mean to its value 5 periods ago.

    Rolling means below THRESHOLD are ignored. Nothing is plotted when no
    ratio can be computed. Always returns None.
    """
    PERIOD = 5
    THRESHOLD = 20  # minimum cases per day on average

    # growth rate
    smoothed = series.rolling(PERIOD).mean()
    smoothed = smoothed.where(smoothed >= THRESHOLD, other=np.nan)  # ignore small data
    growth = smoothed / smoothed.shift(PERIOD)
    if growth.isna().all():
        return None

    # plot
    _fig, ax = plt.subplots()
    line(ax, growth, kwargs)
    ax.axhline(1, color='#999999', lw=0.5)

    # prepend threshold and latest value to any footer supplied by the caller
    previous_lfooter = kwargs.get('lfooter', '')
    latest = growth.dropna().iloc[-1]
    kwargs['lfooter'] = (
        f'When daily new cases >= {THRESHOLD}; '
        + f'Latest datapoint: {latest:.2f}; '
        + previous_lfooter)
    finalise_plot(ax, **kwargs)
    return None
def test_where_broadcast():
    """Length-1 array-likes must broadcast in mask assignment and ``where``."""
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy; `np.finfo(float)` describes the same float64 type.
    float_info = np.finfo(float)

    # Test a variety of differently sized series
    for size in range(2, 6):
        # Test a variety of boolean indices
        for selection in [
                # First element should be set
                np.resize([True, False, False, False, False], size),
                # Set alternating elements
                np.resize([True, False], size),
                # No element should be set
                np.resize([False], size)]:
            # Test a variety of different numbers as content
            for item in [2.0, np.nan, float_info.max, float_info.min]:
                # Test numpy arrays, lists and tuples as the input to be
                # broadcast
                for arr in [np.array([item]), [item], (item, )]:
                    data = np.arange(size, dtype=float)
                    s = Series(data)
                    s[selection] = arr
                    # Construct the expected series by taking the source
                    # data or item based on the selection
                    expected = Series([
                        item if use_item else data[i]
                        for i, use_item in enumerate(selection)
                    ])
                    assert_series_equal(s, expected)

                    s = Series(data)
                    result = s.where(~selection, arr)
                    assert_series_equal(result, expected)
def test_where_ndframe_align():
    """Wrong-length raw conditions raise; Series conditions are aligned."""
    msg = "Array conditional must be same shape as self"
    s = Series([1, 2, 3])

    # (condition of the wrong length, expected result once wrapped in a Series)
    cases = [
        ([True], Series([1, np.nan, np.nan])),
        (np.array([False, True, False, True]), Series([np.nan, 2, np.nan])),
    ]
    for cond, expected in cases:
        with pytest.raises(ValueError, match=msg):
            s.where(cond)

        out = s.where(Series(cond))
        tm.assert_series_equal(out, expected)
def test_where_new_category_raises(self):
    """Replacing with a value outside the categories must raise ValueError."""
    categorical = Categorical(["a", "b", "c"])
    ser = Series(categorical)

    # "d" is not one of the categories
    msg = "Cannot setitem on a Categorical with a new category"
    with pytest.raises(ValueError, match=msg):
        ser.where([True, False, True], "d")
def test_where_unsafe():
    """Boolean-mask assignment and ``where`` with dtype-changing values."""
    # unsafe dtype changes
    all_dtypes = [np.int8, np.int16, np.int32, np.int64,
                  np.float16, np.float32, np.float64]
    for dtype in all_dtypes:
        s = Series(np.arange(10), dtype=dtype)
        lower = s < 5
        s[lower] = list(range(2, 7))
        expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=dtype)
        assert_series_equal(s, expected)
        assert s.dtype == expected.dtype

    # these are allowed operations, but are upcasted
    for dtype in [np.int64, np.float64]:
        s = Series(np.arange(10), dtype=dtype)
        lower = s < 5
        values = [2.5, 3.5, 4.5, 5.5, 6.5]
        s[lower] = values
        expected = Series(values + list(range(5, 10)), dtype='float64')
        assert_series_equal(s, expected)
        assert s.dtype == expected.dtype

    # GH 9731
    s = Series(np.arange(10), dtype='int64')
    upper = s > 5
    values = [2.5, 3.5, 4.5, 5.5]
    s[upper] = values
    expected = Series(list(range(6)) + values, dtype='float64')
    assert_series_equal(s, expected)

    # can't do these as we are forced to change the itemsize of the input
    # to something we cannot
    for dtype in [np.int8, np.int16, np.int32, np.float16, np.float32]:
        s = Series(np.arange(10), dtype=dtype)
        lower = s < 5
        values = [2.5, 3.5, 4.5, 5.5, 6.5]
        key = tuple(lower)
        with pytest.raises(Exception):
            s[key] = values

    # GH3235
    s = Series(np.arange(10), dtype='int64')
    lower = s < 5
    s[lower] = list(range(2, 7))
    expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype='int64')
    assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype='int64')
    upper = s > 5
    s[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(s, expected)

    # the mask selects four entries, so five values must be rejected
    s = Series(np.arange(10))
    upper = s > 5
    for bad_values in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError):
            s[upper] = bad_values

    # dtype changes
    s = Series([1, 2, 3, 4])
    expected = Series([np.nan, np.nan, 3, 4])
    assert_series_equal(s.where(s > 2, np.nan), expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    assert isna(s[8])

    s = Series(range(10)).astype(float)
    s[s > 8] = None
    expected = Series(np.nan, index=[9])
    assert_series_equal(s[isna(s)], expected)