Example #1
0
    def test_fillna_consistency(self):
        """Mismatched fill values on a tz-naive datetime Series give object dtype.

        Covers GH 16402 for ``fillna``, ``where`` (the ``errors=`` keyword is
        ignored) and item assignment.
        """
        aware_fill = Timestamp('20130101', tz='US/Eastern')
        s = Series([Timestamp('20130101'), pd.NaT])

        # tz-aware fill into tz-naive data: result must be object dtype
        expected = Series([Timestamp('20130101'),
                           Timestamp('2013-01-01', tz='US/Eastern')],
                          dtype='object')
        assert_series_equal(s.fillna(aware_fill), expected)

        # where (we ignore the errors=); the check is performed twice
        for _ in range(2):
            result = s.where([True, False], aware_fill, errors='ignore')
            assert_series_equal(result, expected)

        # with a non-datetime
        expected = Series([Timestamp('20130101'), 'foo'])
        assert_series_equal(s.fillna('foo'), expected)

        # assignment
        assigned = s.copy()
        assigned[1] = 'foo'
        assert_series_equal(assigned, expected)
Example #2
0
def test_mask():
    """``Series.mask(cond)`` must mirror ``Series.where(~cond)``.

    The ``where`` side of each comparison is covered by test_where.
    """
    s = Series(np.random.randn(5))
    cond = s > 0
    inverse = ~cond

    assert_series_equal(s.where(inverse, np.nan), s.mask(cond))
    assert_series_equal(s.where(inverse), s.mask(cond))
    assert_series_equal(s.where(inverse, -s), s.mask(cond, -s))

    # condition that covers only the first three labels
    cond = Series([True, False, False, True, False], index=s.index)
    negated = -(s.abs())
    assert_series_equal(negated.where(~cond[:3]), negated.mask(cond[:3]))
    assert_series_equal(negated.where(~cond[:3], -negated),
                        negated.mask(cond[:3], -negated))

    # invalid conditions
    pytest.raises(ValueError, s.mask, 1)
    pytest.raises(ValueError, s.mask, cond[:3].values, -s)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    assert_series_equal(ints.mask(ints > 2, np.nan),
                        Series([1, 2, np.nan, np.nan]))
Example #3
0
def test_where():
    """Basic ``Series.where`` behaviour, including alignment of the condition."""
    s = Series(np.random.randn(5))
    cond = s > 0

    # where + dropna is the same as boolean selection
    assert_series_equal(s.where(cond).dropna(), s[cond])

    # replacing the negatives with their negation yields the absolute value
    assert_series_equal(s.where(cond, -s), s.abs())

    # result keeps the shape but is a new object
    kept = s.where(cond)
    assert (s.shape == kept.shape)
    assert (kept is not s)

    # test alignment
    cond = Series([True, False, False, True, False], index=s.index)
    negated = -(s.abs())

    expected = negated[cond].reindex(negated.index[:3]).reindex(negated.index)
    assert_series_equal(negated.where(cond[:3]), expected)

    expected = negated.abs()
    expected.iloc[0] = negated[0]
    assert_series_equal(negated.where(cond[:3], -negated), expected)
Example #4
0
def test_where_raise_on_error_deprecation():
    """``raise_on_error`` is deprecated for both ``where`` and ``mask`` (gh-14968)."""
    s = Series(np.random.randn(5))
    cond = s > 0
    for method in (s.where, s.mask):
        with tm.assert_produces_warning(FutureWarning):
            method(cond, raise_on_error=True)
Example #5
0
def test_where_invalid_input(cond):
    """Non-boolean or wrongly-shaped conditions raise ``ValueError`` (gh-15414)."""
    ser = Series([1, 2, 3])

    # only boolean arrays are accepted
    with pytest.raises(ValueError,
                       match="Boolean array expected for the condition"):
        ser.where(cond)

    # the condition must have the same shape as the Series
    with pytest.raises(ValueError,
                       match="Array conditional must be same shape as self"):
        ser.where([True])
Example #6
0
def test_where_inplace():
    """``where(..., inplace=True)`` matches the result of the copying call."""
    s = Series(np.random.randn(5))
    cond = s > 0

    # default replacement (NaN)
    masked = s.copy()
    masked.where(cond, inplace=True)
    assert_series_equal(masked.dropna(), s[cond])
    assert_series_equal(masked, s.where(cond))

    # explicit replacement values
    flipped = s.copy()
    flipped.where(cond, -s, inplace=True)
    assert_series_equal(flipped, s.where(cond, -s))
Example #7
0
def test_where_array_like(klass):
    """Any array-like boolean container is accepted as a condition (gh-15414)."""
    keep = klass([False, True, True])
    result = Series([1, 2, 3]).where(keep)
    assert_series_equal(result, Series([np.nan, 2, 3]))
Example #8
0
def test_mask():
    """``Series.mask(cond)`` must mirror ``Series.where(~cond)``.

    The ``where`` side of each comparison is covered by test_where.
    """
    s = Series(np.random.randn(5))
    cond = s > 0
    inverse = ~cond

    assert_series_equal(s.where(inverse, np.nan), s.mask(cond))
    assert_series_equal(s.where(inverse), s.mask(cond))
    assert_series_equal(s.where(inverse, -s), s.mask(cond, -s))

    # condition that covers only the first three labels
    cond = Series([True, False, False, True, False], index=s.index)
    negated = -(s.abs())
    assert_series_equal(negated.where(~cond[:3]), negated.mask(cond[:3]))
    assert_series_equal(negated.where(~cond[:3], -negated),
                        negated.mask(cond[:3], -negated))

    # invalid conditions
    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        s.mask(1)
    with pytest.raises(ValueError, match=msg):
        s.mask(cond[:3].values, -s)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    assert_series_equal(ints.mask(ints > 2, np.nan),
                        Series([1, 2, np.nan, np.nan]))

    # see gh-21891
    pair = Series([1, 2])
    tm.assert_series_equal(pair.mask([True, False]), Series([np.nan, 2]))
Example #9
0
def test_where_unsafe():
    """Boolean-mask assignment: upcasting, dtype preservation and length checks."""
    # see gh-9731: float values force an upcast of the int64 data
    ser = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = ser > 5
    expected = Series(lrange(6) + floats, dtype="float64")
    ser[upper] = floats
    assert_series_equal(ser, expected)

    # see gh-3235: integer values keep the int64 dtype
    ser = Series(np.arange(10), dtype='int64')
    lower = ser < 5
    ser[lower] = lrange(2, 7)
    expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64')
    assert_series_equal(ser, expected)
    assert ser.dtype == expected.dtype

    ser = Series(np.arange(10), dtype='int64')
    upper = ser > 5
    ser[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(ser, expected)

    # value lists whose length does not match the mask must be rejected
    ser = Series(np.arange(10))
    upper = ser > 5
    for wrong_length in ([5, 4, 3, 2, 1], [0] * 5):

        def assign_mismatched():
            ser[upper] = wrong_length

        pytest.raises(ValueError, assign_mismatched)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    assert_series_equal(ints.where(ints > 2, np.nan),
                        Series([np.nan, np.nan, 3, 4]))

    # GH 4667
    # setting with None changes dtype
    ser = Series(range(10)).astype(float)
    ser[8] = None
    assert isna(ser[8])

    ser = Series(range(10)).astype(float)
    ser[ser > 8] = None
    assert_series_equal(ser[isna(ser)], Series(np.nan, index=[9]))
Example #10
0
def test_where_timedelta_coerce():
    """Replacing every timedelta value with numbers coerces the result dtype."""
    s = Series([1, 2], dtype='timedelta64[ns]')
    mask = np.array([False, False])

    # scalar / list, int / float replacements all give the same integer result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        assert_series_equal(s.where(mask, other), expected)

    # a missing value in the replacement forces object dtype
    expected = Series([10, None], dtype='object')
    assert_series_equal(s.where(mask, [10.0, np.nan]), expected)
 def consecutive_wins_losses(self):
     '''
     Calculates the positive and negative runs in the trade series.

     Trades are ordered by their 'exit' column and the sign of
     'base_return' decides whether a trade counts as a win or a loss.
     Returns a tuple (positive_runs, negative_runs) of Series holding the
     length of every consecutive run.
     '''
     ordered = self.as_dataframe().sort_values(by = 'exit')
     outcome = sign(ordered.base_return)

     def run_lengths(hits):
         # 0/1 indicator padded with a zero on both sides so that every
         # run has a rising and a falling edge.
         padded = Series(hstack(([0], (hits * 1).values, [0])))
         step = padded.diff()
         begins = Series(padded.where(step > 0).dropna().index.tolist())
         finishes = Series(padded.where(step < 0).dropna().index.tolist())
         return finishes - begins

     return (run_lengths(outcome > 0), run_lengths(outcome < 0))
Example #12
0
def test_where_unsafe():
    """Boolean-mask assignment: upcasting, dtype preservation and length checks."""
    # see gh-9731: float values force an upcast of the int64 data
    ser = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = ser > 5
    expected = Series(list(range(6)) + floats, dtype="float64")
    ser[upper] = floats
    tm.assert_series_equal(ser, expected)

    # see gh-3235: integer values keep the int64 dtype
    ser = Series(np.arange(10), dtype="int64")
    lower = ser < 5
    ser[lower] = range(2, 7)
    expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64")
    tm.assert_series_equal(ser, expected)
    assert ser.dtype == expected.dtype

    ser = Series(np.arange(10), dtype="int64")
    upper = ser > 5
    ser[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64")
    tm.assert_series_equal(ser, expected)

    # value lists whose length does not match the mask must be rejected
    ser = Series(np.arange(10))
    upper = ser > 5
    msg = "cannot set using a list-like indexer with a different length than the value"
    for wrong_length in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError, match=msg):
            ser[upper] = wrong_length

    # dtype changes
    ints = Series([1, 2, 3, 4])
    tm.assert_series_equal(ints.where(ints > 2, np.nan),
                           Series([np.nan, np.nan, 3, 4]))

    # GH 4667
    # setting with None changes dtype
    ser = Series(range(10)).astype(float)
    ser[8] = None
    assert isna(ser[8])

    ser = Series(range(10)).astype(float)
    ser[ser > 8] = None
    tm.assert_series_equal(ser[isna(ser)], Series(np.nan, index=[9]))
Example #13
0
def test_where_unsafe():
    """Boolean-mask assignment: upcasting, dtype preservation and length checks."""
    # see gh-9731: float values force an upcast of the int64 data
    ser = Series(np.arange(10), dtype="int64")
    floats = [2.5, 3.5, 4.5, 5.5]
    upper = ser > 5
    expected = Series(lrange(6) + floats, dtype="float64")
    ser[upper] = floats
    assert_series_equal(ser, expected)

    # see gh-3235: integer values keep the int64 dtype
    ser = Series(np.arange(10), dtype='int64')
    lower = ser < 5
    ser[lower] = lrange(2, 7)
    expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64')
    assert_series_equal(ser, expected)
    assert ser.dtype == expected.dtype

    ser = Series(np.arange(10), dtype='int64')
    upper = ser > 5
    ser[upper] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(ser, expected)

    # value lists whose length does not match the mask must be rejected
    ser = Series(np.arange(10))
    upper = ser > 5
    msg = "cannot assign mismatch length to masked array"
    for wrong_length in ([5, 4, 3, 2, 1], [0] * 5):
        with pytest.raises(ValueError, match=msg):
            ser[upper] = wrong_length

    # dtype changes
    ints = Series([1, 2, 3, 4])
    assert_series_equal(ints.where(ints > 2, np.nan),
                        Series([np.nan, np.nan, 3, 4]))

    # GH 4667
    # setting with None changes dtype
    ser = Series(range(10)).astype(float)
    ser[8] = None
    assert isna(ser[8])

    ser = Series(range(10)).astype(float)
    ser[ser > 8] = None
    assert_series_equal(ser[isna(ser)], Series(np.nan, index=[9]))
Example #14
0
def test_where_datetime_conversion():
    """Replacing every datetime value with numbers coerces the result dtype."""
    s = Series(date_range("20130102", periods=2))
    mask = np.array([False, False])

    # scalar / list, int / float replacements all give the same integer result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        tm.assert_series_equal(s.where(mask, other), expected)

    # a missing value in the replacement forces object dtype
    expected = Series([10, None], dtype="object")
    tm.assert_series_equal(s.where(mask, [10.0, np.nan]), expected)

    # GH 15701: masking tz-aware timestamps yields NaT
    timestamps = [
        "2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"
    ]
    aware = Series([Timestamp(stamp) for stamp in timestamps])
    masked = aware.where(Series([False, True]))
    tm.assert_series_equal(masked, Series([pd.NaT, aware[1]]))
Example #15
0
def test_where_datetime_conversion():
    """Replacing every datetime value with numbers coerces the result dtype."""
    s = Series(date_range('20130102', periods=2))
    mask = np.array([False, False])

    # scalar / list, int / float replacements all give the same integer result
    expected = Series([10, 10])
    for other in ([10, 10], 10, 10.0, [10.0, 10.0]):
        assert_series_equal(s.where(mask, other), expected)

    # a missing value in the replacement forces object dtype
    expected = Series([10, None], dtype='object')
    assert_series_equal(s.where(mask, [10.0, np.nan]), expected)

    # GH 15701: masking tz-aware timestamps yields NaT
    timestamps = ['2016-12-31 12:00:04+00:00',
                  '2016-12-31 12:00:04.010000+00:00']
    aware = Series([pd.Timestamp(stamp) for stamp in timestamps])
    masked = aware.where(Series([False, True]))
    assert_series_equal(masked, Series([pd.NaT, aware[1]]))
Example #16
0
def test_where_dt_tz_values(tz_naive_fixture):
    """``where`` between two datetime Series sharing a timezone keeps that tz."""

    def as_series(days):
        # Series of datetimes localized to the timezone under test
        return Series(pd.DatetimeIndex(days, tz=tz_naive_fixture))

    ser1 = as_series(["20150101", "20150102", "20150103"])
    ser2 = as_series(["20160514", "20160515", "20160516"])
    mask = Series([True, True, False])

    result = ser1.where(mask, ser2)

    exp = as_series(["20150101", "20150102", "20160516"])
    tm.assert_series_equal(exp, result)
def test_where_with_numeric_data_and_other(data, other):
    """Sparse ``where`` with a replacement value matches the dense result (GH 17386)."""
    lower_bound = 1.5

    # dense reference computation
    dense = Series(data)
    dense_expected = dense.where(dense > lower_bound, other)
    sparse_expected = SparseSeries(dense_expected, fill_value=other)

    # sparse computation under test
    sparse = SparseSeries(data)
    result = sparse.where(sparse > lower_bound, other)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
    def _drop_duplicated_marker(self,
                                marker_column: pd.Series,
                                start: bool = True):
        """Modify marker column to keep only first start marker or last end
        marker.

        The marker values are taken from ``self.marker_start`` and
        ``self.marker_end`` so that the method works for any configured
        marker pair.

        Parameters
        ----------
        marker_column: pd.Series
            Values for which duplicated markers will be removed.
        start: bool, optional
            Indicate which duplicates should be dropped. If True, only first
            start marker is kept. If False, only last end marker is kept.

        Returns
        -------
        dropped: pd.Series
            Copy of ``marker_column`` in which the duplicated markers are
            replaced by NaN.

        """

        # Blank out everything that is neither a start nor an end marker.
        valid_values = [self.marker_start, self.marker_end]
        denoised = marker_column.where(marker_column.isin(valid_values))

        if start:
            # Propagate markers forward and compare with the previous row.
            fill = denoised.ffill()
            marker = self.marker_start
            shift = 1
        else:
            # Propagate markers backward and compare with the next row.
            fill = denoised.bfill()
            marker = self.marker_end
            shift = -1

        # Neighbouring marker, restricted to rows inside a run of `marker`.
        shifted = fill.shift(shift)
        shifted_start_only = shifted.where(fill.eq(marker))

        # A row is a duplicate when it repeats the marker of its neighbour.
        mask_drop = (shifted_start_only == marker_column)
        dropped = marker_column.where(~mask_drop)

        return dropped
def test_where_with_numeric_data(data):
    """Sparse ``where`` without a replacement matches the dense result (GH 17386)."""
    lower_bound = 1.5

    # dense reference computation
    dense = Series(data)
    dense_expected = dense.where(dense > lower_bound)
    sparse_expected = SparseSeries(dense_expected)

    # sparse computation under test
    sparse = SparseSeries(data)
    result = sparse.where(sparse > lower_bound)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
Example #20
0
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments raise."""
    s = Series(np.random.randn(5))
    cond = s > 0

    # scalar and wrongly-sized conditions
    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(cond[:3].values, -s)

    # GH 2745: only the masked position takes a value
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    tm.assert_series_equal(pair, Series([0, 2]))

    # failures
    length_msg = "cannot set using a list-like indexer with a different length than the value"
    for wrong_length in ([0, 2, 3], []):
        with pytest.raises(ValueError, match=length_msg):
            pair[[True, False]] = wrong_length
def test_where_with_bool_data_and_other(other):
    """Sparse boolean ``where`` with a replacement matches dense (GH 17386)."""
    data = [False, False, True, True, False, False]
    cond = True

    # dense reference computation
    dense = Series(data)
    dense_expected = dense.where(dense == cond, other)
    sparse_expected = SparseSeries(dense_expected, fill_value=other)

    # sparse computation under test
    sparse = SparseSeries(data)
    result = sparse.where(sparse == cond, other)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_where_with_bool_data():
    """Sparse boolean ``where`` without a replacement matches dense (GH 17386)."""
    data = [False, False, True, True, False, False]
    cond = True

    # dense reference computation
    dense = Series(data)
    dense_expected = dense.where(dense == cond)
    sparse_expected = SparseSeries(dense_expected)

    # sparse computation under test
    sparse = SparseSeries(data)
    result = sparse.where(sparse == cond)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
Example #23
0
    def test_fillna_consistency(self):
        """Mismatched fill values on a tz-naive datetime Series give object dtype.

        Covers GH#16402 for ``fillna``, ``where`` (whose ``errors`` keyword
        emits a FutureWarning and is otherwise ignored) and item assignment.
        """
        aware_fill = Timestamp("20130101", tz="US/Eastern")
        ser = Series([Timestamp("20130101"), NaT])

        # tz-aware fill into tz-naive data: result must be object dtype
        expected = Series(
            [Timestamp("20130101"),
             Timestamp("2013-01-01", tz="US/Eastern")],
            dtype="object",
        )
        tm.assert_series_equal(ser.fillna(aware_fill), expected)

        # where (we ignore the errors=); the check is performed twice
        msg = "The 'errors' keyword in "
        for _ in range(2):
            with tm.assert_produces_warning(FutureWarning, match=msg):
                result = ser.where([True, False], aware_fill, errors="ignore")
            tm.assert_series_equal(result, expected)

        # with a non-datetime
        expected = Series([Timestamp("20130101"), "foo"])
        tm.assert_series_equal(ser.fillna("foo"), expected)

        # assignment
        assigned = ser.copy()
        assigned[1] = "foo"
        tm.assert_series_equal(assigned, expected)
Example #24
0
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments raise."""
    s = Series(np.random.randn(5))
    cond = s > 0

    # scalar and wrongly-sized conditions
    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(cond[:3].values, -s)

    # GH 2745: only the masked position takes a value
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    assert_series_equal(pair, Series([0, 2]))

    # failures: too many values, then no values at all
    too_many_msg = "cannot assign mismatch length to masked array"
    with pytest.raises(ValueError, match=too_many_msg):
        pair[[True, False]] = [0, 2, 3]

    empty_msg = ("NumPy boolean array indexing assignment cannot assign 0 "
                 "input values to the 1 output values where the mask is true")
    with pytest.raises(ValueError, match=empty_msg):
        pair[[True, False]] = []
Example #25
0
def test_where_error():
    """Invalid ``where`` conditions and mismatched mask assignments raise."""
    s = Series(np.random.randn(5))
    cond = s > 0

    # scalar and wrongly-sized conditions
    shape_msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=shape_msg):
        s.where(1)
    with pytest.raises(ValueError, match=shape_msg):
        s.where(cond[:3].values, -s)

    # GH 2745: only the masked position takes a value
    pair = Series([1, 2])
    pair[[True, False]] = [0, 1]
    tm.assert_series_equal(pair, Series([0, 2]))

    # failures: too many values, then no values at all
    too_many_msg = "cannot assign mismatch length to masked array"
    with pytest.raises(ValueError, match=too_many_msg):
        pair[[True, False]] = [0, 2, 3]

    empty_msg = ("NumPy boolean array indexing assignment cannot assign 0 "
                 "input values to the 1 output values where the mask is true")
    with pytest.raises(ValueError, match=empty_msg):
        pair[[True, False]] = []
Example #26
0
def test_where_numeric_with_string():
    """String replacements in an integer Series upcast to object (GH 9280)."""
    s = Series([1, 2, 3])

    # scalar, list and ndarray replacements must all behave the same way
    replacements = ("X", ["X", "Y", "Z"], np.array(["X", "Y", "Z"]))
    for other in replacements:
        w = s.where(s > 1, other)

        # only the first element is replaced; the others stay integers
        assert not is_integer(w[0])
        assert is_integer(w[1])
        assert is_integer(w[2])
        assert isinstance(w[0], str)
        assert w.dtype == "object"
Example #27
0
def imputation_normal_distribution(log_intensities: pd.Series,
                                   mean_shift=IMPUTATION_MEAN_SHIFT,
                                   std_shrinkage=IMPUTATION_STD_SHRINKAGE):
    """Impute missing log-transformed intensity values of DDA run.

    Every missing value is replaced by its own independent draw from a
    normal distribution whose mean is shifted down-wards and whose standard
    deviation is scaled relative to the observed values. The global NumPy
    random state is re-seeded with ``RANDOM_SEED`` on every call, so results
    are reproducible.

    Parameters
    ----------
    log_intensities: pd.Series
        Series of normally distributed values. Here usually log-transformed
        protein intensities.
    mean_shift: integer, float
        Shift the mean of the log_intensities by factors of their standard
        deviation to the negative.
    std_shrinkage: float
        Value greater than zero by which to shrink (or inflate) the
        standard deviation of the log_intensities.

    Returns
    -------
    pd.Series
        Series of the same shape with missing values replaced by draws.

    Raises
    ------
    ValueError
        If the data cannot be converted to a Series, if ``mean_shift`` is
        negative or if ``std_shrinkage`` is not strictly positive.
    """
    np.random.seed(RANDOM_SEED)
    if not isinstance(log_intensities, pd.Series):
        try:
            # array-like, Iterable, dict, or scalar value?
            log_intensities = pd.Series(log_intensities)
            logger.warning("Series created of Iterable.")
        except Exception as e:
            raise ValueError(
                "Please provide data which is a pandas.Series or an Iterable",
                e) from e
    if mean_shift < 0:
        raise ValueError(
            "Please specify a positive float as the std.-dev. is non-negative."
        )
    if std_shrinkage <= 0:
        raise ValueError(
            "Please specify a positive float as shrinkage factor for std.-dev."
        )
    if std_shrinkage >= 1:
        logger.warning("Standard Deviation will increase for imputed values.")

    mean = log_intensities.mean()
    std = log_intensities.std()

    mean_shifted = mean - (std * mean_shift)
    std_shrinked = std * std_shrinkage

    # One draw per position: a single scalar draw would give every missing
    # value the same imputed number instead of a sample of the distribution.
    draws = np.random.normal(mean_shifted, std_shrinked,
                             size=len(log_intensities))
    return log_intensities.where(log_intensities.notna(), draws)
    def encode_X(self, X: pd.Series):
        """Turn a Series of texts into padded token-id sequences.

        Returns None (after logging an error) when no tokenizer has been
        set up. Otherwise nulls are replaced by empty strings, the texts are
        tokenized with ``self.tokenizer`` and padded to ``self.max_text_len``.
        """
        if self.tokenizer is None:
            self.logger.error(
                "Please initial the embedding by Word_Embedding().init_embedding_layer first"
            )
            return None

        log = self.logger.info

        # TODO: fix me, this shouldn't happen!!
        texts = X.where((pd.notnull(X)), '')
        log("X.head={}".format(texts.head(5)))
        log("X.shape={}".format(texts.shape))
        log("X.values.shape={}".format(texts.values.shape))

        token_ids = self.tokenizer.texts_to_sequences(texts.values.ravel())
        log("sequance X {}".format(token_ids))

        padded = sequence.pad_sequences(token_ids, maxlen=self.max_text_len)
        log("padding X {}".format(padded))
        return padded
Example #29
0
def invalid_type_vector(series: pd.Series, csv_schema, type_field=None):
    """ Null out (set to NaN) every value with a valid datatype in a vectorized operation,
        leaving only the invalid values. Useful for building error reports.

        Args:
            series: the pd.Series to act on, often provided as a column slice of a DataFrame.
            csv_schema: the schema containing the details about the columns for this file
            type_field: the column name this series represents. Used to lookup type information from the schema,
                        and should be provided if the `series.name` is not populated on the given `series`.

        Returns:
            A new Series with only invalid datatypes remaining from the one provided.
    """
    # Boolean vector produced by the validator with match_invalid=True.
    invalid_mask = valid_type_bool_vector(series,
                                          csv_schema,
                                          type_field,
                                          match_invalid=True)
    return series.where(invalid_mask)
Example #30
0
def freqdiff_metrics(orig_freqs: pd.Series,
                     synth_freqs: pd.Series,
                     metrics: Optional[List[str]] = None,
                     ) -> pd.Series:
    '''Compute frequency mismatch metrics for two value frequency series.

    :param orig_freqs: Frequencies of values (or their intervals) in the
        original dataframe column.
    :param synth_freqs: Frequencies of values (or their intervals) in the
        matching synthesized column.
    :param metrics: Names of metrics to include. If None, all metrics are
        computed. For a list of metrics, see :func:`frequency_mismatch`.
    :returns: A Series with metric values, with their names in the index.
    '''

    def plogp_sum(freqs):
        # Sum of p * log(p) over the strictly positive frequencies.
        positive = freqs[freqs>0]
        return (positive * np.log(freqs[freqs>0])).sum()

    diff = synth_freqs - orig_freqs
    abs_diff = abs(diff)
    simpson_orig = (orig_freqs ** 2).sum()
    simpson_synth = (synth_freqs ** 2).sum()
    # Element-wise minimum of the two frequency series.
    overlap = orig_freqs.where(orig_freqs <= synth_freqs, synth_freqs)

    rank_distance = damerau_levenshtein(
        orig_freqs.sort_values().index.tolist(),
        synth_freqs.sort_values().index.tolist(),
    )
    cross_sum = (orig_freqs * synth_freqs).sum()

    values = {}
    values['rtae'] = abs_diff.sum()
    values['overlap_coef'] = overlap.sum()
    values['rank_damerau'] = rank_distance / len(orig_freqs.index)
    values['morisita_overlap'] = 2 * cross_sum / (simpson_orig + simpson_synth)
    values['mae'] = abs(diff).mean()
    values['rmse'] = (diff ** 2).mean() ** .5
    values['jaccard_dist'] = 1 - overlap.sum()
    values['simpson_diff'] = simpson_synth - simpson_orig
    values['entropy_diff'] = plogp_sum(synth_freqs) - plogp_sum(orig_freqs)
    metric_series = pd.Series(values)

    if metrics is not None:
        return metric_series[metrics]
    return metric_series
Example #31
0
    def test_where_datetimelike_noop(self, dtype):
        """No-op ``where``/``mask`` with an incompatible value keeps the data.

        GH#45135, analogue to GH#44181 for Period: don't raise on no-op.
        For td64/dt64/dt64tz we already don't raise, but also are checking
        that we don't unnecessarily upcast to object.
        """
        ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype)
        df = ser.to_frame()
        nothing = np.array([False, False, False])
        nothing_2d = nothing.reshape(-1, 1)

        # where: keep everything
        tm.assert_series_equal(ser.where(~nothing, "foo"), ser)
        tm.assert_frame_equal(df.where(~nothing_2d, "foo"), df)

        # mask: replace nothing
        tm.assert_series_equal(ser.mask(nothing, "foo"), ser)
        tm.assert_frame_equal(df.mask(nothing_2d, "foo"), df)
Example #32
0
def daily_growth_rate(series: pd.Series, **kwargs):
    """Plot the daily growth rate (in %) of a smoothed series.

    The series is smoothed with a PERIOD-day rolling mean, values below
    THRESHOLD are ignored, and the growth rate is derived from the log-ratio
    of values PERIOD days apart. Nothing is plotted when no growth rate can
    be computed. Always returns None.
    """
    PERIOD = 7
    THRESHOLD = 10  # minimum cases per day on average

    # growth rate
    smoothed = series.rolling(PERIOD).mean()
    # ignore small data
    smoothed = smoothed.where(smoothed >= THRESHOLD, other=np.nan)
    ratio = smoothed / smoothed.shift(PERIOD)
    k = np.log(ratio) / PERIOD * 100  # daily growth rate %
    if k.isna().all():
        return None

    fig, ax = plt.subplots()
    line(ax, k, kwargs)
    ax.axhline(0, color='#999999', lw=0.5)

    # prepend the threshold note to any footer supplied by the caller
    previous_lfooter = kwargs.get('lfooter', '')
    kwargs['lfooter'] = (f'When daily new cases >= {THRESHOLD}; '
                         + previous_lfooter)
    finalise_plot(ax, **kwargs)
    return None
Example #33
0
def test_broadcast(size, mask, item, box):
    """Mask assignment, ``where`` and ``mask`` all broadcast a boxed item."""
    selection = np.resize(mask, size)
    data = np.arange(size, dtype=float)

    # Construct the expected series by taking the source
    # data or item based on the selection
    expected = Series([item if chosen else original
                       for chosen, original in zip(selection, data)])

    # item assignment through the boolean mask
    assigned = Series(data)
    assigned[selection] = box(item)
    assert_series_equal(assigned, expected)

    # where keeps the unselected positions
    assert_series_equal(Series(data).where(~selection, box(item)), expected)

    # mask replaces the selected positions
    assert_series_equal(Series(data).mask(selection, box(item)), expected)
Example #34
0
def test_broadcast(size, mask, item, box):
    """Mask assignment, ``where`` and ``mask`` all broadcast a boxed item."""
    selection = np.resize(mask, size)
    data = np.arange(size, dtype=float)

    # Construct the expected series by taking the source
    # data or item based on the selection
    expected = Series([item if chosen else original
                       for chosen, original in zip(selection, data)])

    # item assignment through the boolean mask
    assigned = Series(data)
    assigned[selection] = box(item)
    assert_series_equal(assigned, expected)

    # where keeps the unselected positions
    assert_series_equal(Series(data).where(~selection, box(item)), expected)

    # mask replaces the selected positions
    assert_series_equal(Series(data).mask(selection, box(item)), expected)
Example #35
0
def rolling_mean_excluding_self(series: pd.Series,
                                window: int = 15) -> pd.Series:
    """ Calculate the rolling hollow_mean() for each element in a series.
        Note: negative items in the series are treated as zero.
        Note: the trailing items of the result (those the centred window
              cannot reach) are filled from hollow_mean() applied to the
              last `window` items.
        Arguments:
        - series - pandas Series - the series for which the
            hollow-mean will be calculated for each item.
        - window - the size of the rolling window used for calculating
            the hollow_mean()
        Returns: a series of means"""

    # clip negatives to zero before averaging
    clipped = series.where(series >= 0, other=0)
    result = clipped.rolling(window, center=True).apply(hollow_mean)

    # overwrite the tail positions, addressed from the end of the series
    first_tail = int(window / 2) + 1
    for n in range(first_tail, window):
        result.iloc[n - window] = hollow_mean(clipped[-window:], n)
    return result
Example #36
0
def keepTopN(column: pandas.Series,
             n: int,
             default: Optional[object] = None) -> pandas.Series:
    """
    Keep the n most frequent values of a Series and replace all others.

    Args:
        column (pandas.Series): Series to operate on
        n (int): How many values to keep
        default (object, optional): Value with which to replace the
            remaining values. Defaults to NaN.

    Returns:
        pandas.Series: Series with only the most popular n values left
    """
    replacement = numpy.nan if default is None else default

    # value_counts() is ordered by descending frequency
    counts = column.value_counts()
    limit = min(n, len(counts))
    popular = list(counts[:limit].index)

    is_popular = column.isin(popular)
    return column.where(is_popular, other=replacement)
Example #37
0
def convert_time_to_most_suitable_unit(arr):
    """Cast datetimes to the coarsest datetime64 unit that fits their step.

    The median difference between consecutive timestamps is expressed in
    each candidate unit (ns, s, m, h, D, M as 30 days, Y as 12 such
    months). Units in which the median step is shorter than one unit are
    discarded, and the remaining unit with the smallest step count wins.

    :param arr: array-like of datetimes; it is passed through
        ``numpy.array`` and must be accepted by ``numpy.isnat``.
    :return: ``arr`` as a numpy array cast to ``datetime64[<unit>]``.
        The array is returned without a unit change when it contains
        NaT, has fewer than two elements, or no unit qualifies.
    """
    from pandas import Series
    from numpy import array, isnat, diff, nan, nanmedian

    arr = array(arr)
    # arrays containing NaT are handed back untouched
    if isnat(arr).any():
        return arr

    # timestamps as float nanoseconds
    time = arr.astype("datetime64[ns]").astype(float)

    # step between consecutive timestamps; nothing to measure with < 2 items
    delta_time = diff(time)
    if delta_time.size == 0:
        return arr

    # each entry is the factor relative to the previous unit, so the running
    # product is that unit's length in nanoseconds
    time_denominators = dict(ns=1, s=1e9, m=60, h=60, D=24, M=30, Y=12)

    # explicit float dtype: an untyped empty Series is object-typed on recent
    # pandas, which idxmin() does not accept
    dt_as_frac_of_unit = Series(index=list(time_denominators), dtype=float)
    denominator = 1
    for key, factor in time_denominators.items():
        denominator *= factor
        frac = nanmedian(delta_time / denominator)
        # only units that will not lose time are kept
        dt_as_frac_of_unit[key] = frac if frac >= 1 else nan

    # if the difference is not near enough the unit, exclude it
    # e.g. 35 day interval will eliminate Month as a unit
    if not ((dt_as_frac_of_unit - 1) < 0.05).any():
        dt_as_frac_of_unit = dt_as_frac_of_unit.where(lambda a: (a - 1) >= 1)

    # no candidate left (e.g. zero or sub-unit median step): keep input as is
    if dt_as_frac_of_unit.isna().all():
        return arr
    unit = dt_as_frac_of_unit.idxmin()

    # re-express the timestamps in the chosen unit
    return arr.astype(f"datetime64[{unit}]")
Example #38
0
    def test_fillna_float_casting(self, dtype, fill_type, scalar):
        # GH-43424
        ser = Series([np.nan, 1.2], dtype=dtype)
        fill_values = Series([2, 2], dtype=fill_type)
        if scalar:
            fill_values = fill_values.dtype.type(2)

        result = ser.fillna(fill_values)
        expected = Series([2.0, 1.2], dtype=dtype)
        tm.assert_series_equal(result, expected)

        ser = Series([np.nan, 1.2], dtype=dtype)
        mask = ser.isna().to_numpy()
        ser[mask] = fill_values
        tm.assert_series_equal(ser, expected)

        ser = Series([np.nan, 1.2], dtype=dtype)
        ser.mask(mask, fill_values, inplace=True)
        tm.assert_series_equal(ser, expected)

        ser = Series([np.nan, 1.2], dtype=dtype)
        res = ser.where(~mask, fill_values)
        tm.assert_series_equal(res, expected)
Example #39
0
def custom_series_function(ser: pd.Series,
                           within: int) -> pd.core.series.Series:
    """Keep the values lying close to one of the series' summary statistics.

    The reference points are every entry of ``ser.describe()`` except
    ``count`` and ``std`` (for a numeric series: mean, min, the three
    quartiles and max). A value is kept when it lies within ``within``
    of at least one reference point, bounds included; for example a
    mean of 5.0 with ``within`` 0.1 keeps everything from 4.9 to 5.1.

    The surviving values are returned in their original order with their
    original index labels.

    :param ser: Series to perform operation on
    :param within: Half-width of the band around each reference point
    """
    summary = ser.describe()
    centres = [value for label, value in summary.items()
               if label not in ('count', 'std')]

    # OR together one inclusive band per reference point.
    keep = None
    for centre in centres:
        low, high = centre - within, centre + within
        in_band = (ser >= low) & (ser <= high)
        keep = in_band if keep is None else keep | in_band

    # Values outside every band become NaN and are dropped.
    return ser.where(keep).dropna()
Example #40
0
def sec_kill(p_v: pd.Series, y0: int) -> pd.Series:
    """
    Decide, from the coordinate and velocity at our launch point, whether
    the opponent is beaten outright ("sec-killed") when they can only
    perform a simple bounce.
    :param p_v: pd.Series, velocity at the launch point
    :param y0: int, coordinate of the launch point
    :return: pd.Series whose elements are bool
    >>> sec_kill(pd.Series([-1000, 400]), 10)
    0    False
    1    False
    dtype: bool
    """
    # Original author's note: testing completed.
    # Mirror-point coordinate. Y: pd.Series
    Y = y0 + STEP * p_v
    # Convert the mirror point Y into the real point, then derive the legal
    # velocity interval from it.
    y, count = mirror2real(Y)
    # Opponent's vertical velocity range; v_range is a tuple of four Series.
    v_range = ball_v_range(y)
    # The velocity keeps its sign where count is even and is negated where
    # it is odd: op_v = p_v if (count % 2 == 0) else (- p_v)
    op_v = p_v.where(count % 2 == 0, -p_v)
    # Element-wise logic on Series uses & | ~ (and / or / not); the keywords
    # and, or, not do not work here. These operators bind tighter than
    # comparisons, so every comparison must be parenthesised.
    # Result: True means the opponent is sec-killed (op_v lies in neither
    # interval), False means it is not.
    return ~(((op_v >= v_range[3]) & (op_v <= v_range[2])) |
             ((op_v >= v_range[1]) & (op_v <= v_range[0])))
Example #41
0
def five_day_on_five_day(series: pd.Series, **kwargs):
    """Plot the 5-day rolling mean divided by its own value 5 days earlier.

    Rolling means below THRESHOLD are blanked out before the ratio is
    taken. When no ratio can be computed at all, nothing is plotted.
    Otherwise the ratio is drawn with a reference line at 1, and the
    'lfooter' keyword is prefixed with the threshold and the latest
    ratio before the keywords are forwarded to finalise_plot().

    Always returns None.
    """
    PERIOD = 5
    THRESHOLD = 20  # minimum cases per day on average

    # growth rate
    smoothed = series.rolling(PERIOD).mean()
    # ignore small data
    smoothed = smoothed.where(smoothed >= THRESHOLD, other=np.nan)
    growth = smoothed / smoothed.shift(PERIOD)

    # nothing to draw
    if growth.isna().all():
        return None

    # plot
    _fig, ax = plt.subplots()
    line(ax, growth, kwargs)
    ax.axhline(1, color='#999999', lw=0.5)

    # prepend our summary to whatever footer the caller supplied
    inherited_footer = kwargs.get('lfooter', '')
    latest = growth.dropna().iloc[-1]
    kwargs['lfooter'] = (
        f'When daily new cases >= {THRESHOLD}; '
        f'Latest datapoint: {latest:.2f}; '
        + inherited_footer)
    finalise_plot(ax, **kwargs)
    return None
Example #42
0
def test_where_broadcast():
    # Test a variety of differently sized series
    for size in range(2, 6):
        # Test a variety of boolean indices
        for selection in [
                # First element should be set
                np.resize([True, False, False, False, False], size),
                # Set alternating elements]
                np.resize([True, False], size),
                # No element should be set
                np.resize([False], size)
        ]:

            # Test a variety of different numbers as content
            for item in [
                    2.0, np.nan,
                    np.finfo(np.float).max,
                    np.finfo(np.float).min
            ]:
                # Test numpy arrays, lists and tuples as the input to be
                # broadcast
                for arr in [np.array([item]), [item], (item, )]:
                    data = np.arange(size, dtype=float)
                    s = Series(data)
                    s[selection] = arr
                    # Construct the expected series by taking the source
                    # data or item based on the selection
                    expected = Series([
                        item if use_item else data[i]
                        for i, use_item in enumerate(selection)
                    ])
                    assert_series_equal(s, expected)

                    s = Series(data)
                    result = s.where(~selection, arr)
                    assert_series_equal(result, expected)
Example #43
0
def test_where_ndframe_align():
    """A raw condition of the wrong length raises; a Series one is aligned.

    Passing a list or ndarray whose length differs from the Series must
    raise ValueError, while wrapping the same condition in a Series
    succeeds and yields NaN for every position not selected.
    """
    ser = Series([1, 2, 3])
    shape_error = "Array conditional must be same shape as self"

    # (raw condition, result expected once it is wrapped in a Series)
    cases = [
        ([True], [1, np.nan, np.nan]),
        (np.array([False, True, False, True]), [np.nan, 2, np.nan]),
    ]
    for raw_cond, expected_values in cases:
        with pytest.raises(ValueError, match=shape_error):
            ser.where(raw_cond)

        aligned = ser.where(Series(raw_cond))
        tm.assert_series_equal(aligned, Series(expected_values))
Example #44
0
def test_where_ndframe_align():
    """Mismatched array conditions are rejected, Series conditions align.

    A too-short list and a too-long ndarray both raise ValueError when
    passed directly to ``where``; converted to a Series first, each is
    accepted and unselected positions come back as NaN.
    """
    expected_message = "Array conditional must be same shape as self"
    base = Series([1, 2, 3])

    # condition shorter than the Series
    too_short = [True]
    with pytest.raises(ValueError, match=expected_message):
        base.where(too_short)
    tm.assert_series_equal(base.where(Series(too_short)),
                           Series([1, np.nan, np.nan]))

    # condition longer than the Series
    too_long = np.array([False, True, False, True])
    with pytest.raises(ValueError, match=expected_message):
        base.where(too_long)
    tm.assert_series_equal(base.where(Series(too_long)),
                           Series([np.nan, 2, np.nan]))
Example #45
0
 def test_where_new_category_raises(self):
     """where() must refuse a fill value that is not an existing category."""
     cat_ser = Series(Categorical(["a", "b", "c"]))
     keep = [True, False, True]
     with pytest.raises(
             ValueError,
             match="Cannot setitem on a Categorical with a new category"):
         # "d" is not one of the categories a, b, c
         cat_ser.where(keep, "d")
Example #46
0
def test_where_unsafe():
    """Exercise boolean-mask assignment and where() across dtype changes.

    Covers same-dtype list assignment, assignments that upcast to
    float64, assignments expected to raise, length mismatches between
    mask and value, where() with NaN, and assigning None to float data.
    """
    # unsafe dtype changes
    # assigning integers through the mask must leave every dtype unchanged
    for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16,
                  np.float32, np.float64]:
        s = Series(np.arange(10), dtype=dtype)
        mask = s < 5
        s[mask] = lrange(2, 7)
        expected = Series(lrange(2, 7) + lrange(5, 10), dtype=dtype)
        assert_series_equal(s, expected)
        assert s.dtype == expected.dtype

    # these are allowed operations, but are upcasted
    for dtype in [np.int64, np.float64]:
        s = Series(np.arange(10), dtype=dtype)
        mask = s < 5
        values = [2.5, 3.5, 4.5, 5.5, 6.5]
        s[mask] = values
        expected = Series(values + lrange(5, 10), dtype='float64')
        assert_series_equal(s, expected)
        assert s.dtype == expected.dtype

    # GH 9731
    # same upcast, this time with the mask selecting the tail
    s = Series(np.arange(10), dtype='int64')
    mask = s > 5
    values = [2.5, 3.5, 4.5, 5.5]
    s[mask] = values
    expected = Series(lrange(6) + values, dtype='float64')
    assert_series_equal(s, expected)

    # can't do these as we are forced to change the itemsize of the input
    # to something we cannot
    for dtype in [np.int8, np.int16, np.int32, np.float16, np.float32]:
        s = Series(np.arange(10), dtype=dtype)
        mask = s < 5
        values = [2.5, 3.5, 4.5, 5.5, 6.5]
        # note: the key passed to __setitem__ is the mask converted to a tuple
        pytest.raises(Exception, s.__setitem__, tuple(mask), values)

    # GH3235
    s = Series(np.arange(10), dtype='int64')
    mask = s < 5
    s[mask] = lrange(2, 7)
    expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64')
    assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype='int64')
    mask = s > 5
    s[mask] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(s, expected)

    # the mask selects four elements, so five values must be rejected
    s = Series(np.arange(10))
    mask = s > 5

    def f():
        s[mask] = [5, 4, 3, 2, 1]

    pytest.raises(ValueError, f)

    def f():
        s[mask] = [0] * 5

    pytest.raises(ValueError, f)

    # dtype changes
    # replacing with NaN turns the integer series into a float one
    s = Series([1, 2, 3, 4])
    result = s.where(s > 2, np.nan)
    expected = Series([np.nan, np.nan, 3, 4])
    assert_series_equal(result, expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    result = s[8]
    assert isna(result)

    # None assigned through a boolean mask is read back as NaN
    s = Series(range(10)).astype(float)
    s[s > 8] = None
    result = s[isna(s)]
    expected = Series(np.nan, index=[9])
    assert_series_equal(result, expected)