Beispiel #1
0
    def test_pad_nan(self):
        x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float)

        x.fillna(method="pad", inplace=True)

        expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float)
        assert_series_equal(x[1:], expected[1:])
        self.assertTrue(np.isnan(x[0]), np.isnan(expected[0]))
Beispiel #2
0
    def test_fillna_bug(self):
        x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"])
        filled = x.fillna(method="ffill")
        expected = Series([nan, 1.0, 1.0, 3.0, 3.0], x.index)
        assert_series_equal(filled, expected)

        filled = x.fillna(method="bfill")
        expected = Series([1.0, 1.0, 3.0, 3.0, nan], x.index)
        assert_series_equal(filled, expected)
Beispiel #3
0
    def test_fillna(self):
        """Check argument-less, backfill and constant-value ``fillna`` on a dated Series.

        The expected arrays show that a bare ``fillna()`` call is expected to
        carry the previous value forward.
        """
        series = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

        # Nothing is missing yet, so filling must be a no-op.
        unchanged = series.fillna()
        self.assert_(np.array_equal(series, unchanged))

        series[2] = np.NaN

        # (keyword arguments for fillna, expected values)
        scenarios = (
            ({}, [0.0, 1.0, 1.0, 3.0, 4.0]),
            ({"method": "backfill"}, [0.0, 1.0, 3.0, 3.0, 4.0]),
            ({"value": 5}, [0.0, 1.0, 5.0, 3.0, 4.0]),
        )
        for fill_kwargs, wanted in scenarios:
            self.assert_(np.array_equal(series.fillna(**fill_kwargs), wanted))
Beispiel #4
0
    def test_fill_value_when_combine_const(self):
        # GH12723
        s = Series([0, 1, np.nan, 3, 4, 5])

        exp = s.fillna(0).add(2)
        res = s.add(2, fill_value=0)
        assert_series_equal(res, exp)
Beispiel #5
0
    def test_endswith(self):
        """Exercise ``str.endswith`` on plain, mixed-type and unicode data, with and without ``na``."""
        # Shared expectation for the "foo" suffix on both string flavours.
        foo_flags = [False, NA, False, False, True, NA, True]

        # plain strings
        plain = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"])
        plain_result = plain.str.endswith("foo")
        tm.assert_series_equal(plain_result, Series(foo_flags))

        # mixed: the expectation puts NA at every non-string position
        mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]
        via_function = strings.str_endswith(mixed, "f")
        mixed_flags = [False, NA, False, NA, NA, False, NA, NA, NA]
        tm.assert_almost_equal(via_function, mixed_flags)

        via_accessor = Series(mixed).str.endswith("f")
        self.assert_(isinstance(via_accessor, Series))
        tm.assert_almost_equal(via_accessor, mixed_flags)

        # unicode
        uni = Series([u"om", NA, u"foo_nom", u"nom", u"bar_foo", NA, u"foo"])
        uni_result = uni.str.endswith("foo")
        exp = Series(foo_flags)
        tm.assert_series_equal(uni_result, exp)

        # with na=False the missing entries are expected to come back as False
        uni_result = uni.str.endswith("foo", na=False)
        tm.assert_series_equal(uni_result, exp.fillna(False).astype(bool))
def count_enf_born(info_child, index):
    """Total the already-born children per parent, laid out on ``index``.

    A row contributes ``nb_enf`` children when its ``age_enf`` is >= 0 and
    nothing otherwise. That per-row count is written back onto
    ``info_child`` as the ``enf_born`` column (the caller's frame is
    modified), then summed by ``id_parent``.

    Returns a Series that starts as zeros over ``index``, has the
    per-parent totals added to it, and has remaining NaN replaced by 0.
    """
    is_born = info_child["age_enf"] >= 0
    info_child["enf_born"] = is_born * info_child["nb_enf"]

    # groupby already yields a Series indexed by id_parent; only its name
    # has to be changed to the "nb_born" label.
    totals = info_child.groupby(["id_parent"])["enf_born"].sum()
    totals.name = "nb_born"

    nb_born = Series(zeros(len(index)), index=index)
    # Deliberately an augmented assignment, not ``nb_born = nb_born + totals``.
    nb_born += totals
    return nb_born.fillna(0)
Beispiel #7
0
    def test_fillna_inplace(self):
        x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"])
        y = x.copy()

        y.fillna(value=0, inplace=True)

        expected = x.fillna(value=0)
        assert_series_equal(y, expected)
Beispiel #8
0
    def test_fillna_nat(self):
        series = Series([0, 1, 2, NaT], dtype="M8[us]")

        filled = series.fillna(method="pad")
        filled2 = series.fillna(value=series[2])

        expected = series.copy()
        expected[3] = expected[2]

        assert_series_equal(filled, expected)
        assert_series_equal(filled2, expected)

        df = DataFrame({"A": series})
        filled = df.fillna(method="pad")
        filled2 = df.fillna(value=series[2])
        expected = DataFrame({"A": expected})
        assert_frame_equal(filled, expected)
        assert_frame_equal(filled2, expected)
Beispiel #9
0
    def test_datetime64_fillna(self):
        """fillna / ffill / bfill on a datetime64 Series holding one missing value.

        Covers filling with a Timestamp, filling with NaT (a no-op), forward
        and backward fill, and backfilling a Series built from NaT plus a
        datetime string (GH 6587).
        """

        s = Series([Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01")])
        s[2] = np.nan

        # reg fillna
        result = s.fillna(Timestamp("20130104"))
        expected = Series(
            [Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130104"), Timestamp("20130103 9:01:01")]
        )
        assert_series_equal(result, expected)

        # Filling with NaT must leave the Series as it was. The public
        # ``pd.NaT`` is used (as further down in this test) instead of
        # reaching into the private ``pandas.tslib`` module.
        result = s.fillna(pd.NaT)
        expected = s
        assert_series_equal(result, expected)

        # ffill
        result = s.ffill()
        expected = Series(
            [Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130103 9:01:01")]
        )
        assert_series_equal(result, expected)

        # bfill
        result = s.bfill()
        expected = Series(
            [Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130103 9:01:01"), Timestamp("20130103 9:01:01")]
        )
        assert_series_equal(result, expected)

        # GH 6587
        # make sure that we are treating as integer when filling
        # this also tests inference of a datetime-like with NaT's
        s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"])
        expected = Series(
            ["2013-08-05 15:30:00.000001", "2013-08-05 15:30:00.000001", "2013-08-05 15:30:00.000001"], dtype="M8[ns]"
        )
        result = s.fillna(method="backfill")
        assert_series_equal(result, expected)
Beispiel #10
0
def main():
    """
    Demonstrate handling of missing (NA) values: detecting, dropping and filling.

    Python 2 script; every step prints its result to stdout.
    """

    # A string Series with one missing entry, then its null mask.
    string_data = Series(["aardvark", "artichoke", np.nan, "avocado"])
    print string_data
    print string_data.isnull()
    # Overwrite the first entry with None and print the null mask again.
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan  # None and np.nan are neither identical nor equal

    # Exclude N/A
    print "", ""
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    # Two ways of keeping only the non-missing entries.
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([[1.0, 6.5, 3.0], [1.0, NA, NA], [NA, NA, NA], [NA, 6.5, 3.0]])
    cleaned = data.dropna()  # keep only the rows in which no value is NA
    print data
    print cleaned
    # how="all": drop only the rows that are NA in every column.
    print data.dropna(how="all")
    # Add a column 4 that is entirely missing, then drop all-NA columns.
    data[4] = None
    print data.dropna(axis=1, how="all")
    print data.dropna(thresh=2)  # keep rows holding at least 2 non-NA values

    # Fill NA
    print "", ""
    print data.fillna(0)
    # A dict supplies a separate fill value per column label.
    print data.fillna({1: 0.5, 2: -1})
    # inplace=True modifies `data` itself; the call's return value is thrown away via `_`.
    _ = data.fillna(0, inplace=True)
    print data
    print "", ""
    df = DataFrame(np.arange(18).reshape((6, 3)))
    # Blank out the tail of columns 1 and 2 (rows from 2 on, and rows from 4 on).
    df.ix[2:, 1] = NA
    df.ix[4:, 2] = NA
    print df
    # Forward fill, first without a limit and then capped by limit=2.
    print df.fillna(method="ffill")
    print df.fillna(method="ffill", limit=2)
    # Fill the gaps with the mean of the Series.
    data = Series([1.0, NA, 3.5, NA, 7])
    print data.fillna(data.mean())
Beispiel #11
0
    def test_comparison_operators_with_nas(self):
        s = Series(bdate_range("1/1/2000", periods=10), dtype=object)
        s[::2] = np.nan

        # test that comparisons work
        ops = ["lt", "le", "gt", "ge", "eq", "ne"]
        for op in ops:
            val = s[5]

            f = getattr(operator, op)
            result = f(s, val)

            expected = f(s.dropna(), val).reindex(s.index)

            if op == "ne":
                expected = expected.fillna(True).astype(bool)
            else:
                expected = expected.fillna(False).astype(bool)

            assert_series_equal(result, expected)

            # fffffffuuuuuuuuuuuu
            # result = f(val, s)
            # expected = f(val, s.dropna()).reindex(s.index)
            # assert_series_equal(result, expected)

            # boolean &, |, ^ should work with object arrays and propagate NAs

        ops = ["and_", "or_", "xor"]
        mask = s.isnull()
        for bool_op in ops:
            f = getattr(operator, bool_op)

            filled = s.fillna(s[0])

            result = f(s < s[9], s > s[3])

            expected = f(filled < filled[9], filled > filled[3])
            expected[mask] = False
            assert_series_equal(result, expected)
def _forward_to_pandas_function(public_name, function_name):
    """Build a method that calls ``pd.<function_name>(self.x, ...)`` and re-wraps the result."""

    def method(self, *args, **kwargs):
        # Resolved on every call, so a pandas build lacking the function
        # only fails when this particular method is used.
        function = getattr(pd, function_name)
        return MySeries(function(self.x, *args, **kwargs))

    method.__name__ = public_name
    return method


def _forward_to_series_method(method_name):
    """Build a method that calls ``self.x.<method_name>(...)`` and re-wraps the result."""

    def method(self, *args, **kwargs):
        return MySeries(getattr(self.x, method_name)(*args, **kwargs))

    method.__name__ = method_name
    return method


class MySeries:
    """Wrapper around a pandas ``Series`` whose operations return ``MySeries``.

    The wrapped Series is kept in ``x``; ``values`` and ``index`` are
    captured from it at construction time. Every forwarding method accepts
    arbitrary positional and keyword arguments, hands them to the
    underlying pandas callable and passes the result to ``MySeries``.
    The forwarding methods are attached right after the class body.
    """

    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index


# MySeries method name -> module-level pandas function it forwards to.
_PANDAS_FUNCTIONS = {
    "rolling_mean": "rolling_mean",
    "rolling_count": "rolling_count",
    "rolling_sum": "rolling_sum",
    "rolling_median": "rolling_median",
    "rolling_min": "rolling_min",
    "rolling_max": "rolling_max",
    "rolling_std": "rolling_std",
    "rolling_var": "rolling_var",
    "rolling_skew": "rolling_skew",
    # The legacy pandas function is named rolling_kurt; there is no
    # pd.rolling_kurtosis, so forwarding under the public name would fail.
    "rolling_kurtosis": "rolling_kurt",
    "rolling_window": "rolling_window",
}

# Series methods forwarded under their own name.
_SERIES_METHODS = (
    "cumprod",
    "cumsum",
    "diff",
    "div",
    "mul",
    "add",
    "dropna",
    "fillna",
    "floordiv",
    "mod",
    "nlargest",
    "nonzero",
    "nsmallest",
    "pow",
    "rank",
    "round",
    "shift",
    "sub",
    "abs",
    "clip",
    "clip_lower",
    "clip_upper",
    "interpolate",
    "resample",
    "replace",
)


def _install_forwarders():
    """Attach every forwarding method to ``MySeries``."""
    for public_name, function_name in _PANDAS_FUNCTIONS.items():
        setattr(MySeries, public_name, _forward_to_pandas_function(public_name, function_name))
    for method_name in _SERIES_METHODS:
        setattr(MySeries, method_name, _forward_to_series_method(method_name))


_install_forwarders()
__author__ = "ryu"

import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import Series, DataFrame

# Python 2 walkthrough of missing-data handling followed by a two-level index demo.

# A string Series with one NaN; the first entry is then overwritten with None.
string_data = Series(["aardvark", "artichoke", np.nan, "avocado"])
string_data[0] = None
print string_data.isnull()
# The return value of this call is not stored.
string_data.fillna(0)

data = DataFrame([[1.0, 6.5, 3.0], [1.0, NA, NA], [NA, NA, NA], [NA, 6.5, 3.0]])
# The results of the next two calls are not stored either.
data.dropna()
data.dropna(axis=1, how="all")
df = DataFrame(np.random.randn(7, 3))
# Blank out the leading rows of column 1.
df.ix[:4, 1] = NA
df.dropna(thresh=2)  # thresh: minimum number of non-NA values a row needs to be kept

# A dict supplies a fill value per column label.
# NOTE(review): df is built with three columns (0-2), so key 3 presumably matches nothing - confirm.
df.fillna({1: 0.5, 3: -1})
df.fillna(method="bfill", limit=2)


# Series indexed by two parallel label lists (letters and numbers).
data = Series(
    np.random.randn(10), index=[["a", "a", "a", "b", "b", "b", "c", "c", "d", "d"], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]
)
print data.index
# Selections by outer label, outer-label slice, list of outer labels, and inner label 2.
print data["b"], data["b":"c"], data.ix[["b", "d"]], data[:, 2]
print data.unstack()

Beispiel #14
0
 def test_bfill(self):
     """``Series.bfill()`` must agree with ``fillna(method="bfill")``."""
     series = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))
     series[2] = np.NaN

     via_shortcut = series.bfill()
     via_fillna = series.fillna(method="bfill")
     assert_series_equal(via_shortcut, via_fillna)
Beispiel #15
0
# Handling missing data
package = Series(["pandas", "numpy", "matplotlib", np.nan])
print package.isnull()
# Python's built-in None is also treated as NA

data = Series([1, np.nan, 2, np.nan, 3])
print data.dropna()
print data[data.notnull()]

# For a DataFrame, dropna by default drops every row that contains a missing value
data = DataFrame(np.arange(16).reshape(4, 4))
data[(data / 2 - 1) % 3 == 0] = np.nan
# Set the whole first row to NaN
data.ix[0] = np.nan
print data.dropna()
# Drop only the rows that are entirely NA
print data.dropna(how="all")
# Set the column labelled 3 to NA
data[3] = np.nan
# axis=1 drops columns instead of rows
print data.dropna(axis=1, how="all")

# Author's question: does slicing on a DataFrame really include the end element?
print data.ix[:4, 1], data[0:4]
# Keep part of the data for inspection (thresh=3)
print data.dropna(thresh=3)

# fillna assigns values to the missing entries
print data.fillna(0)
# A dict gives a separate fill value per column label
print data.fillna({1: 2, 2: 2.1})
Beispiel #16
0
    def test_fillna(self):
        """Broad ``Series.fillna`` coverage.

        Exercises fill methods, constant values, invalid argument
        combinations, Series/dict fill values (GH 5703), ``limit`` and
        string fill values on a float Series (GH 9043).
        """
        ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

        # Nothing is missing yet, so a forward fill changes nothing.
        self.assert_numpy_array_equal(ts, ts.fillna(method="ffill"))

        ts[2] = np.NaN

        # (keyword arguments for fillna, expected values)
        basic_cases = (
            ({"method": "ffill"}, [0.0, 1.0, 1.0, 3.0, 4.0]),
            ({"method": "backfill"}, [0.0, 1.0, 3.0, 3.0, 4.0]),
            ({"value": 5}, [0.0, 1.0, 5.0, 3.0, 4.0]),
        )
        for fill_kwargs, wanted in basic_cases:
            self.assert_numpy_array_equal(ts.fillna(**fill_kwargs), wanted)

        # Neither "no argument at all" nor "value and method together" is accepted.
        self.assertRaises(ValueError, ts.fillna)
        self.assertRaises(ValueError, self.ts.fillna, value=0, method="ffill")

        # GH 5703
        nan_only = Series([np.nan])
        one = Series([1])

        got = nan_only.fillna(one)
        filled_one = Series([1.0])
        assert_series_equal(got, filled_one)

        # Fill values that carry nothing for label 0 leave the NaN in place.
        assert_series_equal(nan_only.fillna({}), nan_only)
        assert_series_equal(nan_only.fillna(Series(())), nan_only)
        # A Series without missing values comes back unchanged.
        assert_series_equal(one.fillna(nan_only), one)
        assert_series_equal(nan_only.fillna({0: 1}), filled_one)
        assert_series_equal(nan_only.fillna({1: 1}), Series([np.nan]))
        assert_series_equal(nan_only.fillna({0: 1, 1: 1}), filled_one)
        assert_series_equal(nan_only.fillna(Series({0: 1, 1: 1})), filled_one)
        assert_series_equal(nan_only.fillna(Series({0: 1, 1: 1}, index=[4, 5])), nan_only)

        # Fill values are matched by label, not by position.
        donor = Series([0, 1, 2], list("abc"))
        gappy = Series([0, np.nan, 2], list("bac"))
        got = gappy.fillna(donor)
        assert_series_equal(got, Series([0, 0, 2.0], list("bac")))

        # limit
        all_nan = Series(np.nan, index=[0, 1, 2])
        limit_cases = (
            (1, [999, np.nan, np.nan]),
            (2, [999, 999, np.nan]),
        )
        for limit, wanted in limit_cases:
            got = all_nan.fillna(999, limit=limit)
            assert_series_equal(got, Series(wanted, index=[0, 1, 2]))

        # GH 9043
        # make sure a string representation of int/float values can be filled
        # correctly without raising errors or being converted
        for text in ("0", "1.5", "-0.3"):
            floats = Series([0, 1, np.nan, np.nan, 4], dtype="float64")
            got = floats.fillna(text)
            wanted = Series([0, 1, text, text, 4], dtype="object")
            assert_series_equal(got, wanted)
Beispiel #17
0
 def test_fillna_int(self):
     s = Series(np.random.randint(-100, 100, 50))
     s.fillna(method="ffill", inplace=True)
     assert_series_equal(s.fillna(method="ffill", inplace=False), s)
Beispiel #18
0
    def test_datetime64_tz_fillna(self):
        """fillna on naive and tz-aware datetime Series with naive, aware and non-datetime fill values.

        Each case is run once per time zone in the loop. The first half
        starts from a naive Series, the second half from a Series built
        from a tz-aware DatetimeIndex.
        """
        for tz in ["US/Eastern", "Asia/Tokyo"]:
            # DatetimeBlock
            # naive Series, naive fill value
            s = Series([Timestamp("2011-01-01 10:00"), pd.NaT, Timestamp("2011-01-03 10:00"), pd.NaT])
            result = s.fillna(pd.Timestamp("2011-01-02 10:00"))
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00"),
                    Timestamp("2011-01-02 10:00"),
                    Timestamp("2011-01-03 10:00"),
                    Timestamp("2011-01-02 10:00"),
                ]
            )
            self.assert_series_equal(expected, result)

            # naive Series, tz-aware fill value: the filled slots carry the tz
            result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz))
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00"),
                    Timestamp("2011-01-02 10:00", tz=tz),
                    Timestamp("2011-01-03 10:00"),
                    Timestamp("2011-01-02 10:00", tz=tz),
                ]
            )
            self.assert_series_equal(expected, result)

            # naive Series, string fill value: expected result is object dtype
            result = s.fillna("AAA")
            expected = Series(
                [Timestamp("2011-01-01 10:00"), "AAA", Timestamp("2011-01-03 10:00"), "AAA"], dtype=object
            )
            self.assert_series_equal(expected, result)

            # naive Series, dict fill mixing one aware and one naive value
            result = s.fillna({1: pd.Timestamp("2011-01-02 10:00", tz=tz), 3: pd.Timestamp("2011-01-04 10:00")})
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00"),
                    Timestamp("2011-01-02 10:00", tz=tz),
                    Timestamp("2011-01-03 10:00"),
                    Timestamp("2011-01-04 10:00"),
                ]
            )
            self.assert_series_equal(expected, result)

            # naive Series, dict fill with naive values only
            result = s.fillna({1: pd.Timestamp("2011-01-02 10:00"), 3: pd.Timestamp("2011-01-04 10:00")})
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00"),
                    Timestamp("2011-01-02 10:00"),
                    Timestamp("2011-01-03 10:00"),
                    Timestamp("2011-01-04 10:00"),
                ]
            )
            self.assert_series_equal(expected, result)

            # DatetimeBlockTZ
            # tz-aware Series, naive fill value: the filled slots stay naive
            idx = pd.DatetimeIndex(["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz)
            s = pd.Series(idx)
            result = s.fillna(pd.Timestamp("2011-01-02 10:00"))
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00", tz=tz),
                    Timestamp("2011-01-02 10:00"),
                    Timestamp("2011-01-03 10:00", tz=tz),
                    Timestamp("2011-01-02 10:00"),
                ]
            )
            self.assert_series_equal(expected, result)

            # tz-aware Series, fill value in the same tz: expected is a Series of a tz-aware index
            result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz))
            idx = pd.DatetimeIndex(
                ["2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00"], tz=tz
            )
            expected = Series(idx)
            self.assert_series_equal(expected, result)

            # same fill value, passed as a plain datetime via to_pydatetime()
            result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime())
            idx = pd.DatetimeIndex(
                ["2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00"], tz=tz
            )
            expected = Series(idx)
            self.assert_series_equal(expected, result)

            # tz-aware Series, string fill value: expected result is object dtype
            result = s.fillna("AAA")
            expected = Series(
                [Timestamp("2011-01-01 10:00", tz=tz), "AAA", Timestamp("2011-01-03 10:00", tz=tz), "AAA"], dtype=object
            )
            self.assert_series_equal(expected, result)

            # tz-aware Series, dict fill mixing one aware and one naive value
            result = s.fillna({1: pd.Timestamp("2011-01-02 10:00", tz=tz), 3: pd.Timestamp("2011-01-04 10:00")})
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00", tz=tz),
                    Timestamp("2011-01-02 10:00", tz=tz),
                    Timestamp("2011-01-03 10:00", tz=tz),
                    Timestamp("2011-01-04 10:00"),
                ]
            )
            self.assert_series_equal(expected, result)

            # tz-aware Series, dict fill with values in the same tz only
            result = s.fillna({1: pd.Timestamp("2011-01-02 10:00", tz=tz), 3: pd.Timestamp("2011-01-04 10:00", tz=tz)})
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00", tz=tz),
                    Timestamp("2011-01-02 10:00", tz=tz),
                    Timestamp("2011-01-03 10:00", tz=tz),
                    Timestamp("2011-01-04 10:00", tz=tz),
                ]
            )
            self.assert_series_equal(expected, result)

            # filling with a naive/other zone, coerce to object
            result = s.fillna(Timestamp("20130101"))
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00", tz=tz),
                    Timestamp("2013-01-01"),
                    Timestamp("2011-01-03 10:00", tz=tz),
                    Timestamp("2013-01-01"),
                ]
            )
            self.assert_series_equal(expected, result)

            # fill value in a different zone (US/Pacific) keeps its own tz
            result = s.fillna(Timestamp("20130101", tz="US/Pacific"))
            expected = Series(
                [
                    Timestamp("2011-01-01 10:00", tz=tz),
                    Timestamp("2013-01-01", tz="US/Pacific"),
                    Timestamp("2011-01-03 10:00", tz=tz),
                    Timestamp("2013-01-01", tz="US/Pacific"),
                ]
            )
            self.assert_series_equal(expected, result)
Beispiel #19
0
# Interactive-session style fragment: bare expressions are evaluated and their values discarded.
# `summ` is not defined in this excerpt - presumably a summary table built earlier; confirm.
summ["mean"]

# Dropping a label that occurs twice in the index.
s1 = Series(arange(1.0, 6), index=["a", "a", "b", "c", "d"])
s1
s1.drop("a")

# Adding two Series whose indexes share only the label "c", then dropping the missing results.
s1 = Series(arange(1.0, 4.0), index=["a", "b", "c"])
s2 = Series(arange(1.0, 4.0), index=["c", "d", "e"])
s3 = s1 + s2
s3
s3.dropna()

# The same sum again, this time replacing the missing results with -1.0.
s1 = Series(arange(1.0, 4.0), index=["a", "b", "c"])
s2 = Series(arange(1.0, 4.0), index=["c", "d", "e"])
s3 = s1 + s2
s3.fillna(-1.0)

# DataFrame from a 2x2 array with no labels given.
df = DataFrame(array([[1, 2], [3, 4]]))
df

# Column labels given at construction, or assigned afterwards through `columns`.
df = DataFrame(array([[1, 2], [3, 4]]), columns=["a", "b"])
df
df = DataFrame(array([[1, 2], [3, 4]]))
df.columns = ["dogs", "cats"]
df

# Both column and row labels given at construction.
df = DataFrame(array([[1, 2], [3, 4]]), columns=["dogs", "cats"], index=["Alice", "Bob"])
df

# Structured dtype with two fields, and a one-element zero array of that dtype.
t = dtype([("datetime", "O8"), ("value", "f4")])
x = zeros(1, dtype=t)
Beispiel #20
0
    return {"min": group.min(), "max": group.max(), "count": group.count(), "mean": group.mean()}


# Per-group statistics of data2, first grouped by `factor`, then by deciles of data1.
grouped = frame.data2.groupby(factor)
print(grouped)

stats_by_factor = grouped.apply(get_stats)
print(stats_by_factor.unstack())

grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
stats_by_decile = grouped.apply(get_stats)
print(stats_by_decile.unstack())

# Replace the missing entries of a Series with its own mean.
s = Series(np.random.randn(6))
s[::2] = np.nan
print(s)
series_mean = s.mean()
print(s.fillna(series_mean))

# Group-wise filling: four "East" states followed by four "West" states.
states = ["Ohio", "New York", "Vermont", "Florida", "Oregon", "Nevada", "California", "Idaho"]
grouped_key = ["East"] * 4 + ["West"] * 4
data = Series(np.random.randn(8), index=states)
data[["Vermont", "Nevada", "Idaho"]] = np.nan
print(data)
print(data.groupby(grouped_key).mean())


def fill_mean(g):
    """Fill the missing entries of group ``g`` with that group's mean."""
    return g.fillna(g.mean())


print(data.groupby(grouped_key).apply(fill_mean))

fill_values = {"East": 0.5, "West": -1}


def fill_func(g):
    """Fill the missing entries of group ``g`` with the constant stored under its name."""
    return g.fillna(fill_values[g.name])


print(data.groupby(grouped_key).apply(fill_func))
Beispiel #21
0
class StatFileClass:
    global FinalSummary

    def __init__(self, temp=0):
        self.FileName = temp
        self.FileNameHead = self.FileName[0 : self.FileName.find(".")]
        self.BankName = self.FileName[0 : self.FileName.find("银行") + 2]
        if "美元" in temp:
            self.Currency = "USD"
        elif "日币" in temp:
            self.Currency = "JPY"
        else:
            self.Currency = "CNY"
        if "待核查" in temp:
            self.CountType = "待核查"
        elif "专户" in temp:
            self.CountType = "专户"
        else:
            self.CountType = "一般户"
        self.DateLable = StatRule1.ix[self.BankName, "交易日期字段"]
        self.TimeLable = StatRule1.ix[self.BankName, "交易时间字段"]
        self.TimeFormat = StatRule1.ix[self.BankName, "时间格式"].split(",")
        self.IncomeLable = StatRule1.ix[self.BankName, "收入字段"]
        self.PayLable = StatRule1.ix[self.BankName, "支出字段"]
        self.BalanceLable = StatRule1.ix[self.BankName, "当日余额字段"]
        self.KeyLable1 = StatRule1.ix[self.BankName, "大类字段"]
        self.KeyLable2 = StatRule1.ix[self.BankName, "子类字段"].split("+")  # 字符串list
        self.CountLable = StatRule1.ix[self.BankName, "户名字段"]
        self.SkipRows = StatRule1.ix[self.BankName, "数据开始行"] - 1
        self.ERateUSD = StatRule1.ix[self.BankName, "美元汇率"]
        self.ERateJPY = StatRule1.ix[self.BankName, "日元汇率"]

        # 计算汇率
        if self.Currency == "USD":
            self.ERate = self.ERateUSD
        elif self.Currency == "JPY":
            self.ERate = self.ERateJPY
        else:
            self.ERate = 1

        # 判断收入支出类型
        if self.IncomeLable != self.PayLable:  # 非中国银行,将收入和支出合并
            self.RawData = read_excel(
                self.FileName,
                skiprows=self.SkipRows,
                converters={self.IncomeLable: str, self.PayLable: str, self.BalanceLable: str},
            )
            self.IncomeData = self.RawData[self.IncomeLable].astype(float).fillna(0)
            self.PayData = self.RawData[self.PayLable].astype(float).fillna(0)
            self.IncomeData = self.IncomeData - self.PayData
        else:
            self.RawData = read_excel(
                self.FileName, skiprows=self.SkipRows, converters={self.IncomeLable: str, self.BalanceLable: str}
            )
            self.IncomeData = self.RawData[self.IncomeLable].astype(float)
        self.IncomeType = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        for i in range(self.IncomeData.shape[0]):
            if self.IncomeData[i] > 0:
                self.IncomeType[i] = "收入"
            else:
                self.IncomeType[i] = "支出"

        # 计算本币收入及余额
        self.IncomeDataLocal = self.IncomeData * self.ERate

        # 处理日期数据
        self.Date = self.RawData[self.DateLable].astype(str)  # 字符串直接转换为日期数据
        self.Time = self.RawData[self.TimeLable].astype(str)
        if len(self.TimeFormat) == 3:  # 如果有字符串长度参数则截取
            for i in self.Time.index:
                self.Time[i] = self.Time[i][int(self.TimeFormat[1]) - 1 : int(self.TimeFormat[2])]
        for i in self.Time.index:
            self.Time[i] = datetime.strptime(self.Time[i], self.TimeFormat[0]).strftime(
                "%H:%M:%S"
            )  # 按照格式处理为时间数据,再转化为格式化的字符串
        # 获取大类数据
        self.KeyWord1 = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        if self.KeyLable1 == "无":
            self.KeyWord1[:] = "无"
        else:
            self.KeyWord1 = self.RawData[self.KeyLable1]

        # 获取交易户名数据
        self.CountName = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        self.CountData = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        if self.CountLable == "无":
            self.CountName[:] = "无"
            self.CountData[:] = "无"
        else:
            self.CountData = self.RawData[self.CountLable]
            self.CountData.fillna("无", inplace=True)
            for i in range(self.IncomeData.shape[0]):
                TempData = list(
                    set(
                        list(
                            zip(
                                *list(
                                    ClassifyRuleDF.ix[self.BankName].ix[self.IncomeType[i]].ix[self.KeyWord1[i]].index
                                )
                            )
                        )[0]
                    )
                )  # 获取户名的唯一值的list
                if len(TempData) == 1:  # 子类字段只有一个
                    self.CountName[i] = TempData[0]
                else:
                    bFindResult = False
                    for j in TempData:
                        if j in self.CountData[i]:
                            self.CountName[i] = j
                            bFindResult = True
                            break
                    if not bFindResult:
                        self.CountName[i] = "无"

        # 获取子类数据并分类
        self.KeyWord2 = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        self.KeyData2 = self.RawData[self.KeyLable2]
        self.KeyData2.fillna(" ", inplace=True)  # 由于有多个关键字段,空值不能赋为'无',而是空格
        self.KeyData2 = self.KeyData2.apply(JoinStr, axis=1)
        self.ClassifyResult = Series(zeros(self.IncomeData.shape[0]))  # 初始化
        for i in range(self.KeyData2.shape[0]):
            try:
                TempData = list(
                    set(
                        list(
                            ClassifyRuleDF.ix[self.BankName]
                            .ix[self.IncomeType[i]]
                            .ix[self.KeyWord1[i]]
                            .ix[self.CountName[i]]
                            .index
                        )
                    )
                )
                if len(TempData) == 1:  # 子类字段只有一个
                    self.KeyWord2[i] = TempData[0]
                else:
                    TempKeyWord = [
                        m.split("+") for m in TempData
                    ]  # 按分隔符分割关键字,[['银票托收'],['销售收入'],['BEPS'],['BEPS','网吧']]
                    TempKeyWord.sort(
                        key=lambda x: len(x), reverse=True
                    )  # 按关键字个数排序;关键字越多,排序越靠前,[['BEPS','网吧'],['银票托收'],['销售收入'],['BEPS']]
                    bFindResult = False
                    for j in TempKeyWord:  # j = ['BEPS','网吧']
                        bFindResult2 = True
                        for k in j:  # k = 'BEPS'
                            # if k not in str(list(self.KeyData2.ix[i])):   #只要有一个关键字不匹配,则放弃搜索该关键字
                            if k not in self.KeyData2.ix[i]:
                                bFindResult2 = False
                                break
                        if bFindResult2:  # 全部关键字匹配,则认为匹配成功
                            self.KeyWord2[i] = "+".join(j)  # 用+号重新连接为表中的关键字
                            bFindResult = True
                            break
                    if not bFindResult:
                        self.KeyWord2[i] = "无"
                if self.KeyWord2[i] in TempData:
                    self.ClassifyResult[i] = (
                        ClassifyRuleDF.ix[self.BankName]
                        .ix[self.IncomeType[i]]
                        .ix[self.KeyWord1[i]]
                        .ix[self.CountName[i]]
                        .ix[self.KeyWord2[i]]
                    )
                    if type(self.ClassifyResult[i]) != str:  # 如果出现多个分类结果,取第一个;
                        self.ClassifyResult[i] = self.ClassifyResult[i].ix[0]
                else:
                    self.ClassifyResult[i] = "分类错误"
            except (Exception) as e:
                print(e, ", 分类错误")
                self.ClassifyResult[i] = "分类错误"

        # ==============================================================================
        #         #产生当日分类汇总
        #         self.Summary = self.IncomeData.copy()
        #         self.Summary.index = [self.Date,self.IncomeType,self.ClassifyResult]
        #         TempIndex = set(list(self.Summary.index)) #合并日期、收入类型、分类结果都相同的项
        #         TempIncome = array([self.Summary.ix[i].sum() for i in TempIndex])
        #         self.Summary = DataFrame(TempIncome, columns  = ['原币收入'])
        #         TempIncomeLocal = TempIncome
        #         if self.Currency == 'USD':
        #             TempIncomeLocal = TempIncome * self.ERateUSD
        #         elif self.Currency == 'JPY':
        #             TempIncomeLocal *= TempIncome * self.ERateJPY
        #         self.Summary['本币收入'] = TempIncomeLocal
        #         self.Summary['银行名称'] = self.BankName
        #         self.Summary['账户类型'] = self.CountType
        #         self.Summary['币种'] = self.Currency
        #         TempIndex = list(zip(*list(TempIndex)))
        #         self.Summary['交易日期'] = TempIndex[0]
        #         self.Summary['收支类型'] = TempIndex[1]
        #         self.Summary['分类结果'] = TempIndex[2]
        #
        # ==============================================================================
        # 结果输出
        self.ResultDF = concat(
            [
                self.Date,
                self.Time,
                self.IncomeData,
                self.IncomeDataLocal,
                self.IncomeType,
                self.KeyWord1,
                self.CountName,
                self.KeyWord2,
                self.ClassifyResult,
            ],
            axis=1,
        )
        self.ResultDF.columns = ["交易日期", "交易时间", "收入", "本币收入", "收支类型", "大类", "对方户名", "子类", "分类结果"]
        self.ResultDF["银行名称"] = self.BankName
        self.ResultDF["账户类型"] = self.CountType
        self.ResultDF["币种"] = self.Currency

        self.ResultDF2 = concat(
            [
                self.Date,
                self.Time,
                self.IncomeType,
                self.IncomeData,
                self.IncomeDataLocal,
                self.KeyWord1,
                self.CountData,
                self.KeyData2,
                self.ClassifyResult,
            ],
            axis=1,
        )
        self.ResultDF2.columns = ["交易日期", "交易时间", "收支类型", "交易原币金额", "交易本币金额", "大类", "对方户名", "子类", "分类结果"]
        self.ResultDF2["银行名称"] = self.FileNameHead
        self.ResultDF2["汇率"] = self.ERate
        self.ResultDF2["币种"] = self.Currency
        self.ResultDF2 = self.ResultDF2.reindex(
            columns=["银行名称", "交易日期", "交易时间", "收支类型", "币种", "交易原币金额", "汇率", "交易本币金额", "大类", "子类", "分类结果", "对方户名"]
        )

        # 单日余额汇总
        # 余额数据导入
        self.BalanceData = self.RawData[self.BalanceLable]
        self.BalanceData = DataFrame([self.Date, self.BalanceData], index=["交易日期", "余额"]).T
        self.BalanceData = self.BalanceData.groupby(["交易日期"]).last()  # 取每天的最后一笔交易的余额数据
        self.BalanceData = self.BalanceData.applymap(RemoveComma)  # 对每个元素去除逗号
        self.BalanceData = self.BalanceData.astype(float)
        # 计算本币余额
        self.BalanceDataLocal = self.BalanceData * self.ERate
        self.BalanceData["本币余额"] = self.BalanceDataLocal
        self.BalanceData["银行名称"] = self.BankName
        self.BalanceData["账户类型"] = self.CountType
        self.BalanceData["币种"] = self.Currency
        self.BalanceData["交易日期"] = self.BalanceData.index
        self.BalanceData = self.BalanceData.reindex(columns=["交易日期", "余额", "本币余额", "银行名称", "账户类型", "币种"])
        self.BalanceData.index = self.BalanceData["交易日期"].map(ReturnDate)
Beispiel #22
0
    def test_datetime64_tz_fillna(self):
        """Check ``Series.fillna`` on datetime data, both naive and tz-aware.

        For each of two time zones, a naive datetime Series and a tz-aware
        one (each holding NaT at positions 1 and 3) are filled with scalar
        timestamps, a plain string and per-position dicts.  After every fill
        the result is compared with a hand-built expectation, and the source
        Series is checked to still contain its NaT values.
        """
        for tz in ["US/Eastern", "Asia/Tokyo"]:
            # Positions 1 and 3 are NaT in both source Series built below.
            nat_mask = pd.Series([False, True, False, True])

            def verify(want, got, source):
                # The fill must match the expectation and must not have
                # modified the Series it was called on.
                self.assert_series_equal(want, got)
                self.assert_series_equal(pd.isnull(source), nat_mask)

            # Timestamps reused throughout, in naive and tz-aware flavours.
            naive_d1 = Timestamp("2011-01-01 10:00")
            naive_d2 = Timestamp("2011-01-02 10:00")
            naive_d3 = Timestamp("2011-01-03 10:00")
            naive_d4 = Timestamp("2011-01-04 10:00")
            aware_d1 = Timestamp("2011-01-01 10:00", tz=tz)
            aware_d2 = Timestamp("2011-01-02 10:00", tz=tz)
            aware_d3 = Timestamp("2011-01-03 10:00", tz=tz)
            aware_d4 = Timestamp("2011-01-04 10:00", tz=tz)

            # ---- DatetimeBlock: naive source ----
            ser = Series([naive_d1, pd.NaT, naive_d3, pd.NaT])

            # naive scalar fill
            filled = ser.fillna(naive_d2)
            want = Series([naive_d1, naive_d2, naive_d3, naive_d2])
            verify(want, filled, ser)

            # tz-aware scalar fill
            filled = ser.fillna(aware_d2)
            want = Series([naive_d1, aware_d2, naive_d3, aware_d2])
            verify(want, filled, ser)

            # string fill, expected as object dtype
            filled = ser.fillna("AAA")
            want = Series([naive_d1, "AAA", naive_d3, "AAA"], dtype=object)
            verify(want, filled, ser)

            # dict fill mixing a tz-aware and a naive value
            filled = ser.fillna({1: aware_d2, 3: naive_d4})
            want = Series([naive_d1, aware_d2, naive_d3, naive_d4])
            verify(want, filled, ser)

            # dict fill with naive values only
            filled = ser.fillna({1: naive_d2, 3: naive_d4})
            want = Series([naive_d1, naive_d2, naive_d3, naive_d4])
            verify(want, filled, ser)

            # ---- DatetimeBlockTZ: tz-aware source ----
            dti = pd.DatetimeIndex(["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz)
            ser = pd.Series(dti)
            self.assertEqual(ser.dtype, "datetime64[ns, {0}]".format(tz))
            self.assert_series_equal(pd.isnull(ser), nat_mask)

            # naive scalar fill
            filled = ser.fillna(naive_d2)
            want = Series([aware_d1, naive_d2, aware_d3, naive_d2])
            verify(want, filled, ser)

            # scalar fill in the Series' own zone
            filled = ser.fillna(aware_d2)
            dti = pd.DatetimeIndex(
                ["2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00"], tz=tz
            )
            want = Series(dti)
            verify(want, filled, ser)

            # same fill value, passed as a stdlib datetime
            filled = ser.fillna(aware_d2.to_pydatetime())
            dti = pd.DatetimeIndex(
                ["2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00"], tz=tz
            )
            want = Series(dti)
            verify(want, filled, ser)

            # string fill, expected as object dtype
            filled = ser.fillna("AAA")
            want = Series([aware_d1, "AAA", aware_d3, "AAA"], dtype=object)
            verify(want, filled, ser)

            # dict fill mixing a tz-aware and a naive value
            filled = ser.fillna({1: aware_d2, 3: naive_d4})
            want = Series([aware_d1, aware_d2, aware_d3, naive_d4])
            verify(want, filled, ser)

            # dict fill with values in the Series' own zone
            filled = ser.fillna({1: aware_d2, 3: aware_d4})
            want = Series([aware_d1, aware_d2, aware_d3, aware_d4])
            verify(want, filled, ser)

            # filling with a naive/other zone, coerce to object
            filled = ser.fillna(Timestamp("20130101"))
            want = Series([aware_d1, Timestamp("2013-01-01"), aware_d3, Timestamp("2013-01-01")])
            verify(want, filled, ser)

            filled = ser.fillna(Timestamp("20130101", tz="US/Pacific"))
            want = Series(
                [
                    aware_d1,
                    Timestamp("2013-01-01", tz="US/Pacific"),
                    aware_d3,
                    Timestamp("2013-01-01", tz="US/Pacific"),
                ]
            )
            verify(want, filled, ser)
                    is True
                )
            except:
                assert all(original_data[sitecolumn] == tbl_subq_df["spatial_replication_level_1"]) is True

            print("site levels matched:", z)
            metadata_dict["site_levels"] = "pass"
        except Exception as e:
            print("site levels mismatched:", z)
            metadata_dict["site_levels"] = "fail"

        og_obs_data = Series(original_data[obs_dict["unitobs"][1][0]].copy())
        qu_obs_data = tbl_subq_df[data_type.replace("table", "observation")].copy()

        try:
            assert (og_obs_data.fillna("NaN", inplace=True) == qu_obs_data.fillna("NaN", inplace=True)) is True
            print("observations matched:", z)
            metadata_dict["observations"] = "pass"
        except Exception as e:
            print("observations mismatched:", z)
            metadata_dict["observations"] = "fail"

        taxa_info = dict(zip(list(taxa_dict.keys()), [x[1][0] for x in list(taxa_dict.values())]))

        # One last go for replacing values that could be mismatches
        original_data.replace({"NaN": "NA"}, inplace=True)
        original_data.replace({"-99999": "NA"}, inplace=True)
        original_data.replace({"-9999": "NA"}, inplace=True)
        original_data.replace({-99999: "NA"}, inplace=True)
        original_data.replace({-9999: "NA"}, inplace=True)
        original_data.fillna("NA", inplace=True)
# Demo script: forward-filling missing values, filling with the mean, and
# selecting from a Series with a two-level index.

# 6x3 frame of random values; NaN is written into the tail of columns 1 and 2.
df = DataFrame(np.random.randn(6, 3))
# `.ix` has been removed from pandas.  The frame uses the default integer
# index, so label-based `.loc` selects the same rows (2..5 and 4..5).
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA

print(df)
print("\n")
# `ffill()` is the direct spelling of the deprecated `fillna(method="ffill")`.
print(df.ffill())
print("\n")
# Forward-fill at most two consecutive missing values per column.
print(df.ffill(limit=2))
print("\n")

###############################################################

data = Series([1.0, NA, 3.5, NA, 7])

# Replace missing entries with the mean of the values that are present.
print(data.fillna(data.mean()))
print("\n")

###############################################################

# Series with a two-level index: outer letter key, inner integer key.
data = Series(
    np.random.randn(10), index=[["a", "a", "a", "b", "b", "b", "c", "c", "d", "d"], [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]
)


print(data)
print("\n")
print(data.index)
print("\n")
# Selecting on the outer key returns the rows stored under "b".
print(data["b"])
print("\n")