def test_column_dups2(self):
        """Exercise ``drop`` and ``dropna`` on frames with duplicated column labels.

        Part one selects columns positionally so that labels repeat, drops
        ``"C"`` and expects only the ``A, B, B`` columns to remain (GH 6240).
        Part two relabels a NaN-sprinkled frame as ``A, A, B, C`` and expects
        ``dropna(subset=["A", "C"], how="all")`` to keep the same rows as the
        reference computed before relabelling.
        """

        def make_frame():
            # Three float columns of random draws plus one string column.
            return DataFrame(
                {
                    "A": np.random.randn(5),
                    "B": np.random.randn(5),
                    "C": np.random.randn(5),
                    "D": ["a", "b", "c", "d", "e"],
                }
            )

        # drop buggy GH 6240
        base = make_frame()
        with_dups = base.take([2, 0, 1, 2, 1], axis=1)
        expected = base.take([0, 1, 1], axis=1)
        result = with_dups.drop("C", axis=1)
        assert_frame_equal(result, expected)

        # dropna
        frame = make_frame()
        frame.iloc[2, [0, 1, 2]] = np.nan  # row 2: every float column missing
        frame.iloc[0, 0] = np.nan
        frame.iloc[1, 1] = np.nan
        frame.iloc[:, 3] = np.nan  # the string column is missing everywhere

        # Reference result is computed while the labels are still unique.
        expected = frame.dropna(subset=["A", "B", "C"], how="all")
        expected.columns = ["A", "A", "B", "C"]

        frame.columns = ["A", "A", "B", "C"]
        result = frame.dropna(subset=["A", "C"], how="all")
        assert_frame_equal(result, expected)
Example #2
0
    def test_sort_index_multicolumn(self):
        """Compare multi-key ``sort_values`` against an ``np.lexsort`` ordering.

        Three scenarios are checked: ``["A", "B"]`` ascending, ``["A", "B"]``
        descending, and ``["B", "A"]`` ascending.  In each one the legacy
        ``sort_index(by=...)`` spelling is also called and must emit a
        ``FutureWarning`` (#9816).
        """
        import random

        col_a = np.arange(5).repeat(20)
        col_b = np.tile(np.arange(5), 20)
        random.shuffle(col_a)
        random.shuffle(col_b)
        frame = DataFrame({"A": col_a, "B": col_b, "C": np.random.randn(100)})

        def check(by, lexsort_keys, **sort_kwargs):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                frame.sort_index(by=by, **sort_kwargs)
            result = frame.sort_values(by=by, **sort_kwargs)
            # np.lexsort uses the LAST key in the tuple as the primary key.
            expected = frame.take(np.lexsort(lexsort_keys))
            assert_frame_equal(result, expected)

        check(["A", "B"], (frame["B"], frame["A"]))

        # Descending: ranking with ascending=False reverses each key's order.
        check(
            ["A", "B"],
            (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)),
            ascending=False,
        )

        check(["B", "A"], (frame["A"], frame["B"]))
Example #3
0
    def test_sort_index_different_sortorder(self):
        """Check sorting with a per-key direction list (``ascending=[1, 0]``).

        The expected ordering is built with ``np.lexsort`` and is reused for
        three checks: ``sort_values`` on the columns, ``sort_index`` on the
        equivalent MultiIndex frame, and ``sort_index`` on a Series taken
        from that frame.
        """
        shuffle = np.random.permutation(100)
        col_a = np.arange(20).repeat(5).take(shuffle)
        col_b = np.tile(np.arange(5), 20).take(shuffle)
        df = DataFrame({"A": col_a, "B": col_b, "C": np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=["A", "B"], ascending=[1, 0])
        by_values = df.sort_values(by=["A", "B"], ascending=[1, 0])

        # Flip B (max - B) so an ascending lexsort gives B descending within A.
        flipped_b = df["B"].max() - df["B"]
        ex_indexer = np.lexsort((flipped_b, df["A"]))
        assert_frame_equal(by_values, df.take(ex_indexer))

        # test with multiindex, too
        idf = df.set_index(["A", "B"])
        expected = idf.take(ex_indexer)
        assert_frame_equal(idf.sort_index(ascending=[1, 0]), expected)

        # also, Series!
        assert_series_equal(idf["C"].sort_index(ascending=[1, 0]), expected["C"])
Example #4
0
# --- Detecting and capping outliers -----------------------------------------
# NOTE(review): `data` is created outside this excerpt; the calls below
# (`data[3]`, `.describe()`) suggest a numeric DataFrame with a column
# labelled 3 -- confirm against the preceding code.
col = data[3]
print(col[np.abs(col) > 3])

# Rows holding at least one value whose magnitude exceeds 3.  `axis` is given
# by keyword because pandas >= 2.0 no longer accepts it positionally
# (`.any(1)` raises TypeError there).
print(data[(np.abs(data) > 3).any(axis=1)])

# Cap to the interval [-3, 3]: np.sign yields -1/0/+1, so each out-of-range
# cell is overwritten with -3 or +3.
data[np.abs(data) > 3] = np.sign(data) * 3
print(np.sign(data) * 3)
print(data[np.abs(data) > 3])  # nothing exceeds the cap any more
print(np.abs(data))
print(data.describe())

# --- Permutation and random sampling ----------------------------------------
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
print(sampler)
print(df)
print(df.take(sampler))  # rows reordered by the random permutation
print(df.take(np.random.permutation(len(df))[:3]))  # 3 rows, no replacement

bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)  # indices may repeat
print(sampler)
draws = bag.take(sampler)  # sample with replacement
print(draws)

# --- Indicator (dummy) variables ---------------------------------------------
df = DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
print(df)
print(pd.get_dummies(df["key"]))

dummies = pd.get_dummies(df["key"], prefix="key")
print(dummies)
# df[["data1"]] (double brackets) keeps a DataFrame so it can be joined.
df_with_dummy = df[["data1"]].join(dummies)
Example #5
0
# transform
# help(pd.Series.map)

# replace values
# Series.replace returns a new Series; `data` itself is left unchanged, and
# the results below are only useful when evaluated interactively.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])  # data.replace({-999:np.nan,-1000:0})

# permutation and random sampling
df = DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)
df.take(sampler)
df.take(sampler[:3])
df.take(np.random.permutation(len(df))[:3])  # sample without replacement (slow but works)
sampler = np.random.randint(0, len(df), size=10)
df.take(sampler)  # sample with replacement

# get dummy variables for categorical variable
df = DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "value": range(6)})
dummies = pd.get_dummies(df["key"], prefix="key")
# df["value"] is a Series, df[["value"]] is a one-column DataFrame; the
# DataFrame form is what allows the join with `dummies`.
df_with_dummy = df[["value"]].join(dummies)

# string manipulation
# Commonly used built-in str methods:
#   startswith, endswith, split, strip, count
#   index, find, rfind, replace, join
#   upper, lower, ljust, rjust, lstrip, rstrip
Example #6
0
# NOTE(review): `cats` is created outside this excerpt (presumably by an
# earlier pd.cut call) -- confirm against the preceding code.
cats.categories
# The top-level pd.value_counts helper is deprecated; wrapping in a Series
# gives the same counts, sorted by frequency.
pd.Series(cats).value_counts()

data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # input the number of the bins

data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
pd.Series(cats).value_counts()
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])  # explicit quantile edges

# Permutation and random sampling
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))

df.take(np.random.permutation(5))
df.take(np.random.permutation(len(df))[:3])

# Computing indicator / dummy variables
df = DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
pd.get_dummies(df["key"])

# String manipulation==========================================================
val = "a,b,  guido"
val.split(",")
pieces = [x.strip() for x in val.split(",")]
pieces
"::".join(pieces)
"guido" in val
val.index(",")
val.find(":")  # returns -1 when the substring is not found
# Unlike str.find, str.index raises ValueError for a missing substring; catch
# it so the demonstration does not abort the rest of the script.
try:
    val.index(":")
except ValueError as exc:
    print(f"val.index(':') raised ValueError: {exc}")