Example #1
    def test_describe_no_numeric(self):
        df = DataFrame({"A": ["foo", "foo", "bar"] * 8, "B": ["a", "b", "c", "d"] * 6})
        desc = df.describe()
        expected = DataFrame(dict((k, v.describe()) for k, v in compat.iteritems(df)), columns=df.columns)
        assert_frame_equal(desc, expected)

        ts = tm.makeTimeSeries()
        df = DataFrame({"time": ts.index})
        desc = df.describe()
        self.assertEqual(desc.time["first"], min(ts.index))
Example #2
    def test_describe_objects(self):
        df = DataFrame({"C1": ["a", "a", "c"], "C2": ["d", "d", "f"]})
        result = df.describe()
        expected = DataFrame({"C1": [3, 2, "a", 2], "C2": [3, 2, "d", 2]}, index=["count", "unique", "top", "freq"])
        assert_frame_equal(result, expected)

        df = DataFrame({"C1": pd.date_range("2010-01-01", periods=4, freq="D")})
        df.loc[4] = pd.Timestamp("2010-01-04")
        result = df.describe()
        expected = DataFrame(
            {"C1": [5, 4, pd.Timestamp("2010-01-01"), pd.Timestamp("2010-01-04"), pd.Timestamp("2010-01-04"), 2]},
            index=["count", "unique", "first", "last", "top", "freq"],
        )
        assert_frame_equal(result, expected)

        # mix time and str
        df["C2"] = ["a", "a", "b", "c", "a"]
        result = df.describe()
        # with a mix of datetime / object columns the index gets reordered.
        expected["C2"] = [5, 3, np.nan, np.nan, "a", 3]
        assert_frame_equal(result, expected)

        # just str
        expected = DataFrame({"C2": [5, 3, "a", 3]}, index=["count", "unique", "top", "freq"])
        result = df[["C2"]].describe()
        assert_frame_equal(result, expected)

        # mix of time, str, numeric
        df["C3"] = [2, 4, 6, 8, 2]
        result = df.describe()
        expected = DataFrame(
            {"C3": [5.0, 4.4, 2.607681, 2.0, 2.0, 4.0, 6.0, 8.0]},
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df.describe(), df[["C3"]].describe())

        assert_frame_equal(df[["C1", "C3"]].describe(), df[["C3"]].describe())
        assert_frame_equal(df[["C2", "C3"]].describe(), df[["C3"]].describe())
Example #3
def edbSave():
    "获取客户剪切板中的edb代码,并调用接口获取edb指标的具体数据"

    # Get the codes from the clipboard and the user-supplied start/end dates
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()

    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime("%y-%m-%d") for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print("-" * 85)
    print(df)
    print("-" * 85)
    print("统计指标:")
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep="    ")[1].rjust(10))
    return df
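
The string-splitting on the last print above is brittle, since it parses pandas' printed repr. A sketch of printing the per-code totals directly, which could replace that line inside edbSave:

for code, total in df.sum().items():
    print("sum", str(code).ljust(12), str(total).rjust(10))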
Example #4
# encoding:UTF-8
__author__ = "auroua"
from pandas import Series, DataFrame
from numpy import nan as NA
import numpy as np

df = DataFrame(
    {
        "a": np.arange(7),
        "b": np.arange(7, 0, -1),
        "c": ["one", NA, "one", "two", "two", "two", "two"],
        "d": [0, NA, 2, 3, NA, 1, 2],
    }
)

print(df)

print(df.sum())
print(df.sum(axis=1))

print(df.describe())

new_df = df.drop("c", axis=1)
new_df = new_df.dropna()
print(new_df.corr())
Example #5
# Create a DataFrame
smp = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.6, 1.7, 3.5, 4.3],
}
frame = DataFrame(smp)

# Access elements of the DataFrame
frame.year  # frame$year
frame["year"]  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(smp, index=["one", "two", "three", "four", "five"])  # add an index
frame2.loc["one"]  # label-based row access (.ix is deprecated)
frame2.describe()  # summary
print(frame2.describe())

# Read in data
data = pd.read_csv("stock_px.csv")
print(data)
xlsx_file = pd.ExcelFile("stock_px.xlsx")  # requires openpyxl; .xls files work too
xlsx_file.sheet_names
data = xlsx_file.parse("stock_px")
print(data)

# Read data from the web → http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
ds = np.DataSource(None)
f = ds.open("https://dl.dropbox.com/u/956851/game_modified.csv")
d_web = pd.read_csv(f)
print(d_web)
f.close()
print("\n")

data = np.random.rand(20)
print(pd.cut(data, 4, precision=2))
print("\n")

data = np.random.randn(1000)
cats = pd.qcut(data, 4)
print(cats)

print("\n")

print(pd.value_counts(cats))
print(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0]))
print("\n")
###############################################################

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
print(data.describe())

print("\n")

col = data[3]
print(col[np.abs(col) > 3])
print("\n")
print(data[(np.abs(data) > 3).any(1)])
print("\n")
data[np.abs(data) > 3] = np.sign(data) * 3
print(data.describe())
print("\n")
Example #7
# apply: kind of like R.
f = lambda x: x.max() - x.min()
frame = DataFrame(np.random.randn(4, 3), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
# try frame.apply(f), frame.apply(f, axis=1)
formatter = lambda x: "%.2f" % x
frame.applymap(formatter)

# check out assignment.
frametest = frame
frametest["e"] = frametest["e"].map(formatter)
# now look at frame.
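
Note that frametest = frame only binds a second name to the same object, so the map above overwrote the 'e' column of frame as well. A one-line sketch of taking an independent object instead:

frametest = frame.copy()  # a real copy; edits to it would leave frame intact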

# Basic statistics over frames

df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"]
)
# df.sum(), df.sum(axis=1)
df.mean(axis=1)
df.mean(axis=1, skipna=False)
df.describe()  # cool
df.describe()["one"]
test = df.describe()

# Finance data
import pandas.io.data as web

all_data = {}
for ticker in ["AAPL", "IBM", "MSFT", "GOOG"]:
    all_data[ticker] = web.get_data_yahoo(ticker, "1/1/2000", "1/1/2010")
Example #8
# Load dataset
from sklearn import datasets

dataset = datasets.load_diabetes()
# Load features and targets
features = dataset.data
targets = dataset.target

np.unique(targets)

# --------------------------------------------------------------
# Normalize data
# --------------------------------------------------------------

# Load into Pandas
features_df = DataFrame(features)
features_stats = features_df.describe()

# Cleanse data

# --------------------------------------------------------------
# Create Test Train datasets
# --------------------------------------------------------------

# Split data into training and test datasets
# A random permutation to split the data randomly
np.random.seed()
indices = np.random.permutation(len(features))

dataset_size = len(features)
test_pct = 0.9
test_size = int(round(dataset_size * test_pct, 0))
periods_test = DataFrame(
    np.zeros((20, 7)),  # assumed initialization; the head of this call was truncated
    columns=[
        int(ser_max / 100),
        int(ser_max / 50),
        int(ser_max / 20),
        int(ser_max / 10),
        int(ser_max / 5),
        int(ser_max / 2),
        ser_max,
    ],
)
for i in periods_test.index:  # Sampling 20 times
    for j in periods_test.columns:
        sample = test.reindex(columns=np.random.permutation(test.columns)[:j])
        periods_test.ix[i, j] = sample.iloc[0].corr(sample.iloc[1])  # ix is for label index, iloc is for int index
print(periods_test[:5])
print(periods_test.describe())

threshold = 0.1
temp_std = 0
# Find the first sample size at which the sampling correlation becomes stable
for i, std in enumerate(periods_test.std()):
    if std < threshold and temp_std >= threshold:
        mini_period = periods_test.columns[i]
        break
    temp_std = std

# Decide the value of min_periods. Set std 0.1 as the threshold
# mini_period = 200
check_size = int(len(data.index) * 0.2)  # 20% dataset for testing
check = {}
check_data = data.copy()  # avoid mutating the original data
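
If the goal of the sampling above is only to pick a safe min_periods, note that corr() accepts it directly; a minimal sketch, assuming the loop above found a mini_period:

# pairwise Pearson correlation, ignoring column pairs with fewer than
# mini_period overlapping non-null observations
corr_all = data.corr(min_periods=mini_period)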
Example #10
"""
Student: Max Sorto
Class: IT5090G - Aasheim
Date: 03/31/2016
Assignment: Lab10
"""

from pandas import DataFrame

data = {
    "name": ["Joe", "John", "Mary", "Lee"],
    "quiz 1": [100, 87, 99, 78],
    "quiz 2": [45, 78, 90, 88],
    "assign 1": [98, 82, 93, 78],
    "assign 2": [100, 87, 99, 78],
}

frame = DataFrame(data)
print(frame)
print("\n")
print(frame.describe())
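
describe() defaults to quartiles; a sketch of asking for other cut points, assuming a pandas version that supports the percentiles argument:

print(frame.describe(percentiles=[0.1, 0.5, 0.9]))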
Example #11
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method="first")

obj = Series(range(5), index=["a", "a", "b", "b", "c"])
obj.index.is_unique
df = DataFrame(np.random.randn(4, 3), index=["a", "a", "b", "b"])
df.ix["b"]

df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"]
)
df.sum()
df.sum(axis=0)
df.idxmax()
df.describe()

obj = Series(["a", "a", "b", "c"] * 4)
obj.describe()

import pandas.io.data as web

all_data = {}
for ticker in ["AAPL", "IBM", "MSFT", "GOOG"]:
    all_data[ticker] = web.get_data_yahoo(ticker, "1/1/2000", "1/1/2010")

price = DataFrame({tic: data["Adj Close"] for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data["Volume"] for tic, data in all_data.iteritems()})
returns = price.pct_change()
returns.tail()
    def test_column_dups_operations(self):
        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=["A", "A"])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range("20130101", periods=4, freq="Q-NOV")
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"])
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["foo", "bar", "foo", "hello"])
        df["string"] = "bah"
        expected = DataFrame(
            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
            columns=["foo", "bar", "foo", "hello", "string"],
        )
        check(df, expected)
        with assertRaisesRegexp(ValueError, "Length of value"):
            df.insert(0, "AnotherColumn", range(len(df.index) - 1))

        # insert same dtype
        df["foo2"] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # set (non-dup)
        df["foo2"] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)
        df["foo2"] = 3

        # delete (non dup)
        del df["bar"]
        expected = DataFrame(
            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
            columns=["foo", "foo", "hello", "string", "foo2"],
        )
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df["hello"]
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], columns=["foo", "foo", "string", "foo2"]
        )
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame(
            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], columns=["foo", "foo", "string", "foo2"]
        )
        check(df, expected)

        # insert
        df.insert(2, "new_col", 5.0)
        expected = DataFrame(
            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, "cannot insert", df.insert, 2, "new_col", 4.0)
        df.insert(2, "new_col", 4.0, allow_duplicates=True)
        expected = DataFrame(
            [[1, 1, 4.0, 5.0, "bah", 3], [1, 2, 4.0, 5.0, "bah", 3], [2, 3, 4.0, 5.0, "bah", 3]],
            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
        )
        check(df, expected)

        # delete (dup)
        del df["foo"]
        expected = DataFrame(
            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
            columns=["new_col", "new_col", "string", "foo2"],
        )
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]], columns=["foo", "bar", "foo", "hello"])
        check(df)

        df["foo2"] = 7.0
        expected = DataFrame(
            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        result = df["foo"]
        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
        check(result, expected)

        # multiple replacements
        df["foo"] = "string"
        expected = DataFrame(
            [["string", 1, "string", 5, 7.0], ["string", 1, "string", 5, 7.0], ["string", 1, "string", 5, 7.0]],
            columns=["foo", "bar", "foo", "hello", "foo2"],
        )
        check(df, expected)

        del df["foo"]
        expected = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {"TClose": [22.02], "RT": [0.0454], "TExg": [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)], names=["STK_ID", "RPT_Date"]),
        )

        df5 = DataFrame(
            {
                "STK_ID": [600809] * 3,
                "RPT_Date": [20120930, 20121231, 20130331],
                "STK_Name": [u("饡驦"), u("饡驦"), u("饡驦")],
                "TClose": [38.05, 41.66, 30.01],
            },
            index=MultiIndex.from_tuples(
                [(600809, 20120930), (600809, 20121231), (600809, 20130331)], names=["STK_ID", "RPT_Date"]
            ),
        )

        k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
        result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
        str(result)
        result.dtypes

        expected = DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809, u("饡驦"), 30.01]],
            columns=["RT", "TClose", "TExg", "RPT_Date", "STK_ID", "STK_Name", "QT_Close"],
        ).set_index(["STK_ID", "RPT_Date"], drop=False)
        assert_frame_equal(result, expected)

        # reindex with duplicate columns is invalid!
        df = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"])
        self.assertRaises(ValueError, df.reindex, columns=["bar"])
        self.assertRaises(ValueError, df.reindex, columns=["bar", "foo"])

        # drop
        df = DataFrame([[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"])
        result = df.drop(["a"], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=["bar"])
        check(result, expected)
        result = df.drop("a", axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["bar", "a", "a"], dtype="float64")
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=["a", "b", "c", "d", "e"], columns=["A", "B", "A"])
        for index in [df.index, pd.Index(list("edcba"))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([("A", expected_ser), ("B", this_df["B"]), ("A", expected_ser)])
            this_df["A"] = index
            check(this_df, expected_df)

        # operations
        for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ["A", "A"]
            df.columns = ["A", "A"]
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

        df["that"] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
        expected = DataFrame(1, index=range(5), columns=["that", "that"])

        df["that"] = 1
        check(df, expected)
Example #13
df5["sum_col"] = df5.apply(sum_two_cols, axis=1)

print(df5)

import math


def int_float_squares(series):
    return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2})


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###

print(df.describe())
print(df.cov())
print(df.corr())

### 8. Merge and Join ###

print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###
# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # index of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates down each column

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb

# import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
)["Adj Close"]
prices.head()

volume = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
)["Volume"]
def foyer_all(year=2006):

    ## Add the fields from the tax return
    # foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")

    ## keep only the tax-return fields ('fxzz')
    # vars <- names(foyer_all)
    # vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    # foyer <- foyer_all[vars]
    # rm(foyer_all)
    # gc()
    # noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## Aggregate the returns when an individual filed more than one declaration
    # foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    # print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    # print foyer.describe()["f1aj"].to_string()
    # print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    # foyer$noindiv <- NULL
    # foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problem with the dummies ()
    #
    # saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## Retrieve the variables that can be individualized
    # loadTmp("foyer_aggr.Rdata")
    #
    # individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    # }

    var_dict = {
        "sali": ["f1aj", "f1bj", "f1cj", "f1dj", "f1ej"],
        "choi": ["f1ap", "f1bp", "f1cp", "f1dp", "f1ep"],
        "fra": ["f1ak", "f1bk", "f1ck", "f1dk", "f1ek"],
        "cho_ld": ["f1ai", "f1bi", "f1ci", "f1di", "f1ei"],
        "ppe_tp_sa": ["f1ax", "f1bx", "f1cx", "f1dx", "f1qx"],
        "ppe_du_sa": ["f1av", "f1bv", "f1cv", "f1dv", "f1qv"],
        "rsti": ["f1as", "f1bs", "f1cs", "f1ds", "f1es"],
        "alr": ["f1ao", "f1bo", "f1co", "f1do", "f1eo"],
        "f1tv": ["f1tv", "f1uv"],
        "f1tw": ["f1tw", "f1uw"],
        "f1tx": ["f1tx", "f1ux"],
        "ppe_tp_ns": ["f5nw", "f5ow", "f5pw"],
        "ppe_du_ns": ["f5nv", "f5ov", "f5pv"],
        "frag_exon": ["f5hn", "f5in", "f5jn"],
        "frag_impo": ["f5ho", "f5io", "f5jo"],
        "arag_exon": ["f5hb", "f5ib", "f5jb"],
        "arag_impg": ["f5hc", "f5ic", "f5jc"],
        "arag_defi": ["f5hf", "f5if", "f5jf"],
        "nrag_exon": ["f5hh", "f5ih", "f5jh"],
        "nrag_impg": ["f5hi", "f5ii", "f5ji"],
        "nrag_defi": ["f5hl", "f5il", "f5jl"],
        "nrag_ajag": ["f5hm", "f5im", "f5jm"],
        "mbic_exon": ["f5kn", "f5ln", "f5mn"],
        "abic_exon": ["f5kb", "f5lb", "f5mb"],
        "nbic_exon": ["f5kh", "f5lh", "f5mh"],
        "mbic_impv": ["f5ko", "f5lo", "f5mo"],
        "mbic_imps": ["f5kp", "f5lp", "f5mp"],
        "abic_impn": ["f5kc", "f5lc", "f5mc"],
        "abic_imps": ["f5kd", "f5ld", "f5md"],
        "nbic_impn": ["f5ki", "f5li", "f5mi"],
        "nbic_imps": ["f5kj", "f5lj", "f5mj"],
        "abic_defn": ["f5kf", "f5lf", "f5mf"],
        "abic_defs": ["f5kg", "f5lg", "f5mg"],
        "nbic_defn": ["f5kl", "f5ll", "f5ml"],
        "nbic_defs": ["f5km", "f5lm", "f5mm"],
        "nbic_apch": ["f5ks", "f5ls", "f5ms"],
        "macc_exon": ["f5nn", "f5on", "f5pn"],
        "aacc_exon": ["f5nb", "f5ob", "f5pb"],
        "nacc_exon": ["f5nh", "f5oh", "f5ph"],
        "macc_impv": ["f5no", "f5oo", "f5po"],
        "macc_imps": ["f5np", "f5op", "f5pp"],
        "aacc_impn": ["f5nc", "f5oc", "f5pc"],
        "aacc_imps": ["f5nd", "f5od", "f5pd"],
        "aacc_defn": ["f5nf", "f5of", "f5pf"],
        "aacc_defs": ["f5ng", "f5og", "f5pg"],
        "nacc_impn": ["f5ni", "f5oi", "f5pi"],
        "nacc_imps": ["f5nj", "f5oj", "f5pj"],
        "nacc_defn": ["f5nl", "f5ol", "f5pl"],
        "nacc_defs": ["f5nm", "f5om", "f5pm"],
        "mncn_impo": ["f5ku", "f5lu", "f5mu"],
        "cncn_bene": ["f5sn", "f5ns", "f5os"],
        "cncn_defi": ["f5sp", "f5nu", "f5ou", "f5sr"],  # TODO: check
        "mbnc_exon": ["f5hp", "f5ip", "f5jp"],
        "abnc_exon": ["f5qb", "f5rb", "f5sb"],
        "nbnc_exon": ["f5qh", "f5rh", "f5sh"],
        "mbnc_impo": ["f5hq", "f5iq", "f5jq"],
        "abnc_impo": ["f5qc", "f5rc", "f5sc"],
        "abnc_defi": ["f5qe", "f5re", "f5se"],
        "nbnc_impo": ["f5qi", "f5ri", "f5si"],
        "nbnc_defi": ["f5qk", "f5rk", "f5sk"],
        "mbic_mvct": ["f5hu"],
        "macc_mvct": ["f5iu"],
        "mncn_mvct": ["f5ju"],
        "mbnc_mvct": ["f5kz"],
        "frag_pvct": ["f5hw", "f5iw", "f5jw"],
        "mbic_pvct": ["f5kx", "f5lx", "f5mx"],
        "macc_pvct": ["f5nx", "f5ox", "f5px"],
        "mbnc_pvct": ["f5hv", "f5iv", "f5jv"],
        "mncn_pvct": ["f5ky", "f5ly", "f5my"],
        "mbic_mvlt": ["f5kr", "f5lr", "f5mr"],
        "macc_mvlt": ["f5nr", "f5or", "f5pr"],
        "mncn_mvlt": ["f5kw", "f5lw", "f5mw"],
        "mbnc_mvlt": ["f5hs", "f5is", "f5js"],
        "frag_pvce": ["f5hx", "f5ix", "f5jx"],
        "arag_pvce": ["f5he", "f5ie", "f5je"],
        "nrag_pvce": ["f5hk", "f5lk", "f5jk"],
        "mbic_pvce": ["f5kq", "f5lq", "f5mq"],
        "abic_pvce": ["f5ke", "f5le", "f5me"],
        "nbic_pvce": ["f5kk", "f5ik", "f5mk"],
        "macc_pvce": ["f5nq", "f5oq", "f5pq"],
        "aacc_pvce": ["f5ne", "f5oe", "f5pe"],
        "nacc_pvce": ["f5nk", "f5ok", "f5pk"],
        "mncn_pvce": ["f5kv", "f5lv", "f5mv"],
        "cncn_pvce": ["f5so", "f5nt", "f5ot"],
        "mbnc_pvce": ["f5hr", "f5ir", "f5jr"],
        "abnc_pvce": ["f5qd", "f5rd", "f5sd"],
        "nbnc_pvce": ["f5qj", "f5rj", "f5sj"],
        "demenage": ["f1ar", "f1br", "f1cr", "f1dr", "f1er"],
    }  # (moving) only in 2006

    #
    # varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
    #                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
    #               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
    # ......
    #               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
    #               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
    #               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
    #               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (moving) only in 2006
    #
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables" % (len(set().union(*vars_sets)), len(eligible_vars))
    qui = ["vous", "conj", "pac1", "pac2", "pac3"]
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.items():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyer_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print(individual_var + " is not present")
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # reset_index() on a Series produces a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        #        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join="outer")

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()

    # not_first <- FALSE
    # allvars = c()
    # for (v in varlist){
    #  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
    #  if (length(vars) > 0) {
    #    allvars <-  c(allvars, vars)
    #    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
    #    n <- length(vars)
    #    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
    #    if (not_first) {
    #      print('merge')
    #      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
    #      names(foy_ind)
    #    }
    #    else   {
    #      print('init')
    #      foy_ind <- temp
    #      not_first <- TRUE
    #    }
    #  }
    # }

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name="ind_vars_to_remove", year=year)
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind["quifoy"][foy_ind["quifoy"] == "vous"] = 0
    foy_ind["quifoy"][foy_ind["quifoy"] == "conj"] = 1
    foy_ind["quifoy"][foy_ind["quifoy"] == "pac1"] = 2
    foy_ind["quifoy"][foy_ind["quifoy"] == "pac2"] = 3
    foy_ind["quifoy"][foy_ind["quifoy"] == "pac3"] = 4

    assert foy_ind["quifoy"].isin(range(5)).all(), "présence de valeurs aberrantes dans quifoy"

    print "saving foy_ind"
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame(
        [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list("abcd"), columns=["one", "two"]
    )
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # excludes NaN
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())
    # values that are not numbers
    obj = Series(list("aabc") * 4)
    print(obj.describe())

    methods = [
        "count",
        "min",
        "max",  # 'argmin', 'argmax',
        "quantile",
        "median",
        "mad",
        "var",
        "std",
        "skew",
        "kurt",
        "cummin",
        "cummax",
        "cumprod",
        "diff",
        "pct_change",
    ]

    for method in methods:
        print u"「{0}」".format(method)
        print getattr(df, method)()
        print ""

    # Correlation and Covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticker in lst:  # , 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticker] = pd.io.data.get_data_yahoo(ticker, "1/1/2000", "1/1/2010")
    price = DataFrame({tic: data["Adj Close"] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data["Volume"] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print("")
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print("")
        print(returns.corr())
        print(returns.cov())
        print("")
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique values, frequencies, membership
    print("", "")
    obj = Series(list("cadaabbcc"))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    print(pd.value_counts(obj.values, sort=False))
    mask = obj.isin(["b", "c"])
    print(mask)
    print(obj[mask])

    data = DataFrame({"Qu1": [1, 3, 4, 3, 4], "Qu2": [2, 3, 1, 2, 3], "Qu3": [1, 5, 2, 4, 4]})
    print(data)
    print(data.apply(pd.value_counts).fillna(0))
                 index=pd.Index(['LA', 'SF', 'SEA', 'POR']),
                 columns=pd.Index(['Type', 'Airport', 'Cool Factor','D']))

# .stack() / .unstack(): move columns into, or out of, a hierarchical row index
df2 = df1.stack(dropna = False)                    # converts columns into the child index
df3 = df1.unstack()                                # converts columns into the parent index 

# .pivot(index, columns, values) is used to reshape data like dplyr in R
df4 = df1.pivot('Airport','Type','Cool Factor')    # yes! it's that easy to reshape!
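
A minimal round-trip sketch of pivot() and stack() on a throwaway frame (the values are made up for illustration):

toy = DataFrame({'Airport': ['LAX', 'LAX', 'SFO'],
                 'Type': ['small', 'large', 'small'],
                 'Cool Factor': [1, 2, 3]})
wide = toy.pivot('Airport', 'Type', 'Cool Factor')   # one row per Airport, one column per Type
long_again = wide.stack()                            # back to one value per (Airport, Type) pair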

#############################################################################################################
# 9. Outlier Analysis
#############################################################################################################
np.random.seed(12345)
df = DataFrame(np.random.randn(1000,4))
df.describe()                                        # assume outliers are in the -+3 region

df[0][np.abs(df[0])>3]                               # show all rows in column 0 that are > abs(3)
df[(np.abs(df)>3).any(1)]                            # show all values in the dataframe that are > abs(3)
df[np.abs(df)>3] = np.sign(df) * 3                   # caps all values > abs(3) to 3; .sign()                                

#############################################################################################################
# 10. Binning Data
#############################################################################################################
years = [1990,1991,1992,2008,2012,2015,
         1987,1969,2013,2008,1999]
bins = [1960,1970,1980,1990,2000,2010,2020]

### .cut() bins the data in 'years' into a pandas Categorical object
### bins: a list that specifies the end points of the class intervals
### right: specifies whether the right edge is inclusive or not
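
A sketch of the calls those notes describe, using the years and bins defined above:

year_cats = pd.cut(years, bins)           # right edges inclusive by default
print(pd.value_counts(year_cats))         # counts per interval
print(pd.cut(years, bins, right=False))   # left-closed intervals instead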
Example #18
df
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
price
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})
# accumulation
print(dframe1.cumsum())
"""
   One  Two  Three
A  1.0  2.0    NaN
B  NaN  5.0    4.0
"""

# describe
print(dframe1)
"""
   One  Two  Three
A  1.0  2.0    NaN
B  NaN  3.0    4.0
"""
print(dframe1.describe())
"""
       One       Two  Three
count  1.0  2.000000    1.0
mean   1.0  2.500000    4.0
std    NaN  0.707107    NaN
min    1.0  2.000000    4.0
25%    1.0  2.250000    4.0
50%    1.0  2.500000    4.0
75%    1.0  2.750000    4.0
max    1.0  3.000000    4.0
"""

# check for unique values
ser1 = Series(["w", "y", "a", "w", "y", "z", "b", "q", "w", "g", "h"])
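
A short sketch of the uniqueness checks that comment announces, mirroring the Series methods used earlier:

print(ser1.unique())        # distinct values, in order of appearance
print(ser1.value_counts())  # frequency of each value, descending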
Example #20
        "groc_col": ["apples", "bananas", "coconuts", "dogfood", None],
        "rev_col": range(4, -1, -1),
    }
)
df2 = DataFrame(
    {
        "first_col": [13, 12, -6, -8, -11],
        "second_col": [10.1, 10.2, 10.2, 110.1, None],
        "str_col": ["a", "b", None, "c", "X"],
        "groc_col": [None, "bananas", "coconuts", "dogfood", None],
        "rev_col": range(4, -1, -1),
    }
)

# stats
df1.describe()  # only shows numbers

# gh.ix[:,['float_col', 'int_col']] less elegant
df1[["float_col", "int_col"]]

df1.fillna(value="waiting")

df1["div_col"] = df1["float_col"] / df1["int_col"]

mean = df1["rev_col"].mean()
df1["mean_col"] = mean

new = pd.merge(df1, df2, how="outer", on="str_col")


# quick plotting
p = DataFrame(predicted_probs)


# In[186]:

p.shape


# In[187]:

p.head(2)


# In[188]:

p.describe()


# In[189]:

get_ipython().magic(u"pinfo lr.predict_proba")


# In[190]:

p1 = p[0]
p2 = p[1]


# In[192]:
Example #22
empDf.name
empDf.name[2]
empDf[empDf.isManager == False]
empDf.head()
empDf.tail()
empDf.iloc[2,]

#Append a new row
empDf.append(Series([5,False,'Derek',2],
                    index=['id','isManager','name','deptId']),
             ignore_index=True)
empDf

#Deleting a column
empDf['dummy']=1
empDf
del empDf['dummy']
empDf

#Sorting
empDf.sort_index(axis=1)
empDf.sort_values(['isManager','name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a data frame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.items():
        if pd.isnull(col):
            print(pd.isnull(col),rowNum,colName)
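
The cell-by-cell loop above works but is slow on large frames; a vectorized sketch that surfaces the same nulls:

null_mask = auto_data.isnull()
print(null_mask.sum())                    # null count per column
print(auto_data[null_mask.any(axis=1)])   # rows containing at least one null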
Example #23
    dd_qtr_mean = df_drawdowns.groupby(df_drawdowns.index.quarter).mean()
    dd_qtr_std = df_drawdowns.groupby(df_drawdowns.index.quarter).std()

    # Look at drawdowns on a monthly basis
    mth_mean = df_drawdowns.resample("M", kind="period").mean()
    dd_monthly_mean = df_drawdowns.groupby(df_drawdowns.index.month).mean()
    dd_monthly_std = df_drawdowns.groupby(df_drawdowns.index.month).std()

    # Look at one year-2014
    dd_2014 = df_drawdowns["2014-01-01":"2014-12-31"]
    dd_2014_ri = dd_2014.mean().reset_index(name="Average Drawdown in 2014")

    # Creates histograms based on drawdown magnitudes
    bins_dd = np.linspace(0, 30, 61)
    dd_hist = df_drawdowns
    dd_hist.hist(bins=bins_dd, alpha=0.75, color="green", normed=True)
    dd_hist.plot(kind="kde", style="k--")

    """
    Drawdown analysis-This code plots a histogram of a stocks drawdown length
    characteristics.  Ensure that stock ticker used in this function has already
    been placed in the ticker list. 
    """

    stock_dd_length = calc_drawdown_local("WAT", 63)
    dd_len_hist = DataFrame(stock_dd_length)
    bins_len_dd = np.linspace(0, 30, 31)
    dd_len_hist.hist(bins=bins_len_dd, alpha=0.55, color="purple", normed=True)
    plt.title("Drawdown lengths - WAT")
    dd_len_hist.describe()