Example #1
0
    :param df1:
    :type df1: pandas.core.frame.DataFrame
    :param df2:
    :type df2: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    res = []
    for i in range(df2.shape[1]):
        res.append(df1.corrwith(df2.ix[:, i]))
    res = pd.concat(res, axis=1)
    res.columns = df2.columns
    return res

# Correlate the columns of df1 against each column of df3 using the
# pairwise_corr helper defined above (result is discarded, REPL-style).
pairwise_corr(df1, df3)
# Correlate every column of df1 with the single column df3.h.
df1.corrwith(df3.h)


def corr_df3(obj):
    """
    Correlate the module-level ``df3`` frame with ``obj``.

    Intended to be passed to ``DataFrame.apply``, in which case ``obj`` is
    one column (a Series) of the frame being iterated.

    :param obj: data to correlate ``df3`` against
    :type obj: pandas.core.series.Series or pandas.core.frame.DataFrame
    :return: the result of ``df3.corrwith(obj)``
    :rtype: pandas.core.series.Series
    """
    correlations = df3.corrwith(obj)
    return correlations
# Apply corr_df3 to df1; with the default axis, DataFrame.apply hands the
# function one column of df1 at a time.
df1.apply(corr_df3)
# Same computation written with an inline lambda instead of the named helper.
df1.apply(lambda x: df3.corrwith(x))
# Roles swapped: each column of df3 is correlated against df1.
df3.apply(lambda x: df1.corrwith(x))

# Inspect the index of df3 (result is discarded, REPL-style).
df3.index
Example #2
0
# Number of tickers to generate.
N = 1000
# Build N ticker symbols; rands(5) is presumably a random 5-character string
# helper defined/imported outside this excerpt. (Python 2 code: xrange.)
tickers = np.array([rands(5) for _ in xrange(N)])

# Below is a portfolio made up of three randomly generated factors
# (usually called factor loadings) and some weights
fac1, fac2, fac3 = np.random.rand(3, 1000)

# Take the first 1000 tickers of a random permutation of all N positions.
ticker_subset = tickers.take(np.random.permutation(N)[:1000])

# Weighted sum of the factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

# Correlation of each factor column with the portfolio.
print factors.corrwith(port)

# The standard way to compute factor exposures is least-squares regression
# Use pandas.ols to compute the exposures of the whole portfolio
# NOTE(review): pd.ols was removed in pandas 0.20 -- this line only runs on
# older pandas releases.
print pd.ols(y=port, x=factors).beta


# Compute the exposures of each industry via groupby
def beta_exposure(chunk, factors=None):
    """
    Estimate least-squares factor exposures (betas) for ``chunk``.

    This replaces the former ``pd.ols(y=chunk, x=factors).beta`` call:
    ``pd.ols`` was removed in pandas 0.20, so the regression is carried out
    with ``numpy.linalg.lstsq``.  As ``pd.ols`` did, the inputs are aligned
    on their shared index labels, rows containing missing values are
    dropped, and an intercept term is fitted.

    :param chunk: dependent variable, e.g. one group of a grouped Series
    :type chunk: pandas.core.series.Series
    :param factors: explanatory variables, one column per factor
    :type factors: pandas.core.frame.DataFrame
    :return: one coefficient per factor column, followed by ``'intercept'``
    :rtype: pandas.core.series.Series
    :raises ValueError: if ``factors`` is not supplied
    """
    import numpy as np  # local import keeps this block self-contained

    if factors is None:
        raise ValueError("factors must be provided")

    # Keep only the labels present in both inputs; when called through
    # groupby().apply the chunk covers just a subset of the factor rows.
    x, y = factors.align(chunk, join='inner', axis=0)
    complete = x.notnull().all(axis=1) & y.notnull()
    x, y = x[complete], y[complete]

    # Design matrix: the factor columns plus a trailing column of ones so the
    # last coefficient is the intercept.
    design = np.column_stack([x.values.astype(float), np.ones(len(x))])
    coefs = np.linalg.lstsq(design, y.values.astype(float), rcond=None)[0]
    return pd.Series(coefs, index=list(x.columns) + ['intercept'])

# The two industry labels a ticker can be assigned to.
ind_names = np.array(['FINANCIAL', 'TECH'])
# One random position into ind_names for each of the N tickers.
sampler = np.random.randint(0, len(ind_names), N)
# Industry label per ticker, indexed by ticker symbol.
industries = Series(ind_names[sampler], index=tickers,
                    name='industry')
Example #3
0
    return (group - group.mean()) / group.std()


# Standardize within each group by applying zscore.
# NOTE(review): by_industry is defined outside this excerpt -- presumably a
# groupby keyed on industries; confirm.
df_stand = by_industry.apply(zscore)
# Per-industry mean and standard deviation of the standardized values.
print(df_stand.groupby(industries).agg(['mean', 'std']))
# Rank within each group in descending order (largest value gets rank 1).
ind_rank = by_industry.rank(ascending=False)
# Smallest and largest rank observed in each industry.
print(ind_rank.groupby(industries).agg(['min', 'max']))
# Rank and then standardize within each group in a single apply.
print(by_industry.apply(lambda x: zscore(x.rank())))

# Three random factor vectors of length 1000.
fac1, fac2, fac3 = np.random.rand(3, 1000)
# Take the first 1000 tickers of a random permutation of all N positions.
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
# Portfolio: weighted sum of the factors plus random noise.
port = Series(0.7 * fac1 + 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000), index=ticker_subset)

factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3}, index=ticker_subset)

# Correlation of each factor column with the portfolio.
print(factors.corrwith(port))
# Least-squares exposures of the whole portfolio to the factors.
# NOTE(review): pd.ols was removed in pandas 0.20 -- this line only runs on
# older pandas releases.
print(pd.ols(y=port, x=factors).beta)


def beta_exposure(chunk, factors=None):
    """
    Estimate least-squares factor exposures (betas) for ``chunk``.

    This replaces the former ``pd.ols(y=chunk, x=factors).beta`` call:
    ``pd.ols`` was removed in pandas 0.20, so the regression is carried out
    with ``numpy.linalg.lstsq``.  As ``pd.ols`` did, the inputs are aligned
    on their shared index labels, rows containing missing values are
    dropped, and an intercept term is fitted.

    :param chunk: dependent variable, e.g. one group of a grouped Series
    :type chunk: pandas.core.series.Series
    :param factors: explanatory variables, one column per factor
    :type factors: pandas.core.frame.DataFrame
    :return: one coefficient per factor column, followed by ``'intercept'``
    :rtype: pandas.core.series.Series
    :raises ValueError: if ``factors`` is not supplied
    """
    import numpy as np  # local import keeps this block self-contained

    if factors is None:
        raise ValueError("factors must be provided")

    # Keep only the labels present in both inputs; when called through
    # groupby().apply the chunk covers just a subset of the factor rows.
    x, y = factors.align(chunk, join='inner', axis=0)
    complete = x.notnull().all(axis=1) & y.notnull()
    x, y = x[complete], y[complete]

    # Design matrix: the factor columns plus a trailing column of ones so the
    # last coefficient is the intercept.
    design = np.column_stack([x.values.astype(float), np.ones(len(x))])
    coefs = np.linalg.lstsq(design, y.values.astype(float), rcond=None)[0]
    return pd.Series(coefs, index=list(x.columns) + ['intercept'])


# Group the portfolio values by industry label.
by_ind = port.groupby(industries)
# Run beta_exposure on each group, passing the full factor table through.
exposures = by_ind.apply(beta_exposure, factors=factors)
# Unstack the inner index level of the result into columns for printing.
print(exposures.unstack())

# Fetch SPY data between the two dates via web.get_data_yahoo.
# NOTE(review): `web` is imported outside this excerpt and this call needs
# network access -- confirm the data source is still available.
data = web.get_data_yahoo('SPY', '2006-01-01','2012-07-27')


# Keep only the 'Adj Close' column.
px = data['Adj Close']
Example #4
0
# Descriptive statistics
# 4x2 frame with row labels a-d and some missing (NaN) entries.
df = DataFrame([[1.4,np.nan],[7.1,-4.5],
               [np.nan,np.nan],[0.75,-1.3]],
               index=list('abcd'),
               columns=['one','two'])
# Summary statistics for each column (result is discarded, REPL-style).
df.describe() 
# Related option and method names kept here as a reminder list:
# skipna=True, mean, std, var, sum, 
# max, min, argmax, argmin, idxmax, idxmin,
# cumsum, cumprod, diff, pct_change

# Correlation and covariance
# 100x3 frame of standard-normal samples with columns a, b, c.
df = DataFrame(np.random.randn(100,3), columns=list('abc'))
# Pairwise correlation matrix of the columns.
df.corr() 
# Pairwise covariance matrix of the columns.
df.cov()
# Correlation of every column with column 'a'.
df.corrwith(df['a'])

# Unique values, value counts, membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
# Distinct values of obj.
uniques = obj.unique()
# How many times each value occurs.
obj.value_counts()
# Boolean mask: True where the value is 'b' or 'c'.
mask = obj.isin(['b','c'])
# Select only the entries flagged by the mask.
obj[mask]

# Dealing with missing data
# 7x3 frame of standard-normal samples with default integer row/column labels.
df = DataFrame(np.random.randn(7,3))
# Blank out rows 0-4 of column 1 and rows 0-2 of column 2.  DataFrame.ix was
# removed in pandas 1.0; on this default integer index it resolved by label,
# so label-based .loc (slice end inclusive) is the exact equivalent.
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan

# Keep only rows that have at least 3 non-missing values.
df.dropna(thresh=3)
# Replace every missing value with 0 (returns a new frame; df is unchanged).
df.fillna(0)
# Per-column fill values keyed by column label.
# NOTE(review): this frame has columns 0, 1, 2 only, so the entry for
# column 3 has no effect and column 2 stays unfilled -- confirm whether
# key 2 was intended.
df.fillna({1:0.5,3:-1})