def _get_comb():
    # Stack the wide (t x sid) monthly excess-return table into a long
    # Series and join it with the 'pu' data on the shared time index.
    eretM = load_data('eretM')
    eretM = eretM.stack()
    eretM.index.names = ['t', 'sid']
    eretM.name = 'eret'
    pu = load_data('pu')
    combM = eretM.to_frame().join(pu)
    return combM
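
load_data is a project helper that is not shown in these examples. As a minimal, self-contained illustration of the stack-and-join pattern above, the following sketch uses toy data (all names and values hypothetical):

import pandas as pd

# Toy wide table of returns: rows are dates (t), columns are stocks (sid).
wide = pd.DataFrame([[0.01, 0.02], [0.03, -0.01]],
                    index=pd.to_datetime(['2020-01-31', '2020-02-29']),
                    columns=['s1', 's2'])
long = wide.stack()
long.index.names = ['t', 'sid']
long.name = 'eret'

# A series whose index is named 't' joins on the matching level of the
# MultiIndex.
pu = pd.Series([100.0, 120.0], index=wide.index, name='pu')
pu.index.name = 't'
print(long.to_frame().join(pu))
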
Example #2
def analyse_outliers(data):
    detect_outliers(data, 'data')

    #---------------------------liquidity----------------------------------
    # amihud is cleaned at the 98th percentile, the other measures at the 99th.
    liquidity = load_data('liquidity')
    liq_specs = [('amihud', 98), ('ps1', 99), ('ps2', 99), ('roll1', 99),
                 ('roll2', 99), ('zeros1', 99), ('zeros2', 99)]
    for name, thresh in liq_specs:
        pooled = delete_outliers(liquidity[name].unstack(),
                                 method='percentile', thresh=thresh,
                                 pooled=True)
        detect_outliers(pooled, name + '_pooled')

    #----------------------------skewness----------------------------------
    # The skewness measures are cleaned at the 99.9th percentile.
    skewness = load_data('skewness')
    for name in ['idioskew_24M__D', 'skew_12M__D', 'skew_24M__D']:
        pooled = delete_outliers(skewness[name].unstack(),
                                 method='percentile', thresh=99.9,
                                 pooled=True)
        detect_outliers(pooled, name + '_pooled')
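
delete_outliers and detect_outliers are project helpers that do not appear in these snippets. A minimal sketch of what the pooled percentile rule used above might look like; the signature is taken from the call sites, the body is an assumption:

import numpy as np

def delete_outliers(df, method='percentile', thresh=99, pooled=True):
    # Pooled percentile rule: one cutoff is computed over all observations,
    # and values above it are replaced with NaN.
    if method == 'percentile' and pooled:
        cutoff = np.nanpercentile(df.values, thresh)
        return df.where(df <= cutoff)
    raise NotImplementedError('only the pooled percentile case is sketched')
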
Example #3
from collections import OrderedDict

import pandas as pd


def get_momentum():
    stockRetM = load_data('stockRetM')
    stk = stockRetM.stack()
    stk.index.names = ['t', 'sid']
    # Lagged one month: the one-month lag is imposed to avoid the short-term
    # reversal effect first documented by Jegadeesh (1990).
    # For 'mom' the effective window is 11 months; since the value at time t
    # itself is not used, the window is set to 12 rather than 11.
    d_lag = OrderedDict({'mom': [12, 9],
                         'r12': [13, 10],
                         'r6': [7, 5]})
    # Non-lagged windows.
    d_nonlag = OrderedDict({'R12M': [12, 10],
                            'R9M': [9, 7],
                            'R6M': [6, 5],
                            'R3M': [3, 3]})
    ss = []
    names = []
    for bn, bp in d_lag.items():
        ser = stk.groupby('sid').apply(lambda s: _before(s, bp[0], bp[1]))
        ss.append(ser)
        names.append(bn)

    for un, up in d_nonlag.items():
        ser = stk.groupby('sid').apply(lambda s: _upto(s, up[0], up[1]))
        ss.append(ser)
        names.append(un)

    momentum = pd.concat(ss, axis=1, keys=names)
    momentum.columns.name = 'type'
    momentum = momentum * 100  # express in percent

    save(momentum, 'momentum', sort_axis=False)
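
_before and _upto are defined elsewhere in the module. Judging from the call sites, the second and third arguments look like a window and a minimum-period count, with _before skipping the value of month t itself (per the comment on 'mom'); a sketch under those assumptions:

import pandas as pd

def _upto(s, window, min_periods):
    # Compound return over the trailing `window` months, including month t.
    return (s + 1).rolling(window, min_periods=min_periods).apply(
        lambda a: a.prod() - 1, raw=True)

def _before(s, window, min_periods):
    # As above, but the value of month t itself is skipped, which is why
    # 'mom' passes a window of 12 for an effective 11-month signal.
    return (s + 1).rolling(window, min_periods=min_periods).apply(
        lambda a: a[:-1].prod() - 1, raw=True)
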
Example #4
import os

import pandas as pd


def get_predicted(history):
    # `fn` (the parameter file) and `l` (the indicator list) are module-level
    # names defined elsewhere in the source.
    params = pd.read_csv(fn, index_col=0, parse_dates=True)
    params = params.rolling(window=history, min_periods=int(
        history / 2)).mean()  #TODO: min_periods
    # Parameters estimated through month t-1 are used to predict the returns
    # of month t, hence the one-step forward shift.
    params = params.shift(1)
    indicators = load_data('data')[l]
    indicators['Intercept'] = 1.0
    cols = params.columns
    indicators = indicators.reindex(columns=cols)

    #TODO: predict return rather than eret,
    groups = list(indicators.groupby('sid'))
    ss = []
    names = []
    for name, g in groups:
        g = g.reset_index(level='sid', drop=True)
        p, g = get_inter_frame([params, g.dropna()])
        s = (p * g).sum(axis=1)
        ss.append(s)
        names.append(name)
        print(name)

    predicted = pd.concat(ss, axis=1, keys=names)
    predicted.to_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
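
get_inter_frame is another unshown helper. Its use here and in Example #8 below suggests it aligns a list of frames on their common index and columns; a minimal sketch of that assumed behavior:

def get_inter_frame(frames):
    # Restrict every frame to the index/columns shared by all of them.
    idx = frames[0].index
    cols = frames[0].columns
    for f in frames[1:]:
        idx = idx.intersection(f.index)
        cols = cols.intersection(f.columns)
    return [f.reindex(index=idx, columns=cols) for f in frames]
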
Example #5
import os

import pandas as pd


def handle_data():
    size = load_data('size')['size']
    save(size, 'size')

    price = load_data('stockCloseM').stack()
    price.name = 'price'
    save(price, 'price')

    # Each of these indicators sits in its own csv with a (t, sid) MultiIndex.
    for name in ['beta', 'sd', 'see', 'strev']:
        s = pd.read_csv(os.path.join(PATH, name + '.csv'),
                        index_col=[0, 1],
                        parse_dates=True)[name]
        save(s, name)

    factors = pd.read_csv(os.path.join(PATH, 'factors.csv'),
                          index_col=[0, 1],
                          parse_dates=True)
    print(factors.head())

    fns = list(filter(lambda fn: fn.endswith('.csv'), os.listdir(PATH)))
    dfs = []
    for fn in fns:
        df = pd.read_csv(os.path.join(PATH, fn),
                         index_col=[0, 1],
                         parse_dates=True)
        dfs.append(df)
        print(fn, df.shape)

    comb = pd.concat(dfs, axis=1)
    comb = comb.dropna(axis=0, how='all')
    save(comb, 'comb')
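
save and load_data are the project's persistence helpers and are not shown here. A minimal pickle-based pair with the same call shape, purely as an assumption (the data directory name is hypothetical):

import os
import pandas as pd

DATA_DIR = 'data'  # hypothetical location

def save(obj, name, sort_axis=True):
    # Persist a Series/DataFrame under a short name.
    if sort_axis:
        obj = obj.sort_index()
    obj.to_pickle(os.path.join(DATA_DIR, name + '.pkl'))

def load_data(name):
    return pd.read_pickle(os.path.join(DATA_DIR, name + '.pkl'))
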
Example #6
import pandas as pd


def get_hxz4():
    '''
    Calculate the hxz4 factors; refer to din.py for details about the
    indicators.

    References:
        Hou, K., Mo, H., Xue, C., and Zhang, L. (2018). Motivating Factors
        (Rochester, NY: Social Science Research Network).

    Returns:
        None. The factor table is saved as 'hxz4M'.
    '''
    v1 = 'size__size'  # size
    v2 = 'inv__inv'  #I/A
    v3 = 'roe__roe'  # ROE

    comb = combine_with_datalagged([v1, v2, v3], sample_control=True)
    comb = comb.dropna()

    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v1], 2, range(1, 3)))

    comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v2], [0, 0.3, 0.7, 1.0], range(1, 4)))

    comb['g3'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v3], [0, 0.3, 0.7, 1.0], range(1, 4)))

    assets = comb.groupby(
        ['t', 'g1', 'g2',
         'g3']).apply(lambda df: my_average(df, 'stockEretM', wname='weight'))

    df1 = assets.groupby(['t', 'g1']).mean().unstack(level='g1')
    smb = df1[1] - df1[2]

    df2 = assets.groupby(['t', 'g2']).mean().unstack(level='g2')
    ria = df2[3] - df2[1]

    df3 = assets.groupby(['t', 'g3']).mean().unstack(level='g3')
    roe = df3[3] - df3[1]

    rp = load_data('rpM')
    hxz4 = pd.concat([rp, smb, ria, roe],
                     axis=1,
                     keys=['rp', 'smb', 'ria', 'roe'])
    hxz4.columns.name = 'type'
    hxz4 = hxz4.dropna()
    save(hxz4, 'hxz4M')
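
assign_port_id is defined elsewhere. The calls above pass either an integer portfolio count or explicit quantile breakpoints (Example #7 below uses ten equal-count groups), which matches pandas' qcut; a minimal sketch under that assumption:

import pandas as pd

def assign_port_id(series, q, labels):
    # q may be an int (number of equal-count portfolios) or a list of
    # quantile breakpoints in [0, 1]; labels has one entry per bin.
    return pd.qcut(series, q, labels=list(labels))
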
Example #7
import os

import numpy as np
import pandas as pd


def lagged_n(n=0):
    # `directory` and `history` are module-level names defined elsewhere.
    predicted = pd.read_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
    # Shift realized excess returns back by n months so that the prediction
    # made at t is paired with the return earned at t+n.
    eret = load_data('stockEretM').shift(-n).stack()
    comb = pd.concat([eret, predicted.stack().replace(0, np.nan)],
                     axis=1,
                     keys=['eret', 'predicted'])
    comb.index.names = ['t', 'sid']
    comb = comb.dropna()
    comb['g'] = comb.groupby(
        't', group_keys=False).apply(lambda df: assign_port_id(
            df['predicted'], 10, ['g{}'.format(i) for i in range(1, 11)]))

    ts = comb.groupby(['t', 'g'])['eret'].mean().unstack('g')
    ts.columns = ts.columns.astype(str)
    ts['spread'] = ts['g10'] - ts['g1']
    print(n)
    return ts['spread']
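
A hypothetical usage example: collecting the g10-g1 spread for several horizons shows how quickly the predictive power decays (the horizon range is illustrative):

import pandas as pd

spreads = pd.concat([lagged_n(n) for n in range(6)], axis=1,
                    keys=['n={}'.format(n) for n in range(6)])
print(spreads.mean())
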
Example #8
import os

import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm  # assumed: matches the formula-based sm.ols call below


def regress_predicted_on_realized():
    # `directory` and `history` are module-level names defined elsewhere.
    predicted = pd.read_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
    stockEret = load_data('stockEretM')
    predicted, stockEret = get_inter_frame([predicted, stockEret])

    months = []
    models = []
    count = []
    for month, p in predicted.iterrows():
        # p denotes predicted return
        # r denotes realized return
        r = stockEret.loc[month]
        df = pd.concat([p, r], axis=1, keys=['predicted', 'realized'])
        df = df.dropna()
        model = sm.ols(formula='realized ~ predicted', data=df).fit(use_t=True)
        months.append(month)
        models.append(model)
        count.append(df.shape[0])
        print(month)

    slope = pd.Series([m.params['predicted'] for m in models], index=months)
    r2 = pd.Series([m.rsquared for m in models], index=months)
    n = pd.Series(count, index=months)

    plt.plot(slope.index, slope.values, 'o')
    plt.show()

    plt.plot(r2.index, r2.values, 'o')
    plt.show()

    plt.plot(n.index, n.values)
    plt.show()

    print(slope.describe())
    print(r2.describe())

    # Newey-West test of whether the mean monthly slope differs from zero.
    nw = newey_west(formula='predicted ~ 1',
                    df=pd.DataFrame(slope, columns=['predicted']),
                    lags=5)
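
newey_west is a project helper that is not shown here. A common implementation, assumed for illustration only, fits OLS with HAC (Newey-West) standard errors via statsmodels:

import statsmodels.formula.api as smf

def newey_west(formula, df, lags=5):
    # Regressing on a constant ('predicted ~ 1') tests whether the mean of
    # the series differs from zero while accounting for serial correlation.
    return smf.ols(formula, data=df).fit(
        cov_type='HAC', cov_kwds={'maxlags': lags}, use_t=True)
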
def visulize_pu():
    pu = load_data('pu')
    pu.plot()
    plt.show()