Exemple #1
0
def get_op():
    """
    Calculate operating profitability (OP) as in the Fama-French
    five-factor model: operating profit divided by book equity.

    Returns:
        None. The monthly (t x sid) panel is persisted via save(op, 'op').
    """
    # -------- operating profit (numerator) --------
    # 'B001300000' is operating profit (营业利润) in the GTA income statement.
    OP = parse_financial_report('FS_Comins', 'B001300000')

    # -------- book value (denominator) --------
    # 'A003100000' is total equity attributable to the parent company
    # (归属于母公司所有者权益合计) on the GTA balance sheet.
    BV = parse_financial_report('FS_Combas', 'A003100000')
    # Trick: delete those samples with a non-positive denominator.
    BV[BV <= 0] = np.nan

    OP, BV = get_inter_frame([OP, BV])
    op = OP / BV
    op.index.name = 't'
    op.columns.name = 'sid'
    op = quaterly2monthly(op)
    save(op, 'op')
Exemple #2
0
def get_momentum():
    """
    Build a panel of momentum signals from monthly stock returns.

    The one-month lag in the 'lagged' variants avoids the short-term
    reversal effect first documented by Jegadeesh (1990).

    Returns:
        None. Saves the (t, sid) x type panel (in percent) as 'momentum'.
    """
    stockRetM = load_data('stockRetM')
    stk = stockRetM.stack()
    stk.index.names = ['t', 'sid']

    # Lagged windows: [span, min_obs]. The formation window is 11 months
    # and the value at time t is excluded, so the span is 12 rather than 11.
    d_lag = OrderedDict({'mom': [12, 9],
                         'r12': [13, 10],
                         'r6': [7, 5]})
    # Non-lagged windows (include the most recent month).
    d_nonlag = OrderedDict({'R12M': [12, 10],
                            'R9M': [9, 7],
                            'R6M': [6, 5],
                            'R3M': [3, 3]})
    ss = []
    names = []
    for bn, bp in d_lag.items():
        ser = stk.groupby('sid').apply(lambda s: _before(s, bp[0], bp[1]))
        ss.append(ser)
        names.append(bn)

    for un, up in d_nonlag.items():
        ser = stk.groupby('sid').apply(lambda s: _upto(s, up[0], up[1]))
        ss.append(ser)
        names.append(un)

    momentum = pd.concat(ss, axis=1, keys=names)
    # fix: this attribute was assigned twice in the original; once suffices.
    momentum.columns.name = 'type'
    momentum = momentum * 100  # express in percent

    save(momentum, 'momentum', sort_axis=False)
Exemple #3
0
def get_inv():
    """
    Compute the growth rate of total assets:
        I/A in Hou, Xue, and Zhang, "Digesting Anomalies."
        inv in Fama and French, "A Five-Factor Asset Pricing Model."

    NOTE(review): the original docstring claimed this is "the same as ROE"
    computed from table FI_T8 / field F080602A — that snippet appears to
    be a copy-paste from get_roe; presumably it meant a pre-computed
    growth field. Confirm against the GTA field dictionary.

    References:
        Hou, K., Xue, C., and Zhang, L. (2014). Digesting Anomalies: An
        Investment Approach. Review of Financial Studies 28, 650-705.

    Returns:
        None. The monthly panel is persisted via save(inv, 'inv').
    """
    # 'A001000000' is total assets (总资产) on the GTA balance sheet.
    ta = parse_financial_report('FS_Combas', 'A001000000')
    # Trick: delete samples with a non-positive denominator.
    ta[ta <= 0] = np.nan
    inv = ta.pct_change()
    inv = quaterly2monthly(inv)
    save(inv, 'inv')
Exemple #4
0
def get_stInfo():
    '''
    Build boolean "not ST" masks for special-treatment (ST / *ST) stocks.

    for freq='M',delete all the months as long as ST or *ST appear in any day of that month,
    for freq='D',we only delete the current day with ST or *ST
    :return: None; saves 'stInfoD' (daily) and 'stInfoM' (monthly) panels.
    '''
    #TODO: how about PT
    df = pd.read_csv(os.path.join(DATA_SRC, 'TRD_Dalyr.csv'), encoding='gbk')
    df = df[['Trddt', 'Stkcd', 'Trdsta']]
    df.columns = ['t', 'sid', 'status']
    df['t'] = pd.to_datetime(df['t'])

    def func(df):
        # for information about the status refer to the documents
        # Keep a (month, sid) group only when neither status 2.0 nor 3.0
        # (presumably the ST / *ST trading-status codes — confirm against
        # the GTA documentation) appears in it.
        result = (2.0 not in df['status'].values) & (
            3.0 not in df['status'].values)
        return result

    # Monthly mask: drop the whole month for a stock if any day of that
    # month carries an ST/*ST status.
    df1 = df.groupby([pd.Grouper(key='t', freq='M'), 'sid']).filter(func)

    # .sum() is used only to collapse to one row per (month, sid); the
    # summed values are discarded in favor of the constant flag below.
    dfM = df1.groupby([pd.Grouper(key='t', freq='M'), 'sid']).sum()
    dfM['not_st'] = True
    dfM = dfM['not_st']
    dfM = dfM.unstack()
    dfM.columns = dfM.columns.astype(str)

    # Daily mask: NOTE(review): contrary to the docstring, ST days are
    # not removed here — every observed (day, sid) pair is flagged True.
    # Presumably the daily filtering happens downstream; confirm.
    df['not_st'] = True
    dfD = df.set_index(['t', 'sid'])[['not_st']]
    dfD = dfD.sort_index(level='t')

    dfD = dfD['not_st'].unstack()
    dfD.columns = dfD.columns.astype(str)

    save(dfD, 'stInfoD')
    save(dfM, 'stInfoM')
Exemple #5
0
def get_roe():
    """
    Quarterly ROE as in HXZ: net profit over one-quarter-lagged book
    equity.

    References:
        Hou, K., Xue, C., and Zhang, L. (2014). Digesting Anomalies: An
        Investment Approach. Review of Financial Studies 28, 650-705.

    Returns:
        None. The monthly panel is persisted via save(roe, 'roe').
    """
    # Net profit (净利润) from the income statement, quarterly frequency.
    income = parse_financial_report('FS_Comins', 'B002000000', freq='Q')

    # Book equity attributable to the parent company
    # (归属于母公司所有者权益合计), quarterly frequency.
    BV = parse_financial_report('FS_Combas', 'A003100000', freq='Q')
    # Trick: delete samples with a non-positive denominator.
    BV[BV <= 0] = np.nan

    # Trick: divide by one-quarter-lagged book equity.
    roe = income / BV.shift(1, freq='3M')

    # TODO: adjust with the announcement date.
    # This differs a little from the paper: to account for reporting lag
    # we simply shift forward 6 months, whereas the paper uses earnings
    # "in the months immediately after the most recent public quarterly
    # earnings announcement dates."
    roe = quaterly2monthly(roe, shift='6M')
    save(roe, 'roe')
Exemple #6
0
def get_bm():
    """
    Compute book-to-market (bm) and log(bm), stacked into one long panel.

    This function can be bookmarked as a snippet of how to manipulate a
    date index in pandas.

    Note: a little different from the book — be and me here are per-share
    values rather than totals over all floating shares; this does not
    affect the ratio bm.

    :return: None; saves the (t, sid) x {bm, logbm} panel as 'value'.
    """
    # be=load_data('bps')
    be = read_unfiltered('bps')
    # Keep only December (fiscal year-end) book values.
    be = be[be.index.month == 12]
    me = read_unfiltered('stockCloseY')
    # me=load_data('stockCloseY')
    be, me = get_inter_frame([be, me])

    bm = be / me
    bm[bm <= 0] = np.nan  # delete those samples with bm <= 0
    bm = quaterly2monthly(bm, shift='6M')  # 6-month reporting lag
    logbm = np.log(bm)

    x = pd.concat([bm.stack(), logbm.stack()], axis=1, keys=['bm', 'logbm'])
    x.index.names = ['t', 'sid']
    x.columns.name = 'type'

    save(x, 'value')
Exemple #7
0
def cal_sizes():
    """
    Build size-related variables: market cap, log market cap, and their
    Fama-French style June-fixed versions (the June value held constant
    for the following 11 months).
    """
    mktCap = read_unfiltered('capM')
    mktCap[mktCap <= 0] = np.nan
    size = np.log(mktCap)

    # June observations, forward-filled through the next May.
    junes = [m for m in mktCap.index.tolist() if m.month == 6]
    newIndex = pd.date_range(start=junes[0], end=mktCap.index[-1], freq='M')
    mktCap_ff = mktCap.loc[junes].reindex(index=newIndex)
    # limit=11 is required, or it would fill every NaN forward.
    mktCap_ff = mktCap_ff.ffill(limit=11)
    size_ff = np.log(mktCap_ff)

    # Stack each frame into a named long Series, preserving the original
    # column order of the combined panel.
    pieces = []
    for frame, label in [(mktCap, 'mktCap'), (mktCap_ff, 'mktCap_ff'),
                         (size, 'size'), (size_ff, 'size_ff')]:
        s = frame.stack()
        s.name = label
        pieces.append(s)

    x = pd.concat(pieces, axis=1)
    x.index.names = ['t', 'sid']
    x.columns.name = 'type'

    save(x, 'size')
def get_amihud_illiq():
    """
    Compute the Amihud illiquidity measure over several rolling windows,
    together with its natural log, and save the combined panel as 'illiq'.
    """
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = freq_end(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # TODO: gta's data is not monotonic_increasing; add these two
        # rows to other scripts as well.
        df = df.sort_index(level='t')

    # window label -> minimum number of observations required
    # (fix: renamed from `dict`, which shadowed the builtin)
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})

    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']

    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    #TODO:use valid observation for the whole project as page 276

    # adjust the format of the DataFrame
    # fix: pd.Index's second positional argument is `dtype`, not `name`;
    # the original passed the columns name into the dtype slot.
    illiq.columns = pd.Index(illiq.columns.astype(str),
                             name=illiq.columns.name)
    illiq = illiq.reset_index()
    illiq['t'] = freq_end(illiq['t'], 'M')
    illiq = illiq.set_index(['type', 't'])
    illiq = illiq.stack().unstack(level='type')

    #TODO: The data is really noisy,refer to outliers figures for details
    save(illiq, 'illiq')
def calculate_beta1():
    """
    Estimate betas for every (freq, window) configuration in parallel
    worker processes and save the monthly panel as 'beta'.
    """
    arg_list = get_arg_list()
    # fix: the original leaked the worker pool; the context manager
    # terminates it deterministically once the map completes.
    with multiprocessing.Pool(2) as pool:
        ss = pool.map(task1, arg_list)
    df = pd.concat(ss,
                   axis=1,
                   keys=['{}{}'.format(arg.freq, arg.w) for arg in arg_list])
    df = df.unstack('sid').resample('M').last().stack()  # convert to monthly
    save(df, 'beta', sort_axis=False)
Exemple #10
0
def get_ep():
    """
    Earnings-to-price ratio: the reciprocal of the PE column of GTA's
    daily market file, pivoted to (t x sid) and resampled to month-end.
    """
    raw = read_gta('STK_MKT_Dalyr', encoding='gbk')
    raw['t'] = pd.to_datetime(raw['TradingDate'])
    raw['sid'] = raw['Symbol'].astype(str)
    raw['ep'] = 1.0 / raw['PE']
    ep = pd.pivot_table(raw, values='ep', index='t', columns='sid')
    ep = ep.sort_index().resample('M').last()
    save(ep, 'ep')
def cal_sen():
    """
    Rolling sensitivity to policy uncertainty over several monthly
    windows, scaled by 100 and saved as 'sen'.
    """
    # window label -> minimum number of monthly observations required
    dictM = OrderedDict({'12M': 10, '24M': 20, '36M': 24, '60M': 24})
    combM = _get_comb()
    sen = groupby_rolling(combM, 'M', dictM, _sensitivity_to_pu) * 100
    sen = sen.stack().unstack(level=0)
    sen.index.names = ['t', 'sid']
    sen.columns.name = 'type'
    save(sen, 'sen', outliers=False)
Exemple #12
0
def get_tradingStatusD():
    """
    Daily boolean (t x sid) panel: True where the stock traded normally.

    Trick: Trdsta == 1 means "正常交易" (normal trading).
    """
    raw = read_gta('TRD_Dalyr', encoding='gbk')
    raw['is_normal'] = raw['Trdsta'] == 1.0
    raw['t'] = pd.to_datetime(raw['Trddt'])
    raw['sid'] = raw['Stkcd'].astype(str)
    status = pd.pivot_table(raw, values='is_normal', index='t', columns='sid')
    save(status, 'tradingStatusD')
    return status
Exemple #13
0
def get_stockEretD():
    """
    Daily excess stock returns: daily return minus the daily risk-free
    rate, saved as 'stockEretD'.
    """
    ret = get_stockRetD()
    rf = get_rfD()
    eret = ret.sub(rf, axis=0)
    # stockRetD is indexed by business dates while rfD uses calendar
    # dates; dropping all-NaN rows realigns the index to business dates.
    eret = eret.dropna(axis=0, how='all')

    save(eret, 'stockEretD')
    return eret
Exemple #14
0
def get_bps():
    """
    Book value per share (每股净资产): field F091001A of GTA table FI_T9,
    indexed by reporting period (Accper) and stock code (Stkcd).
    """
    frame = read_df_from_gta('FI_T9', 'F091001A', 'Accper', 'Stkcd')
    frame.index.name = 't'
    frame.index = pd.to_datetime(frame.index)
    frame.columns = frame.columns.astype(str)
    frame.columns.name = 'sid'
    save(frame, 'bps')
Exemple #15
0
def get_ff3M():  #fixme: there are some abnormal values
    """
    Monthly Fama-French three factors from GTA.

    Trick: MarkettypeID 'P9709' covers the whole A-share market including
    Shanghai/Shenzhen A shares and ChiNext; the factors are weighted by
    tradable market capitalization.
    """
    raw = read_gta('STK_MKT_ThrfacMonth')
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1']
    df = raw.loc[raw['MarkettypeID'] == 'P9709', keep]
    df.columns = ['t', 'rp', 'smb', 'hml']
    df = df.set_index('t')
    df.index = freq_end(df.index, 'M')
    df.columns.name = 'type'
    save(df, 'ff3M')
    return df
Exemple #16
0
def get_ffcM():
    """
    Monthly Carhart four factors from GTA.

    Trick: MarkettypeID 'P9709' covers the whole A-share market including
    ChiNext; the factors are weighted by tradable market capitalization.
    """
    raw = read_gta('STK_MKT_CarhartFourFactors')
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']
    df = raw.loc[raw['MarkettypeID'] == 'P9709', keep]
    df.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    df.columns.name = 'type'
    df = df.set_index('t')
    df.index = freq_end(df.index, 'M')

    save(df, 'ffcM')
Exemple #17
0
def get_ff3D():
    """
    Daily Fama-French three factors from GTA, weighted by tradable
    market capitalization.

    NOTE(review): the filter keeps MarkettypeID 'P9707', while the
    original comment described 'P9709' (whole A-share market incl.
    ChiNext) — confirm which market type is actually intended.
    """
    raw = read_gta('STK_MKT_ThrfacDay')
    mask = raw['MarkettypeID'] == 'P9707'
    df = raw.loc[mask, ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']]
    df.columns = ['t', 'rp', 'smb', 'hml']
    df.columns.name = 'type'
    df = df.set_index('t')
    df.index = freq_end(df.index, 'D')
    save(df, 'ff3D')
    return df
Exemple #18
0
def get_bps_wind():
    """
    Book value per share pulled from Wind, monthly frequency.

    Generated from the Wind code generator, e.g.:

    w.wsd("000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ", "bps",
          "2017-02-04", "2018-03-05",
          "currencyType=;Period=Q;Fill=Previous")

    :return: None; saves the (t x sid) frame as 'bps_wind'.
    """
    frame = read_wind('bps', freq='M')
    frame.index.name = 't'
    frame.columns.name = 'sid'
    save(frame, 'bps_wind')
Exemple #19
0
def get_pu():
    """
    Download the China policy-uncertainty index and save the monthly
    series as 'pu'.

    :return:
    """
    url = r'http://www.policyuncertainty.com/media/China_Policy_Uncertainty_Data.xlsx'
    # fix: `skip_footer` was deprecated and later removed from pandas'
    # read_excel; the supported keyword is `skipfooter`.
    pu = pd.read_excel(url, skipfooter=1)
    pu.columns = ['year', 'month', 'pu']
    # Build a 'YYYY-M' label and normalize it to a month-end timestamp.
    pu['t'] = pu['year'].map(str) + '-' + pu['month'].map(str)
    pu['t'] = freq_end(pu['t'], 'M')
    pu = pu.set_index('t')
    pu = pu['pu']
    save(pu, 'pu')
Exemple #20
0
def get_ff5M():
    """
    Monthly Fama-French five factors from GTA.

    Tricks: MarkettypeID 'P9709' = whole A-share market including
    ChiNext; tradable-cap weighted; Portfolios == 1 selects the 2x3
    sorting scheme.
    """
    raw = read_gta('STK_MKT_FivefacMonth')
    mask = (raw['MarkettypeID'] == 'P9709') & (raw['Portfolios'] == 1)
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
    df = raw.loc[mask, keep]
    df.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    df.columns.name = 'type'
    df = df.set_index('t')
    df.index = freq_end(df.index, 'M')
    save(df, 'ff5M')
Exemple #21
0
def get_stockRetD():
    """
    Daily stock returns as a (t x sid) frame, saved as 'stockRetD'.

    Uses Dretwd (考虑现金红利再投资的收益): the daily return with cash
    dividends reinvested.
    """
    df = read_df_from_gta('TRD_Dalyr', 'Dretwd', 'Trddt', 'Stkcd')

    df.index.name = 't'
    df.index = pd.to_datetime(df.index)  #TODO: dayend?
    df.columns = df.columns.astype(str)
    df.columns.name = 'sid'

    save(df, 'stockRetD')
    return df
Exemple #22
0
def get_ff6():
    """
    Fama-French five factors augmented with a momentum factor, saved as
    'ff6M'.
    """
    v_size = 'size__size'
    v_mom = 'momentum__r12'

    # 2x3 independent double sort on size and 12-month momentum.
    smb, mom = two_sorting_factor(v_size, v_mom, 2, [0, 0.3, 0.7, 1.0],
                                  sample_control=False, independent=True)
    mom.index.name = 't'
    mom.name = 'mom'

    ff5 = read_unfiltered('ff5M')
    ff6 = pd.concat([ff5, mom], axis=1).dropna()
    ff6.columns.name = 'type'

    save(ff6, 'ff6M')
Exemple #23
0
def get_mktRetM():
    """
    Monthly market return, saved as 'mktRetM'.

    Trick: Cmretwdos is the composite market return with cash dividends
    reinvested, weighted by tradable market capitalization
    (考虑现金红利再投资的综合日市场回报率, 流通市值加权平均法).
    """
    df = read_gta('TRD_Cnmont')
    # Markettype 21 = composite of A shares and ChiNext (综合A股和创业板).
    df = df[df['Markettype'] == 21]

    df = df.set_index('Trdmnt')
    df.index = freq_end(df.index, 'M')
    df.index.name = 't'

    s = df['Cmretwdos']
    s.name = 'mktRetM'

    save(s, 'mktRetM')
Exemple #24
0
def cal_beta():
    """
    Rolling betas at daily and monthly frequencies, combined into one
    long (t, sid) x type panel and saved as 'beta'.
    """
    # window label -> minimum number of observations required
    # TODO: why are so many months lost? refer to betaD.csv
    dictD = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200,
                         '24M': 450})
    dictM = OrderedDict({'12M': 10, '24M': 20, '36M': 24, '60M': 24})
    combD, combM = _get_comb()

    betaD = groupby_rolling(combD, 'D', dictD, _beta).stack().unstack(level=0)
    betaM = groupby_rolling(combM, 'M', dictM, _beta).stack().unstack(level=0)

    # combine the two frequencies side by side
    x = pd.concat([betaD, betaM], axis=1)
    x.index.names = ['t', 'sid']
    x.columns.name = 'type'

    save(x, 'beta', sort_axis=False)
Exemple #25
0
def get_hxz4():
    '''
    calculate hxz4 factors,refer to din.py for details about the indicators

    References:
        Hou, K., Mo, H., Xue, C., and Zhang, L. (2018). Motivating Factors (Rochester, NY: Social Science Research Network).

    Returns:
        None. Saves the monthly factor panel as 'hxz4M'.
    '''
    v1 = 'size__size'
    v2 = 'inv__inv'  #I/A
    v3 = 'roe__roe'  # ROE

    comb = combine_with_datalagged([v1, v2, v3], sample_control=True)
    comb = comb.dropna()

    # Size: split into 2 groups each month.
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v1], 2, range(1, 3)))

    # I/A: 3 groups (30/40/30 breakpoints) within each size group.
    comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v2], [0, 0.3, 0.7, 1.0], range(1, 4)))

    # ROE: 3 groups (30/40/30 breakpoints) within each size group.
    comb['g3'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v3], [0, 0.3, 0.7, 1.0], range(1, 4)))

    # Weighted average excess return of each (size, I/A, ROE) portfolio.
    assets = comb.groupby(
        ['t', 'g1', 'g2',
         'g3']).apply(lambda df: my_average(df, 'stockEretM', wname='weight'))

    df1 = assets.groupby(['t', 'g1']).mean().unstack(level='g1')
    smb = df1[1] - df1[2]

    df2 = assets.groupby(['t', 'g2']).mean().unstack(level='g2')
    ria = df2[3] - df2[1]

    df3 = assets.groupby(['t', 'g3']).mean().unstack(level='g3')
    # fix: the ROE factor must be built entirely from the ROE sort (df3);
    # the original subtracted df2[1] (the I/A sort) by mistake.
    roe = df3[3] - df3[1]

    rp = load_data('rpM')
    hxz4 = pd.concat([rp, smb, ria, roe],
                     axis=1,
                     keys=['rp', 'smb', 'ria', 'roe'])
    hxz4.columns.name = 'type'
    hxz4 = hxz4.dropna()
    save(hxz4, 'hxz4M')
Exemple #26
0
def get_capM():
    """
    Monthly circulation (tradable) market capitalization per stock.

    Trick: Msmvosd (月个股流通市值) is expressed in thousands of yuan.
    """
    # TODO: convert the unit to millions as in Cakici, Chan, and Topyan,
    # "Cross-Sectional Stock Return Predictability in China."
    frame = read_df_from_gta('TRD_Mnth', 'Msmvosd', 'Trdmnt', 'Stkcd')
    frame.index.name = 't'
    frame.index = freq_end(frame.index, 'M')
    frame.columns = frame.columns.astype(str)
    frame.columns.name = 'sid'
    save(frame, 'capM')
Exemple #27
0
def get_mktRetD():
    """
    Daily market return, saved as 'mktRetD'.

    Trick: Cdretwdos is the composite daily market return with cash
    dividends reinvested, weighted by tradable market capitalization
    (考虑现金红利再投资的综合日市场回报率, 流通市值加权平均法).
    """
    df = read_gta('TRD_Cndalym')

    # Trick: Markettype 21 = composite of A shares and ChiNext.
    df = df[df['Markettype'] == 21]

    df = df.set_index('Trddt')
    df.index.name = 't'
    df.index = pd.to_datetime(df.index)

    s = df['Cdretwdos']
    # TODO: put this line into a check function or unify function?
    s.name = 'mktRetD'
    save(s, 'mktRetD')
Exemple #28
0
def get_stockCloseD():
    """
    Daily stock close prices as a (t x sid) frame, saved as
    'stockCloseD'.

    Uses Adjprcwd: the adjusted close price with dividends taken into
    consideration.
    """
    df = read_df_from_gta('TRD_Dalyr', 'Adjprcwd', 'Trddt', 'Stkcd')

    df.index.name = 't'
    df.index = pd.to_datetime(df.index)
    df.columns.name = 'sid'
    df.columns = df.columns.astype(str)

    # Sort rows (dates) and columns (stock codes).
    df = df.sort_index(axis=0).sort_index(axis=1)

    save(df, 'stockCloseD')
    return df
Exemple #29
0
def get_beta_sw_dm():
    '''
    refer to page 5 of cakici for details about this beta.

    Dimson-style beta: within each month, regress the stock's excess
    return on the contemporaneous and one-day-lagged market excess
    return and sum the two slope coefficients.

    Returns:
        None. Saves a (t, sid)-indexed Series named 'beta' as
        'beta_sw_dm'.
    '''
    #beta
    # daily risk-free rate, market return, and individual stock returns
    rf=read_filtered('rfD')
    rm=read_filtered('mktRetD')
    ri=read_filtered('stockRetD')
    df=ri.stack().to_frame()
    df.columns=['ri']
    df=df.join(pd.concat([rf,rm],axis=1))
    df.columns=['ri','rf','rm']
    df.index.names=['t','sid']

    df['y']=df['ri']-df['rf']    # stock excess return
    df['x2']=df['rm']-df['rf']   # market excess return
    df['x1']=df.groupby('sid')['x2'].shift(1)  # one-day lag, per stock

    def _cal_beta(x):
        # Sum of the slopes on the lagged (x1) and contemporaneous (x2)
        # market excess return.
        result=sm.ols('y ~ x1 + x2',data=x).fit().params[['x1','x2']]
        return result.sum()

    def _for_one_sid(x):
        # x is multiIndex Dataframe
        nx=x.reset_index('sid')
        sid=nx['sid'][0]
        print(sid)
        # Map every date to its month-end so each calendar month forms
        # one regression sample.
        _get_monthend=lambda dt:dt+MonthEnd(0)
        #filter out those months with observations less than MIN_SAMPLES
        nx=nx.groupby(_get_monthend).filter(lambda a: a.dropna().shape[0] >= MIN_SAMPLES)
        # Returns None when no month survives the filter; groupby.apply
        # drops such groups from the result.
        if nx.shape[0]>0:
            result=nx.groupby(_get_monthend).apply(_cal_beta)
            return result

    beta=df.groupby('sid').apply(_for_one_sid)
    beta.index.names=['sid','t']
    beta=beta.reorder_levels(['t','sid']).sort_index(level='t')
    beta.name='beta'

    save(beta,'beta_sw_dm')
Exemple #30
0
def get_ff3M_resset():
    """
    Monthly Fama-French three factors from the RESSET database.

    :return:
    """
    raw = read_resset('THRFACDAT_MONTHLY')
    # 'Exchflg == 0': all exchanges; 'Mktflg == A': A-share market only.
    raw = raw[(raw['Exchflg'] == 0) & (raw['Mktflg'] == 'A')]
    raw = raw.set_index('Date')
    raw.index = freq_end(raw.index, 'M')
    raw.index.name = 't'
    # Factors weighted with tradable market capitalization.
    df = raw[['Rmrf_tmv', 'Smb_tmv', 'Hml_tmv']]
    df.columns = ['rp', 'smb', 'hml']
    df.columns.name = 'type'
    save(df, 'ff3M_resset')