def get_op():
    """Operating profitability (OP) as defined in the Fama-French five-factor model.

    OP = operating profit / book equity attributable to the parent company,
    converted from quarterly to monthly frequency and saved as 'op'.
    """
    # operating profit (营业利润) from the income statement
    OP = parse_financial_report('FS_Comins', 'B001300000')
    # book equity attributable to the parent company (归属于母公司所有者权益合计)
    BV = parse_financial_report('FS_Combas', 'A003100000')
    # Trick: discard samples with a non-positive denominator
    BV[BV <= 0] = np.nan
    OP, BV = get_inter_frame([OP, BV])
    ratio = OP / BV
    ratio.index.name = 't'
    ratio.columns.name = 'sid'
    ratio = quaterly2monthly(ratio)
    save(ratio, 'op')
def get_momentum():
    """Compute momentum indicators over several formation windows.

    Produces lagged windows ('mom', 'r12', 'r6') — a one-month gap is imposed
    to avoid the short-term reversal effect first documented by
    Jegadeesh (1990) — and non-lagged windows ('R12M', 'R9M', 'R6M', 'R3M').
    Values are expressed in percent and saved as 'momentum'.
    """
    stockRetM = load_data('stockRetM')
    stk = stockRetM.stack()
    stk.index.names = ['t', 'sid']
    # {name: [window, min_periods]}; the window is 11 months and time t itself
    # is excluded, so 12 is used rather than 11
    d_lag = OrderedDict({'mom': [12, 9],
                         'r12': [13, 10],
                         'r6': [7, 5]})
    # non-lagged windows
    d_nonlag = OrderedDict({'R12M': [12, 10],
                            'R9M': [9, 7],
                            'R6M': [6, 5],
                            'R3M': [3, 3]})
    ss = []
    names = []
    for bn, bp in d_lag.items():
        ser = stk.groupby('sid').apply(lambda s: _before(s, bp[0], bp[1]))
        ss.append(ser)
        names.append(bn)
    for un, up in d_nonlag.items():
        ser = stk.groupby('sid').apply(lambda s: _upto(s, up[0], up[1]))
        ss.append(ser)
        names.append(un)
    momentum = pd.concat(ss, axis=1, keys=names)
    momentum = momentum * 100  # express in percent
    # fix: the original assigned momentum.columns.name = 'type' twice
    momentum.columns.name = 'type'
    save(momentum, 'momentum', sort_axis=False)
def get_inv():
    """Growth rate of total assets.

    Known as I/A in Hou, Xue, and Zhang, "Digesting Anomalies", and as inv in
    Fama and French, "A Five-Factor Asset Pricing Model".

    The same indicator could alternatively be pulled directly from GTA::

        tbname='FI_T8'
        varname='F080602A'
        roe=parse_financial_report(tbname, varname)
        roe=quaterly2monthly(roe)

    References:
        Hou, K., Xue, C., and Zhang, L. (2014). Digesting Anomalies:
        An Investment Approach. Review of Financial Studies 28, 650-705.
    """
    # total assets (总资产) from the balance sheet
    total_assets = parse_financial_report('FS_Combas', 'A001000000')
    # trick: discard samples with a non-positive denominator
    total_assets[total_assets <= 0] = np.nan
    growth = total_assets.pct_change()
    growth = quaterly2monthly(growth)
    save(growth, 'inv')
def get_stInfo(): ''' for freq='M',delete all the months as long as ST or *ST appear in any day of that month, for freq='D',we only delete the current day with ST or *ST :return: ''' #TODO: how about PT df = pd.read_csv(os.path.join(DATA_SRC, 'TRD_Dalyr.csv'), encoding='gbk') df = df[['Trddt', 'Stkcd', 'Trdsta']] df.columns = ['t', 'sid', 'status'] df['t'] = pd.to_datetime(df['t']) def func(df): # for information about the status refer to the documents result = (2.0 not in df['status'].values) & ( 3.0 not in df['status'].values) return result df1 = df.groupby([pd.Grouper(key='t', freq='M'), 'sid']).filter(func) dfM = df1.groupby([pd.Grouper(key='t', freq='M'), 'sid']).sum() dfM['not_st'] = True dfM = dfM['not_st'] dfM = dfM.unstack() dfM.columns = dfM.columns.astype(str) df['not_st'] = True dfD = df.set_index(['t', 'sid'])[['not_st']] dfD = dfD.sort_index(level='t') dfD = dfD['not_st'].unstack() dfD.columns = dfD.columns.astype(str) save(dfD, 'stInfoD') save(dfM, 'stInfoM')
def get_roe():
    """Return on equity (ROE) as in Hou, Xue, and Zhang.

    References:
        Hou, K., Xue, C., and Zhang, L. (2014). Digesting Anomalies:
        An Investment Approach. Review of Financial Studies 28, 650-705.
    """
    # net profit (净利润) from the income statement, quarterly
    net_income = parse_financial_report('FS_Comins', 'B002000000', freq='Q')
    # book equity attributable to the parent company (归属于母公司所有者权益合计)
    book = parse_financial_report('FS_Combas', 'A003100000', freq='Q')
    # trick: discard samples with a non-positive denominator
    book[book <= 0] = np.nan
    # trick: divide by one-quarter-lagged book equity
    roe = net_income / book.shift(1, freq='3M')
    # TODO: adjust with the announcement date.
    # This differs slightly from the paper: we simply shift forward 6 months,
    # whereas the paper uses earnings "in the months immediately after the
    # most recent public quarterly earnings announcement dates."
    roe = quaterly2monthly(roe, shift='6M')
    save(roe, 'roe')
def get_bm():
    """Book-to-market ratio (and its log), saved as 'value'.

    Bookmarked as a snippet of how to manipulate a date index in pandas.
    Slightly different from the book: be and me here are per-share values
    rather than totals over all floating shares, which does not affect bm.
    :return:
    """
    book = read_unfiltered('bps')
    book = book[book.index.month == 12]  # keep fiscal-year-end (December) values only
    market = read_unfiltered('stockCloseY')
    book, market = get_inter_frame([book, market])
    bm = book / market
    bm[bm <= 0] = np.nan  # discard samples with non-positive bm
    bm = quaterly2monthly(bm, shift='6M')  # six-month lag for reporting delay
    combined = pd.concat([bm.stack(), np.log(bm).stack()],
                         axis=1, keys=['bm', 'logbm'])
    combined.index.names = ['t', 'sid']
    combined.columns.name = 'type'
    save(combined, 'value')
def cal_sizes():
    """Size measures: market cap, log size, and their June-fixed FF variants.

    The *_ff variants hold each June's market cap fixed for the following
    eleven months, following the Fama-French portfolio-formation convention.
    """
    cap = read_unfiltered('capM')
    cap[cap <= 0] = np.nan
    size = np.log(cap)
    junes = [m for m in cap.index.tolist() if m.month == 6]
    monthly_index = pd.date_range(start=junes[0], end=cap.index[-1], freq='M')
    cap_ff = cap.loc[junes].reindex(index=monthly_index)
    # limit=11 is required, otherwise every NaN would be filled forward
    cap_ff = cap_ff.ffill(limit=11)
    size_ff = np.log(cap_ff)
    # stack each frame into a named long series, then combine
    pieces = []
    for frame, label in ((cap, 'mktCap'), (cap_ff, 'mktCap_ff'),
                         (size, 'size'), (size_ff, 'size_ff')):
        s = frame.stack()
        s.name = label
        pieces.append(s)
    combined = pd.concat(pieces, axis=1)
    combined.index.names = ['t', 'sid']
    combined.columns.name = 'type'
    save(combined, 'size')
def get_amihud_illiq():
    """Amihud illiquidity measure over several rolling windows.

    Computes the measure (and its log) from daily returns and trading volume
    for 1/3/6/12-month windows, reshapes to (t, sid) x type, and saves it
    as 'illiq'.
    """
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = freq_end(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # TODO: GTA's data is not monotonic_increasing; add these two rows
        # to other scripts as well
        df = df.sort_index(level='t')
    # window length -> minimum number of observations
    # fix: renamed from `dict`, which shadowed the builtin
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})
    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']
    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276
    # adjust the format of the DataFrame
    # fix: `name` must be a keyword argument; the original passed it
    # positionally into pd.Index's `dtype` slot
    illiq.columns = pd.Index(illiq.columns.astype(str),
                             name=illiq.columns.name)
    illiq = illiq.reset_index()
    illiq['t'] = freq_end(illiq['t'], 'M')
    illiq = illiq.set_index(['type', 't'])
    illiq = illiq.stack().unstack(level='type')
    # TODO: the data is really noisy, refer to outliers figures for details
    save(illiq, 'illiq')
def calculate_beta1():
    """Estimate betas in parallel for every (freq, window) combination.

    Runs `task1` over the argument list with two worker processes, combines
    the results column-wise keyed by '<freq><window>', resamples to monthly,
    and saves as 'beta'.
    """
    arg_list = get_arg_list()
    # fix: use the pool as a context manager so worker processes are
    # terminated deterministically instead of being leaked
    with multiprocessing.Pool(2) as pool:
        ss = pool.map(task1, arg_list)
    df = pd.concat(ss, axis=1,
                   keys=['{}{}'.format(arg.freq, arg.w) for arg in arg_list])
    df = df.unstack('sid').resample('M').last().stack()  # convert to monthly
    save(df, 'beta', sort_axis=False)
def get_ep():
    """Earnings-to-price ratio: the inverse of PE, resampled to month-end."""
    raw = read_gta('STK_MKT_Dalyr', encoding='gbk')
    raw['t'] = pd.to_datetime(raw['TradingDate'])
    raw['sid'] = raw['Symbol'].astype(str)
    raw['ep'] = 1.0 / raw['PE']  # invert the price-earnings ratio
    ep = pd.pivot_table(raw, values='ep', index='t', columns='sid')
    ep = ep.sort_index().resample('M').last()
    save(ep, 'ep')
def cal_sen():
    """Sensitivity to policy uncertainty over monthly rolling windows.

    Saved as 'sen' (values in percent), shaped (t, sid) x type.
    """
    # window length -> minimum number of observations
    window_min_obs = OrderedDict({'12M': 10, '24M': 20, '36M': 24, '60M': 24})
    comb = _get_comb()
    sen = groupby_rolling(comb, 'M', window_min_obs, _sensitivity_to_pu) * 100
    sen = sen.stack().unstack(level=0)
    sen.index.names = ['t', 'sid']
    sen.columns.name = 'type'
    save(sen, 'sen', outliers=False)
def get_tradingStatusD():
    """Daily trading-status flags: True where the stock traded normally.

    :return: wide boolean DataFrame (index=t, columns=sid), also saved
        as 'tradingStatusD'.
    """
    raw = read_gta('TRD_Dalyr', encoding='gbk')
    # Trick: Trdsta == 1 means "正常交易" (normal trading)
    raw['is_normal'] = raw['Trdsta'] == 1.0
    raw['t'] = pd.to_datetime(raw['Trddt'])
    raw['sid'] = raw['Stkcd'].astype(str)
    status = pd.pivot_table(raw, values='is_normal', index='t', columns='sid')
    save(status, 'tradingStatusD')
    return status
def get_stockEretD():
    """Daily excess returns: stock return minus the daily risk-free rate.

    :return: wide DataFrame of excess returns, also saved as 'stockEretD'.
    """
    returns = get_stockRetD()
    rf = get_rfD()
    # returns are indexed on business dates while rf uses calendar dates;
    # subtracting then dropping all-NaN rows realigns to business dates
    excess = returns.sub(rf, axis=0)
    excess = excess.dropna(axis=0, how='all')
    save(excess, 'stockEretD')
    return excess
def get_bps():
    """Book value per share (每股净资产) from GTA table FI_T9, saved as 'bps'."""
    # F091001A = net assets per share; Accper = accounting period; Stkcd = stock code
    frame = read_df_from_gta('FI_T9', 'F091001A', 'Accper', 'Stkcd')
    frame.index = pd.to_datetime(frame.index)
    frame.index.name = 't'
    frame.columns = frame.columns.astype(str)
    frame.columns.name = 'sid'
    save(frame, 'bps')
def get_ff3M():
    """Monthly Fama-French three factors from GTA, saved as 'ff3M'."""
    # FIXME: there are some abnormal values
    raw = read_gta('STK_MKT_ThrfacMonth')
    # trick: P9709 = the whole A-share market including Shanghai/Shenzhen
    # A-shares and ChiNext; factors weighted by tradable market cap
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1']
    ff3 = raw.loc[raw['MarkettypeID'] == 'P9709', keep]
    ff3.columns = ['t', 'rp', 'smb', 'hml']
    ff3 = ff3.set_index('t')
    ff3.index = freq_end(ff3.index, 'M')
    ff3.columns.name = 'type'
    save(ff3, 'ff3M')
    return ff3
def get_ffcM():
    """Monthly Carhart four factors from GTA, saved as 'ffcM'."""
    raw = read_gta('STK_MKT_CarhartFourFactors')
    # trick: P9709 = whole A-share market incl. ChiNext;
    # factors weighted by tradable market cap
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']
    ffc = raw.loc[raw['MarkettypeID'] == 'P9709', keep]
    ffc.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    ffc.columns.name = 'type'
    ffc = ffc.set_index('t')
    ffc.index = freq_end(ffc.index, 'M')
    save(ffc, 'ffcM')
def get_ff3D():
    """Daily Fama-French three factors from GTA, saved as 'ff3D'."""
    raw = read_gta('STK_MKT_ThrfacDay')
    # NOTE(review): the daily table filters on P9707 while the monthly
    # functions use P9709 (whole A-share market incl. ChiNext) — confirm
    # this difference is intended. Factors weighted by tradable market cap.
    mask = raw['MarkettypeID'] == 'P9707'
    ff3 = raw.loc[mask, ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']]
    ff3.columns = ['t', 'rp', 'smb', 'hml']
    ff3.columns.name = 'type'
    ff3 = ff3.set_index('t')
    ff3.index = freq_end(ff3.index, 'D')
    save(ff3, 'ff3D')
    return ff3
def get_bps_wind():
    '''
    Book value per share downloaded via the Wind API, saved as 'bps_wind'.

    Generated from the Wind code generator, e.g.:
    w.wsd("000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ", "bps",
          "2017-02-04", "2018-03-05", "currencyType=;Period=Q;Fill=Previous")
    :return:
    '''
    bps = read_wind('bps', freq='M')
    bps.index.name = 't'
    bps.columns.name = 'sid'
    save(bps, 'bps_wind')
def get_pu():
    '''
    Download the China policy-uncertainty index and save it as 'pu'.

    Source: policyuncertainty.com (Baker, Bloom, and Davis).
    :return:
    '''
    url = r'http://www.policyuncertainty.com/media/China_Policy_Uncertainty_Data.xlsx'
    # fix: `skip_footer` was deprecated and later removed from
    # pandas.read_excel; the supported keyword is `skipfooter`.
    # The last spreadsheet row is a source note, hence skipfooter=1.
    pu = pd.read_excel(url, skipfooter=1)
    pu.columns = ['year', 'month', 'pu']
    pu['t'] = pu['year'].map(str) + '-' + pu['month'].map(str)
    pu['t'] = freq_end(pu['t'], 'M')
    pu = pu.set_index('t')
    pu = pu['pu']
    save(pu, 'pu')
def get_ff5M():
    """Monthly Fama-French five factors from GTA, saved as 'ff5M'."""
    raw = read_gta('STK_MKT_FivefacMonth')
    # trick: P9709 = whole A-share market incl. ChiNext;
    # tradable-market-cap weighted; Portfolios == 1 selects the 2x3 sorts
    mask = (raw['MarkettypeID'] == 'P9709') & (raw['Portfolios'] == 1)
    keep = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
    ff5 = raw.loc[mask, keep]
    ff5.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    ff5.columns.name = 'type'
    ff5 = ff5.set_index('t')
    ff5.index = freq_end(ff5.index, 'M')
    save(ff5, 'ff5M')
def get_stockRetD():
    """Daily stock returns with cash dividends reinvested (考虑现金红利再投资).

    :return: wide DataFrame (index=t, columns=sid), also saved as 'stockRetD'.
    """
    returns = read_df_from_gta('TRD_Dalyr', 'Dretwd', 'Trddt', 'Stkcd')
    returns.index = pd.to_datetime(returns.index)  # TODO: dayend?
    returns.index.name = 't'
    returns.columns = returns.columns.astype(str)
    returns.columns.name = 'sid'
    save(returns, 'stockRetD')
    return returns
def get_ff6():
    """Six-factor model: FF5 plus a momentum factor, saved as 'ff6M'."""
    # double sort on size (2 groups) and 12-month momentum (30/40/30 breakpoints)
    smb, mom = two_sorting_factor('size__size', 'momentum__r12',
                                  2, [0, 0.3, 0.7, 1.0],
                                  sample_control=False, independent=True)
    mom.index.name = 't'
    mom.name = 'mom'
    ff5 = read_unfiltered('ff5M')
    ff6 = pd.concat([ff5, mom], axis=1).dropna()
    ff6.columns.name = 'type'
    save(ff6, 'ff6M')
def get_mktRetM():
    """Monthly market return from GTA, saved as 'mktRetM'.

    Uses Cmretwdos: the comprehensive monthly market return with cash
    dividends reinvested, weighted by tradable market capitalization.
    """
    raw = read_gta('TRD_Cnmont')
    raw = raw[raw['Markettype'] == 21]  # 21 = A-shares and ChiNext combined
    raw = raw.set_index('Trdmnt')
    raw.index = freq_end(raw.index, 'M')
    raw.index.name = 't'
    ret = raw['Cmretwdos']
    ret.name = 'mktRetM'
    save(ret, 'mktRetM')
def cal_beta():
    """Rolling-window market betas from daily and monthly return data.

    Saved as 'beta', shaped (t, sid) x type.
    """
    # window length -> minimum number of observations
    # TODO: why are so many months lost? refer to betaD.csv
    dictD = OrderedDict({'1M': 15, '3M': 50, '6M': 100,
                         '12M': 200, '24M': 450})
    dictM = OrderedDict({'12M': 10, '24M': 20, '36M': 24, '60M': 24})
    combD, combM = _get_comb()
    betaD = groupby_rolling(combD, 'D', dictD, _beta).stack().unstack(level=0)
    betaM = groupby_rolling(combM, 'M', dictM, _beta).stack().unstack(level=0)
    merged = pd.concat([betaD, betaM], axis=1)
    merged.index.names = ['t', 'sid']
    merged.columns.name = 'type'
    save(merged, 'beta', sort_axis=False)
def get_hxz4():
    '''
    Calculate the HXZ q-factor model's four factors; refer to din.py for
    details about the indicators.

    References:
        Hou, K., Mo, H., Xue, C., and Zhang, L. (2018). Motivating Factors
        (Rochester, NY: Social Science Research Network).

    Returns:
    '''
    v1 = 'size__size'
    v2 = 'inv__inv'  # I/A
    v3 = 'roe__roe'  # ROE
    comb = combine_with_datalagged([v1, v2, v3], sample_control=True)
    comb = comb.dropna()
    # 2 size groups; within each size group, 3 inv and 3 roe groups
    # at the 30/40/30 breakpoints
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v1], 2, range(1, 3)))
    comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v2], [0, 0.3, 0.7, 1.0], range(1, 4)))
    comb['g3'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v3], [0, 0.3, 0.7, 1.0], range(1, 4)))
    # weighted-average excess return of each (size, inv, roe) portfolio
    assets = comb.groupby(
        ['t', 'g1', 'g2', 'g3']).apply(
        lambda df: my_average(df, 'stockEretM', wname='weight'))
    df1 = assets.groupby(['t', 'g1']).mean().unstack(level='g1')
    smb = df1[1] - df1[2]  # small minus big
    df2 = assets.groupby(['t', 'g2']).mean().unstack(level='g2')
    ria = df2[3] - df2[1]  # investment-factor spread
    df3 = assets.groupby(['t', 'g3']).mean().unstack(level='g3')
    # fix: the original computed df3[3] - df2[1], subtracting an
    # investment-sorted portfolio from a ROE-sorted one; the ROE factor
    # must be the spread within the ROE sort.
    roe = df3[3] - df3[1]
    rp = load_data('rpM')
    hxz4 = pd.concat([rp, smb, ria, roe], axis=1,
                     keys=['rp', 'smb', 'ria', 'roe'])
    hxz4.columns.name = 'type'
    hxz4 = hxz4.dropna()
    save(hxz4, 'hxz4M')
def get_capM():
    '''
    Monthly circulation (tradable) market capitalization, saved as 'capM'.

    Msmvosd: 月个股流通市值, unit: thousand yuan.
    :return:
    '''
    # TODO: convert the unit to millions as in Cakici, Chan, and Topyan,
    # "Cross-Sectional Stock Return Predictability in China."
    cap = read_df_from_gta('TRD_Mnth', 'Msmvosd', 'Trdmnt', 'Stkcd')
    cap.index = freq_end(cap.index, 'M')
    cap.index.name = 't'
    cap.columns = cap.columns.astype(str)
    cap.columns.name = 'sid'
    save(cap, 'capM')
def get_mktRetD():
    """Daily market return from GTA, saved as 'mktRetD'.

    Uses Cdretwdos: the comprehensive daily market return with cash
    dividends reinvested, weighted by tradable market capitalization.
    """
    raw = read_gta('TRD_Cndalym')
    raw = raw[raw['Markettype'] == 21]  # trick: 21 = A-shares and ChiNext combined
    raw = raw.set_index('Trddt')
    raw.index = pd.to_datetime(raw.index)
    raw.index.name = 't'
    ret = raw['Cdretwdos']
    # TODO: put this line into a check function or unify function?
    ret.name = 'mktRetD'
    save(ret, 'mktRetD')
def get_stockCloseD():
    """Daily adjusted close prices (dividends taken into consideration).

    :return: wide DataFrame (index=t, columns=sid), also saved as 'stockCloseD'.
    """
    prices = read_df_from_gta('TRD_Dalyr', 'Adjprcwd', 'Trddt', 'Stkcd')
    prices.index = pd.to_datetime(prices.index)
    prices.index.name = 't'
    prices.columns = prices.columns.astype(str)
    prices.columns.name = 'sid'
    prices = prices.sort_index(axis=0).sort_index(axis=1)
    save(prices, 'stockCloseD')
    return prices
def get_beta_sw_dm(): ''' refer to page 5 of cakici for details about this beta. Returns: ''' #beta rf=read_filtered('rfD') rm=read_filtered('mktRetD') ri=read_filtered('stockRetD') df=ri.stack().to_frame() df.columns=['ri'] df=df.join(pd.concat([rf,rm],axis=1)) df.columns=['ri','rf','rm'] df.index.names=['t','sid'] df['y']=df['ri']-df['rf'] df['x2']=df['rm']-df['rf'] df['x1']=df.groupby('sid')['x2'].shift(1) def _cal_beta(x): result=sm.ols('y ~ x1 + x2',data=x).fit().params[['x1','x2']] return result.sum() def _for_one_sid(x): # x is multiIndex Dataframe nx=x.reset_index('sid') sid=nx['sid'][0] print(sid) _get_monthend=lambda dt:dt+MonthEnd(0) #filter out those months with observations less than MIN_SAMPLES nx=nx.groupby(_get_monthend).filter(lambda a: a.dropna().shape[0] >= MIN_SAMPLES) if nx.shape[0]>0: result=nx.groupby(_get_monthend).apply(_cal_beta) return result beta=df.groupby('sid').apply(_for_one_sid) beta.index.names=['sid','t'] beta=beta.reorder_levels(['t','sid']).sort_index(level='t') beta.name='beta' save(beta,'beta_sw_dm')
def get_ff3M_resset():
    '''
    Monthly Fama-French three factors from the RESSET database,
    saved as 'ff3M_resset'.
    :return:
    '''
    raw = read_resset('THRFACDAT_MONTHLY')
    # Exchflg == 0: all exchanges; Mktflg == 'A': A-share market only
    raw = raw[(raw['Exchflg'] == 0) & (raw['Mktflg'] == 'A')]
    raw = raw.set_index('Date')
    raw.index = freq_end(raw.index, 'M')
    raw.index.name = 't'
    # *_tmv columns: factors weighted by tradable market capitalization
    ff3 = raw[['Rmrf_tmv', 'Smb_tmv', 'Hml_tmv']]
    ff3.columns = ['rp', 'smb', 'hml']
    ff3.columns.name = 'type'
    save(ff3, 'ff3M_resset')