def _get_panel_stk_avg(self, comb, indicator, gcol):
    """Return equal- and value-weighted portfolio returns per (t, group).

    Parameters
    ----------
    comb : DataFrame
        Panel containing 'stockEretM', the group column ``gcol`` and the
        weight columns.
    indicator : str
        Name of the sorting variable; it doubles as the weight when the
        factor under study is size (refer to page 159).
    gcol : str
        Name of the portfolio-group column to group by.

    Returns
    -------
    tuple of Series
        ``(panel_stk_eavg, panel_stk_wavg)``, both indexed by (t, gcol).
    """
    panel_stk_eavg = comb.groupby(['t', gcol])['stockEretM'].mean()  # equal weighted
    # When the factor is size we also use the indicator (sort variable) as
    # the weight (refer to page 159).  Otherwise use 'weight': the index
    # denotes t+1 and the weight is from time t, since the weight has
    # already been shifted forward in the dataset.
    wname = indicator if self.factor == 'size' else 'weight'
    panel_stk_wavg = comb.groupby(['t', gcol]).apply(
        lambda df: my_average(df, 'stockEretM', wname=wname))
    return panel_stk_eavg, panel_stk_wavg
def construct_playingField(vars, model):
    '''
    Build value-weighted portfolio return panels for a sorting scheme.

    :param vars: list of indicator names
    :param model: belong to {'5x5','2x4x4'}
    :return: DataFrame of value-weighted portfolio returns, columns indexed
        by the portfolio groups
    '''
    if model == '5x5':
        v1, v2 = vars
        comb = data_for_bivariate(v1, v2, 5, 5, independent=True)
        assets = comb.groupby(['t', 'g1', 'g2']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
        assets = assets.unstack(level=['g1', 'g2'])
    elif model == '2x4x4':
        # v1 must belong to the size category
        v1, v2, v3 = vars
        comb = combine_with_datalagged([v1, v2, v3]).dropna()
        comb['g1'] = comb.groupby('t', group_keys=False).apply(
            lambda df: assign_port_id(df[v1], 2, range(1, 3)))
        # g2 and g3 are both conditional on the size group g1
        for gname, v in (('g2', v2), ('g3', v3)):
            comb[gname] = comb.groupby(['t', 'g1'], group_keys=False).apply(
                lambda df, _v=v: assign_port_id(df[_v], 4, range(1, 5)))
        assets = comb.groupby(['t', 'g1', 'g2', 'g3']).apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
        assets = assets.unstack(level=['g1', 'g2', 'g3'])
    else:
        raise MyError('Model "{}" is not supported currently'.format(model))
    return assets
def get_25assets(v1, v2):
    """Build a 5x5 panel of value-weighted portfolio returns sorted on v1 and v2.

    Each indicator is read from the database when available, otherwise
    loaded from a pickled DataFrame under ``dirFI``.  Indicators and
    weights are lagged one period per stock; returns are contemporaneous.

    :param v1: first sorting indicator name
    :param v2: second sorting indicator name
    :return: DataFrame indexed by t with a (g1, g2) column MultiIndex
    """
    sampleControl = False
    q = 5
    # Construct the Database handle once instead of once per lookup.
    db = Database(sample_control=sampleControl)
    ss = []
    for v in [v1, v2]:
        if v in db.all_indicators:
            s = db.by_indicators([v])
        else:
            s = pd.read_pickle(os.path.join(dirFI, v + '.pkl')).stack()
        s.name = v
        ss.append(s)
    # data lagged: shift indicators and weight forward by one period per stock
    weight = db.by_indicators(['weight'])
    datalagged = pd.concat(ss + [weight], axis=1)
    datalagged = datalagged.groupby('sid').shift(1)
    # data at time t
    datat = db.by_indicators(['stockEretM'])
    comb = pd.concat([datalagged, datat], axis=1)
    comb = comb.dropna()
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v1], q))
    comb['g2'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v2], q))
    assets = comb.groupby(['t', 'g1', 'g2']).apply(
        lambda df: my_average(df, 'stockEretM', wname='weight'))\
        .unstack(level=['g1', 'g2'])
    return assets
def indicator2factor(indicator):
    """Construct a long-short factor from one indicator and pickle it.

    Stocks are sorted into ``q`` portfolios on the lagged indicator every
    month; the factor is the value-weighted return spread between the top
    and bottom portfolios.  The resulting Series is written to
    ``factorPath`` as ``<indicator>.pkl``.

    :param indicator: name of the sorting variable
    """
    sampleControl = False
    q = 5
    # lagged data: indicator and weight are shifted forward one period
    s = _read(indicator).stack()
    s.name = indicator
    weight = Database(sample_control=sampleControl).by_indicators(['weight'])
    datalagged = pd.concat([s, weight], axis=1).groupby('sid').shift(1)
    # contemporaneous returns
    datat = Database(sample_control=sampleControl).by_indicators(
        ['stockEretM'])
    comb = pd.concat([datalagged, datat], axis=1).dropna()
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicator], q))
    panel = comb.groupby(['t', 'g']).apply(
        lambda df: my_average(df, 'stockEretM', wname='weight'))
    panel = panel.unstack(level=['g'])
    factor = panel[q] - panel[1]
    factor.name = indicator
    factor.to_pickle(os.path.join(factorPath, '{}.pkl'.format(indicator)))
def get_bivariate_panel(v1, v2='size__size'):
    """Build a 5x5 panel of value-weighted portfolio returns sorted on v1 and v2.

    Each indicator is read from the database when available, otherwise
    loaded via ``_read``.  Indicators and weights are lagged one period
    per stock; returns are contemporaneous.

    :param v1: first sorting indicator name
    :param v2: second sorting indicator name (defaults to size)
    :return: DataFrame indexed by t with a (g1, g2) column MultiIndex
    """
    sampleControl = False
    q = 5
    # Construct the Database handle once instead of once per lookup.
    db = Database(sample_control=sampleControl)
    ss = []
    for v in [v1, v2]:
        if v in db.all_indicators:
            s = db.by_indicators([v])
        else:
            s = _read(v).stack()
        s.name = v
        ss.append(s)
    # data lagged: shift indicators and weight forward by one period per stock
    weight = db.by_indicators(['weight'])
    datalagged = pd.concat(ss + [weight], axis=1)
    datalagged = datalagged.groupby('sid').shift(1)
    # data at time t
    datat = db.by_indicators(['stockEretM'])
    comb = pd.concat([datalagged, datat], axis=1)
    comb = comb.dropna()
    comb['g1'] = comb.groupby(
        't', group_keys=False).apply(lambda df: assign_port_id(df[v1], q))
    comb['g2'] = comb.groupby(
        't', group_keys=False).apply(lambda df: assign_port_id(df[v2], q))
    panel = comb.groupby(['t', 'g1', 'g2']).apply(
        lambda df: my_average(df, 'stockEretM', wname='weight'))\
        .unstack(level=['g1', 'g2'])
    # NOTE: removed leftover debug statement `print(v1)`.
    return panel
def single_sorting_factor(indicator, q, weight=False):
    # method1 independent way
    '''
    This function is used to construct a new factor by a given indicator.
    We first group stocks into "q" portfolios based on the rank of "indicator"
    every month.Then,at the next month we calculate the corresponding monthly
    value-weighted (if weight is True) portfolio return.The factor return is
    the spread between the return of the top portfolio and bottom portfolio.

    :param indicator:
    :param q:
    :param weight:
    :return:Series
    '''
    # Determine how many portfolios the breakpoints imply.
    if isinstance(q, int):
        n_groups = q
    elif isinstance(q, (list, tuple)):
        n_groups = len(q) - 1
    else:
        raise MyError('q:"{}" is wrong!'.format(repr(q)))
    labels = ['g{}'.format(i) for i in range(1, n_groups + 1)]

    comb = combine_with_datalagged([indicator])
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicator], q, labels))

    grouped = comb.groupby(['t', 'g'])
    if weight:
        panel = grouped.apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
        panel = panel.unstack(level=['g'])
    else:
        panel = grouped['stockEretM'].mean().unstack(level=['g'])
    return panel[labels[-1]] - panel[labels[0]]
def _one_indicator(self, indicator):
    # Build risk-adjusted long-short spread tables for one indicator over
    # horizons of 2..13 months ahead.  Returns a tuple
    # (eq_table, vw_table): equal-weighted and weighted spread results,
    # one column per forward-return horizon.
    ns = range(1, 13)  # forward shifts; produces eret_ahead2 .. eret_ahead13
    all_indicators = [indicator, 'weight', 'stockEretM']
    comb = DATA.by_indicators(all_indicators)
    comb = comb.dropna()
    # Assign stocks to self.q groups each month by quantiles of the indicator.
    try:
        comb['g'] = comb.groupby(
            't',
            group_keys=False).apply(lambda df: pd.qcut(
                df[indicator],
                self.q,
                labels=[indicator + str(i) for i in range(1, self.q + 1)],
                duplicates='raise'))
    except ValueError:
        # trick: qcut with non-unique bin edges — rank first to break ties.
        # https://stackoverflow.com/questions/20158597/how-to-qcut-with-non-unique-bin-edges
        comb['g'] = comb.groupby(
            't',
            group_keys=False).apply(lambda df: pd.qcut(
                df[indicator].rank(method='first'),
                self.q,
                labels=[indicator + str(i) for i in range(1, self.q + 1)]))

    def _one_indicator_one_weight_type(group_ts, indicator):
        # group_ts: Series indexed by (t, g) of portfolio returns.
        # Computes the top-minus-bottom spread each month and risk-adjusts it.
        def _big_minus_small(s, ind):
            # s holds one month's portfolio returns; spread = top group - group 1.
            time = s.index.get_level_values('t')[0]
            return s[(time, ind + str(self.q))] - s[(time, ind + '1')]

        spread_data = group_ts.groupby('t').apply(
            lambda series: _big_minus_small(series, indicator))
        s = risk_adjust(spread_data)
        return s

    # NOTE(review): 'eret' is not among all_indicators above — presumably
    # DATA.by_indicators also supplies it; verify against DATA's schema.
    eret = comb['eret'].unstack()
    s_es = []
    s_ws = []
    eret_names = []
    for n in ns:
        eret_name = 'eret_ahead%s' % (n + 1)
        # Shift returns back n months so row t holds the return n months ahead.
        comb[eret_name] = eret.shift(-n).stack()
        group_eavg_ts = comb.groupby(['t', 'g'])[eret_name].mean()
        group_wavg_ts = comb.groupby(
            ['t', 'g']).apply(lambda df: my_average(df, eret_name, 'weight'))
        # group_wavg_ts = comb.groupby(['t', 'g']).apply(
        #     lambda df: np.average(df[eret_name], weights=df['weight']))#fixme: what if there is nan values?
        #TODO: If we are analyzing size,the weights should be the indicator
        #we are analyzing,rather than weight
        s_e = _one_indicator_one_weight_type(group_eavg_ts, indicator)
        s_w = _one_indicator_one_weight_type(group_wavg_ts, indicator)
        s_es.append(s_e)
        s_ws.append(s_w)
        eret_names.append(eret_name)
    eq_table = pd.concat(s_es, axis=1, keys=eret_names)
    vw_table = pd.concat(s_ws, axis=1, keys=eret_names)
    return eq_table, vw_table
def _get_panel_stk_avg(self, comb, indicator, gcol):
    """Return equal- and cap-weighted portfolio returns per (t, group).

    Parameters
    ----------
    comb : DataFrame
        Panel containing 'eretM', the group column ``gcol`` and 'capM'.
    indicator : str
        Name of the sorting variable; it doubles as the weight when the
        factor under study is size (refer to page 159).
    gcol : str
        Name of the portfolio-group column to group by.

    Returns
    -------
    tuple of Series
        ``(panel_stk_eavg, panel_stk_wavg)``, both indexed by (t, gcol).
    """
    panel_stk_eavg = comb.groupby(['t', gcol])['eretM'].mean()
    # When the factor is size we also use the indicator (sort variable) as
    # the weight (refer to page 159).  Otherwise use 'capM': the index
    # denotes t+1 and capM is from time t, since capM has already been
    # shifted forward in the dataset.
    wname = indicator if self.factor == 'size' else 'capM'
    panel_stk_wavg = comb.groupby(['t', gcol]).apply(
        lambda df: my_average(df, 'eretM', wname=wname))
    return panel_stk_eavg, panel_stk_wavg
def get_hxz4():
    '''
    calculate hxz4 factors,refer to din.py for details about the indicators

    References:
        Hou, K., Mo, H., Xue, C., and Zhang, L. (2018). Motivating Factors
        (Rochester, NY: Social Science Research Network).

    Returns:

    '''
    v1 = 'size__size'
    v2 = 'inv__inv'  #I/A
    v3 = 'roe__roe'  # ROE
    comb = combine_with_datalagged([v1, v2, v3], sample_control=True)
    comb = comb.dropna()
    # independent 2-way sort on size ...
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[v1], 2, range(1, 3)))
    # ... and 30/40/30 sorts on I/A and ROE conditional on the size group
    comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v2], [0, 0.3, 0.7, 1.0], range(1, 4)))
    comb['g3'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[v3], [0, 0.3, 0.7, 1.0], range(1, 4)))
    assets = comb.groupby(
        ['t', 'g1', 'g2',
         'g3']).apply(lambda df: my_average(df, 'stockEretM', wname='weight'))
    # each factor is the spread across its own sorting dimension
    df1 = assets.groupby(['t', 'g1']).mean().unstack(level='g1')
    smb = df1[1] - df1[2]
    df2 = assets.groupby(['t', 'g2']).mean().unstack(level='g2')
    ria = df2[3] - df2[1]
    df3 = assets.groupby(['t', 'g3']).mean().unstack(level='g3')
    # bug fix: was `df3[3] - df2[1]`, mixing the ROE panel with the I/A panel
    roe = df3[3] - df3[1]
    rp = load_data('rpM')
    hxz4 = pd.concat([rp, smb, ria, roe],
                     axis=1,
                     keys=['rp', 'smb', 'ria', 'roe'])
    hxz4.columns.name = 'type'
    hxz4 = hxz4.dropna()
    save(hxz4, 'hxz4M')
def get_single_sorting_assets(indicator, q, weight=True):
    """Return the monthly return panel of single-sorted portfolios.

    Stocks are grouped into portfolios each month by the rank of the
    lagged ``indicator``; the next month's portfolio return is computed,
    value-weighted when ``weight`` is True.

    :param indicator: name of the sorting variable
    :param q: int number of portfolios, or a list/tuple of quantile edges
    :param weight: use value weighting when True
    :return: DataFrame indexed by t with one column per portfolio label
    """
    # Determine how many portfolios the breakpoints imply.
    if isinstance(q, int):
        n_groups = q
    elif isinstance(q, (list, tuple)):
        n_groups = len(q) - 1
    else:
        raise MyError('q:"{}" is wrong!'.format(repr(q)))
    labels = ['g{}'.format(i) for i in range(1, n_groups + 1)]

    comb = combine_with_datalagged([indicator])
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicator], q, labels))

    grouped = comb.groupby(['t', 'g'])
    if weight:
        assets = grouped.apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
        assets = assets.unstack(level=['g'])
    else:
        assets = grouped['stockEretM'].mean().unstack(level=['g'])
    return assets
def three_sorting_factor(v1, v2, v3, q1, q2, q3, weight=True):
    '''
    v1 and v2 are independent,v3 is conditional on v1 and v2

    reference:
        page 18 of Pan, L., Tang, Y., and Xu, J. (2016). Speculative Trading
        and Stock Returns. Review of Finance 20, 1835–1865.
    '''
    comb = combine_with_datalagged([v1, v2, v3]).dropna()
    # independent sorts on v1 and v2
    for gname, v, qn in (('g1', v1, q1), ('g2', v2, q2)):
        comb[gname] = comb.groupby('t', group_keys=False).apply(
            lambda df, _v=v, _q=qn: assign_port_id(df[_v], _q, range(1, _q + 1)))
    # g3 is conditional on g1 and g2
    comb['g3'] = comb.groupby(['t', 'g1', 'g2'], group_keys=False).apply(
        lambda df: assign_port_id(df[v3], q3, range(1, q3 + 1)))

    grouped = comb.groupby(['t', 'g1', 'g2', 'g3'])
    if weight:
        s = grouped.apply(lambda df: my_average(df, 'stockEretM', wname='weight'))
    else:
        s = grouped['stockEretM'].mean()

    # each factor is the top-minus-bottom spread along its own dimension
    factors = []
    for gname, qn in (('g1', q1), ('g2', q2), ('g3', q3)):
        panel = s.groupby(['t', gname]).mean().unstack(level=gname)
        factors.append(panel[qn] - panel[1])
    factor1, factor2, factor3 = factors
    return factor1, factor2, factor3
def two_sorting_factor(v1, v2, q1, q2, independent=True, weight=True,
                       **kwargs):
    '''
    just like the way we construct SMB and HML

    :param v1:
    :param v2:
    :param q1:
    :param q2:
    :param independent: sort independently or not
    :param weight:
    :return: a tuple of two Series
    '''
    comb = data_for_bivariate(v1, v2, q1, q2, independent=independent,
                              **kwargs)
    grouped = comb.groupby(['t', 'g1', 'g2'])
    if weight:
        s = grouped.apply(
            lambda df: my_average(df, 'stockEretM', wname='weight'))
    else:
        s = grouped['stockEretM'].mean()

    # top-minus-bottom spread along each sorting dimension
    panel1 = s.groupby(['t', 'g1']).mean().unstack(level='g1')
    panel2 = s.groupby(['t', 'g2']).mean().unstack(level='g2')
    factor1 = panel1[panel1.columns.max()] - panel1[1]
    factor2 = panel2[panel2.columns.max()] - panel2[1]
    return factor1, factor2
def indicatorDf_to_10_assets(indicatorDf, indicatorName):
    """Turn a wide indicator DataFrame into 10 value-weighted decile portfolios.

    The indicator and weight are lagged one period per stock, then stocks
    are sorted into deciles each month and the value-weighted portfolio
    return is computed.

    :param indicatorDf: DataFrame with t in the index and sid in the columns
        (it is stacked into a (t, sid) Series below)
    :param indicatorName: name to give the stacked indicator Series
    :return: DataFrame indexed by t with one column per decile
    """
    sampleControl = False
    q = 10
    # Construct the Database handle once instead of once per lookup.
    db = Database(sample_control=sampleControl)
    # data lagged: shift indicator and weight forward by one period per stock
    s = indicatorDf.stack()
    s.name = indicatorName
    weight = db.by_indicators(['weight'])
    datalagged = pd.concat([s, weight], axis=1)
    datalagged = datalagged.groupby('sid').shift(1)
    # data at time t
    datat = db.by_indicators(['stockEretM'])
    comb = pd.concat([datalagged, datat], axis=1)
    comb = comb.dropna()
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicatorName], q))
    assets = comb.groupby(['t', 'g']).apply(
        lambda df: my_average(df, 'stockEretM', wname='weight')) \
        .unstack(level=['g'])
    return assets
def _get_eret(self, comb):
    """Return (equal-weighted, value-weighted) return series per (g1, g2, t).

    ``comb`` must contain 'stockEretM', 'weight' and the group columns.
    """
    keys = ['g1', 'g2', 't']
    group_eavg_ts = comb.groupby(keys)['stockEretM'].mean()
    group_wavg_ts = comb.groupby(keys).apply(
        lambda df: my_average(df, 'stockEretM', 'weight'))
    return group_eavg_ts, group_wavg_ts