def __compute_pnl(self, mode_pnl=None, mode_tcost=None, l_inst=None, verbose=True):
    """
    Compute daily pnl, turnover and, when enabled, transaction-cost pnl tiers.

    mode_pnl: 0: close pnl. 1: MOOMOC pnl.
    mode_tcost: 0: pure signal calculation only, 1: full tcost tiers calculation in pnl.
        NOTE(review): the tcost switch actually consulted below is
        self.d_configs["mode_tcost"], not this argument.
    l_inst, verbose: forwarded unchanged to self._set_alpha_stats.

    Note mat_alpha is delayed twice for d1, once for d0

    Sets self.tvr, self.v_pnl, self.v_cum_pnl and self.mat_pnl. With
    d_configs["mode_tcost"] on and a non-negative delay it also sets
    mat_pnl100 and the v_pnl25/50/100 and v_cum_pnl25/50/100 series.
    Copies of mat_raw, mat_alpha and mat_pnl are stored on the `test` module.

    An unsupported mode_pnl is logged and the method returns self without
    computing anything; a negative delay is logged and only the tcost tiers
    are skipped.

    Returns self.
    """
    mode_pnl = solbasic.sync_mod_attr(mode_pnl, cfg, "mode_pnl")
    # Kept for parity with the original call sequence (sync_mod_attr may have
    # side effects not visible here); the value itself is not used below.
    mode_tcost = solbasic.sync_mod_attr(mode_tcost, cfg, "mode_tcost")

    book_size = self.d_configs["book_size"]
    delay = self.d_configs["delay"]

    mat_alpha = self.mat.copy().shift(1).fillna(0)
    if book_size > 1:
        # Snap positions to integer multiples of the close lagged two rows.
        close_lag2 = base.close.shift(2)
        mat_alpha = np.round(mat_alpha * book_size / close_lag2) * close_lag2
    mat_alpha.dropna(axis=1, how='all', inplace=True)

    mat_tvr = np.abs(mat_alpha - mat_alpha.shift(1))
    cum_tvr = mat_tvr.sum().sum()

    if mode_pnl == 0:
        mat_ret = pnl.ret
    elif mode_pnl == 1:
        mat_ret = pnl.ret_open_entry
    else:
        # The original logged here and then failed on the unassigned mat_pnl.
        solbasic.logger.warn("Mode not supported.")
        return self
    # NOTE(review): .ix no longer exists in pandas >= 1.0; kept because the
    # right replacement depends on the index types used by the project.
    mat_pnl = (
        mat_alpha.shift(delay)
        * mat_ret.ix[mat_alpha.index, mat_alpha.columns]
        * cfg.leverage
    )

    # Debug snapshots on the `test` module.
    test.mat_raw = self.mat_raw.copy()
    test.mat_alpha = mat_alpha.copy()
    test.mat_pnl = mat_pnl.copy()

    self.tvr = (1 + cum_tvr / book_size) / mat_alpha.shape[0] * 100
    mat_pnl = mat_pnl.replace([np.inf, -np.inf], np.nan)
    self.v_pnl = mat_pnl.sum(axis=1)
    self.v_cum_pnl = self.v_pnl.cumsum()
    self.mat_pnl = mat_pnl

    if self.d_configs["mode_tcost"]:
        if delay >= 0:
            # NOTE(review): 0.00002 is applied to half the turnover; its
            # meaning is not documented in this block.
            if book_size != 1:
                mat_shares_traded = np.round(mat_tvr / base.close)
                mat_pnl100 = (
                    mat_pnl
                    - mat_tvr * pnl.tcost50
                    - mat_shares_traded * cfg.commission
                    - mat_tvr / 2 * 0.00002
                )
            else:
                mat_pnl100 = mat_pnl - mat_tvr * pnl.tcost50 - mat_tvr / 2 * 0.00002

            mat_pnl100 = mat_pnl100.replace([np.inf, -np.inf], np.nan)
            mat_pnl100 = DataFrame(mat_pnl100, index=mat_pnl.index, columns=mat_pnl.columns)
            self.v_pnl100 = mat_pnl100.sum(axis=1)
            self.v_cum_pnl100 = self.v_pnl100.cumsum()
            self.mat_pnl100 = mat_pnl100

            # The 25% / 50% tiers charge that fraction of the full cost.
            v_tcost = self.v_pnl - self.v_pnl100
            self.v_pnl25 = self.v_pnl - v_tcost * 0.25
            self.v_pnl50 = self.v_pnl - v_tcost * 0.50
            self.v_cum_pnl25 = self.v_pnl25.cumsum()
            self.v_cum_pnl50 = self.v_pnl50.cumsum()
        else:
            # The original logged here and then failed on the unassigned
            # mat_pnl100; now the tiers are simply not produced.
            solbasic.logger.warn("Delay not supported. Use 0 or positive integer.")

    self._set_alpha_stats(
        mode_tcost=self.d_configs["mode_tcost"], l_inst=l_inst, verbose=verbose
    )
    return self
def truncate(mat, frac=None):
    """
    Limit the largest absolute position in each row to about frac of the
    row's total absolute position.

    mat: position matrix; a copy is modified and returned, mat is not
        written to directly.
    frac: target fraction, resolved through solbasic.sync_mod_attr with the
        cfg attribute "f_trun_frac"; a negative value is replaced by
        cfg.f_trun_frac.

    Rows whose maximum is already within frac of the total, or whose maximum
    is NaN, are left alone. Other rows are shrunk one entry at a time until
    the maximum is within 1.1 * frac of the total, for at most as many
    passes as the row has non-NaN entries.
    """
    frac = solbasic.sync_mod_attr(frac, cfg, "f_trun_frac")
    mat2 = mat.copy()
    if frac < 0:
        frac = cfg.f_trun_frac

    abs_mat = np.abs(mat2)
    sum_abs_row = abs_mat.sum(axis=1)
    max_abs_row = abs_mat.max(axis=1)

    for label in max_abs_row.index:
        within_limit = max_abs_row[label] <= frac * sum_abs_row[label]
        if within_limit or np.isnan(max_abs_row[label]):
            continue

        row = mat2.ix[label]
        n_row_valid = len(row.dropna())
        for _ in range(n_row_valid):
            if not max_abs_row[label] > frac * 1.1 * sum_abs_row[label]:
                break
            # First entry that attains the current maximum magnitude.
            top = row[np.abs(row) == max_abs_row[label]].index[0]
            rest_abs = sum_abs_row[label] - np.abs(row[top])
            # Resize so the entry equals frac of the new row total.
            row[top] = rest_abs * frac / (1 - frac) * np.sign(row[top])
            sum_abs_row[label] = np.abs(row).sum()
            max_abs_row[label] = np.abs(row).max()
        mat2.ix[label] = row

    return mat2
def demean(group, b_devol=None):
    """
    Subtract the mean of group from group, optionally dividing by its std.

    group: object supporting .mean(), .std() and arithmetic.
    b_devol: when truthy the centred values are divided by group.std();
        when None the flag is resolved through solbasic.sync_mod_attr with
        the cfg attribute "b_devol".
    """
    if b_devol is None:
        b_devol = solbasic.sync_mod_attr(b_devol, cfg, "b_devol")
    centred = group - group.mean()
    return centred / group.std() if b_devol else centred
def __init__(self, iden=None):
    """
    Set up the identifier, the data manager and, for SQL-backed databases,
    a connection, then select the run mode.

    iden: identifier, resolved through solbasic.sync_mod_attr with the cfg
        attribute "iden_stock".
    """
    self.iden = solbasic.sync_mod_attr(iden, cfg, "iden_stock")
    self.data_mgr = dmgr.dmgr()
    # self.conn only exists for these database codes.
    uses_sql_link = cfg.db in ("xf", "qa", "fs")
    if uses_sql_link:
        self.conn = sql.sqllink(cfg.db, "sqlalchemy")
    self.select_run_mode()
def _set_alpha_stats(self, mode_tcost=None, l_inst=None, verbose=True):
    """
    truncate and calculate relevant alpha statistics, given v_pnl, mat_pnl in self

    mode_tcost: when truthy (after solbasic.sync_mod_attr with the cfg
        attribute "mode_tcost") the 25/50/100 cost-tier statistics are
        computed as well; this needs v_pnl25 etc. to exist on self.
    l_inst: optional list of instruments. When given, self.v_pnl and
        self.v_cum_pnl are overwritten with the pnl of that subset before
        statistics are computed. Returns early, with no statistics set, if
        the subset has no positions or no non-zero pnl.
    verbose: not used in this method.

    Sets ir / ret / volatility / mdd* / avg_pos_l / avg_pos_s / summary on
    self, plus the *25, *50 and *100 variants when tiers are requested.
    """
    mode_tcost = solbasic.sync_mod_attr(mode_tcost, cfg, "mode_tcost")
    summary_fmt = "{:.2f} {:.2f} {:.2f} {:.2f}% {:.2f}% {:.2f}% {:.2f}% {} {}"

    def _pnl_stats(v_pnl, v_cum_pnl):
        # ir, return %, volatility % and max-drawdown details for one pnl
        # series, scaled by book size and cfg.n_bdays.
        book_size = self.d_configs["book_size"]
        mean_pnl = v_pnl.mean() / book_size * cfg.n_bdays
        volatility = v_pnl.std() / book_size * np.sqrt(cfg.n_bdays)
        mdd, mdd_length, mdd_dates, mdd_sdate, mdd_edate = autils.max_dd(v_cum_pnl)
        return (
            mean_pnl / volatility,
            mean_pnl * 100,
            volatility * 100,
            mdd / (book_size / 100.),
            mdd_length,
            mdd_dates,
            mdd_sdate,
            mdd_edate,
        )

    def _summary(ir, ret, volatility, mdd, mdd_length, mdd_dates):
        # One-line text summary; positions and turnover are shared by all tiers.
        return summary_fmt.format(
            self.avg_pos_l, self.avg_pos_s, ir, ret, volatility,
            self.tvr, mdd, mdd_length, mdd_dates
        )

    if l_inst and isinstance(l_inst, list):
        l_inst = self.mat_raw.columns.intersection(l_inst)
        mat_alpha = self.mat.copy().shift(1)[l_inst].fillna(0)
        if mat_alpha.dropna(how='all').dropna(how='all', axis=1).empty:
            return
        self.v_pnl = self.mat_pnl.ix[self.v_pnl.index, l_inst].sum(axis=1)
        if self.v_pnl[self.v_pnl != 0].dropna().empty:
            return
        self.v_cum_pnl = self.v_pnl.cumsum()
        solbasic.logger.debug("Computing stats on {} ...".format(l_inst))
    else:
        mat_alpha = self.mat.copy().shift(1).fillna(0)

    (self.ir, self.ret, self.volatility, self.mdd, self.mdd_length,
     self.mdd_dates, self.mdd_sdate, self.mdd_edate) = _pnl_stats(self.v_pnl, self.v_cum_pnl)

    # NOTE(review): .ix[1:] is positional or label-based depending on the
    # index type, so it is kept as written.
    self.avg_pos_l = mat_alpha[mat_alpha > 0].ix[1:].sum(axis=1).mean()
    self.avg_pos_s = mat_alpha[mat_alpha < 0].ix[1:].sum(axis=1).mean()
    self.summary = _summary(
        self.ir, self.ret, self.volatility, self.mdd, self.mdd_length, self.mdd_dates
    )

    if not mode_tcost:
        return
    if not hasattr(self, "v_pnl25"):
        solbasic.logger.warn(
            "Requested to calculate alpha stats tiers, but v_pnl25 absent in alpha. Skip."
        )
        return

    (self.ir25, self.ret25, self.volatility25, self.mdd25, self.mdd25_length,
     self.mdd25_dates, self.mdd25_sdate, self.mdd25_edate) = _pnl_stats(self.v_pnl25, self.v_cum_pnl25)
    (self.ir50, self.ret50, self.volatility50, self.mdd50, self.mdd50_length,
     self.mdd50_dates, self.mdd50_sdate, self.mdd50_edate) = _pnl_stats(self.v_pnl50, self.v_cum_pnl50)
    (self.ir100, self.ret100, self.volatility100, self.mdd100, self.mdd100_length,
     self.mdd100_dates, self.mdd100_sdate, self.mdd100_edate) = _pnl_stats(self.v_pnl100, self.v_cum_pnl100)

    self.summary25 = _summary(
        self.ir25, self.ret25, self.volatility25, self.mdd25, self.mdd25_length, self.mdd25_dates
    )
    self.summary50 = _summary(
        self.ir50, self.ret50, self.volatility50, self.mdd50, self.mdd50_length, self.mdd50_dates
    )
    self.summary100 = _summary(
        self.ir100, self.ret100, self.volatility100, self.mdd100, self.mdd100_length, self.mdd100_dates
    )
def truncate_tvr(mat, frac=None):
    """
    Cap each row-over-row position change at +/- frac.

    mat: position matrix.
    frac: maximum absolute change per entry, resolved through
        solbasic.sync_mod_attr with the cfg attribute "f_trun_frac_tvr".

    Returns the previous row plus the capped change. The first row has no
    predecessor, so it comes out as NaN.
    """
    frac = solbasic.sync_mod_attr(frac, cfg, "f_trun_frac_tvr")
    if frac < 0:
        # NOTE(review): the fallback reads cfg.f_trun_frac rather than
        # cfg.f_trun_frac_tvr; confirm this is intended.
        frac = cfg.f_trun_frac

    previous = mat.shift(1)
    change = mat - previous
    # mask() leaves NaN changes untouched, like the boolean assignment did.
    change = change.mask(change > frac, frac)
    change = change.mask(change < -frac, -frac)
    return previous + change
def winsorize_tvr(mat, q=None):
    """
    Clamp each row-over-row position change to that row's q and 1-q
    quantiles of changes.

    mat: position matrix.
    q: lower quantile, resolved through solbasic.sync_mod_attr with the cfg
        attribute "f_wins_pctl_tvr".

    Returns the previous row plus the clamped change. The first row has no
    predecessor, so it comes out as NaN.
    """
    q = solbasic.sync_mod_attr(q, cfg, "f_wins_pctl_tvr")

    previous = mat.copy().shift(1)
    change = mat - previous

    # Broadcast the per-row quantiles to full-size bound matrices.
    zeros = DataFrame(0, index=change.index, columns=change.columns)
    floor = zeros.add(change.quantile(q, axis=1), axis=0)
    ceiling = zeros.add(change.quantile(1 - q, axis=1), axis=0)

    change = change.mask(change < floor, floor)
    change = change.mask(change > ceiling, ceiling)
    return previous + change
def decay(mat, n=None, mode="exponential"):
    """
    decay alpha positions by n days

    mat: position matrix.
    n: window (rolling modes) or span (exponential mode), resolved through
        solbasic.sync_mod_attr with the cfg attribute "n_days_hold".
    mode: "unweighted" (rolling mean), "exponential" (ewm mean) or "linear"
        (rolling apply of _decay_linear).

    Returns the decayed matrix. For an unsupported mode a warning is logged
    and an undecayed copy of mat is returned; the original logged and then
    failed on an unassigned local.
    """
    n = solbasic.sync_mod_attr(n, cfg, "n_days_hold")
    if mode == "unweighted":
        return mat.rolling(window=n, min_periods=0).mean()
    if mode == "exponential":
        return mat.ewm(span=n, min_periods=0).mean()
    if mode == "linear":
        return mat.rolling(window=n, min_periods=0).apply(_decay_linear, args=(n, ))

    solbasic.logger.warn(
        "mode={} unsupported, try unweighted, exponential, linear.".format(mode))
    return mat.copy()
def winsorize(mat, q=None):
    """
    set extreme data points at two ends to q and 1-q percentile

    mat: data matrix; quantiles are taken per row (axis=1).
    q: lower quantile, resolved through solbasic.sync_mod_attr with the cfg
        attribute "f_wins_pctl".

    Returns a copy of mat with entries below the row's q quantile raised to
    it and entries above the row's 1-q quantile lowered to it. NaN entries
    are left as they are. If the comparison raises TypeError a warning is
    logged and the unclamped copy is returned.
    """
    q = solbasic.sync_mod_attr(q, cfg, "f_wins_pctl")
    mat2 = mat.copy()
    # Broadcast the per-row quantiles to full-size bound matrices.
    mat_0 = DataFrame(0, index=mat.index, columns=mat.columns)
    mat_lower = mat_0.add(mat.quantile(q, axis=1), axis=0)
    mat_upper = mat_0.add(mat.quantile(1 - q, axis=1), axis=0)
    try:
        below = mat < mat_lower
        above = mat > mat_upper
        mat2[below] = mat_lower[below]
        # The original indexed mat_upper with `mat < mat_upper`, which put
        # NaN (not the bound) into every entry above the upper quantile.
        mat2[above] = mat_upper[above]
    except TypeError:
        glb.logger.warn("Unable to winsorize. Type of mat: {}.".format(
            type(mat2)))
    return mat2
def create_cache(module, n_days_refresh=None):
    """
    create all matrices in module to data cache

    module: module whose matrices are stored; the last component of its
        dotted name is used in the store file name.
    n_days_refresh: only used when cfg.b_refresh_cache is set; resolved
        through solbasic.sync_mod_attr with the cfg attribute
        "n_days_refresh" and passed to append_store_mod.

    Does nothing except log when cfg.b_dryrun is set. Creates
    cfg.path_cache if it does not exist.
    """
    if cfg.b_dryrun:
        solbasic.logger.info("Dry run. No create_cache write.")
        return

    if not os.path.exists(cfg.path_cache):
        os.makedirs(cfg.path_cache)

    mod_name = module.__name__.split('.')[-1]
    if cfg.sim_period in ("forward", "live"):
        path_store = "{}{}_{}_forward.h5".format(cfg.path_cache, mod_name, cfg.start_date)
    else:
        path_store = "{}{}_{}_{}.h5".format(
            cfg.path_cache, mod_name, cfg.start_date, cfg.end_date
        )

    if not cfg.b_refresh_cache:
        # Full rebuild: drop the old cache, then write a fresh store.
        solbasic.logger.info("Creating {} ...".format(path_store))
        remove_cache(module, verbose=False)
        save_store_mod(module, path_store)
        return

    n_days_refresh = solbasic.sync_mod_attr(n_days_refresh, cfg, "n_days_refresh")
    solbasic.logger.info("Refreshing {} ...".format(path_store))
    append_store_mod(module, path_store, n_days_refresh=n_days_refresh)
def dump_stats(alph_full, tree=None, b_dump_corr=False, b_dump_pos_only=None, b_recreate_alpha=None, book_size=2e7, b_update_db=False):
    """
    dump alpha statistics to alph.path. Must stay in lock while writing

    alph_full: alpha object; must have a `path` attribute, otherwise a
        warning is logged and nothing is written. A deep copy, simplified
        with mode="shallow", is what gets dumped.
    tree: when truthy it is pickled alongside the alpha under "pop_init".
    b_dump_pos_only: when truthy only the daily position files are written.
    b_recreate_alpha: when truthy existing daily position files are rewritten.
    book_size: scale used for the share-count position files.
    b_dump_corr, b_update_db: not used in this function.

    Nothing is written when cfg.b_dryrun is set.

    Side effect: base.close.index is converted with pd.to_datetime.

    NOTE(review): the source this was restored from had lost its indentation.
    The pickle dump is placed inside the `not b_dump_pos_only` branch here;
    confirm against the original layout.
    """
    b_dump_pos_only = solbasic.sync_mod_attr(b_dump_pos_only, cfg, "b_dump_pos_only")
    b_recreate_alpha = solbasic.sync_mod_attr(b_recreate_alpha, cfg, "b_recreate_alpha")

    if cfg.b_dryrun:
        solbasic.logger.info("Dry run. No dump_stats write.")
        return
    if not hasattr(alph_full, "path"):
        solbasic.logger.warn("Alpha path does not exist, unable to dump stats.")
        return
    if not os.path.exists(alph_full.path):
        os.makedirs(alph_full.path)

    alph = copy.deepcopy(alph_full)
    alph = alph.simplify(mode="shallow")

    # dump daily position
    alph.mat_raw.index = pd.to_datetime(alph.mat_raw.index)
    alph.mat.index = pd.to_datetime(alph.mat.index)
    base.close.index = pd.to_datetime(base.close.index)
    if not cfg.b_skip_daily_pos:
        path_alpha = "{}alpha/".format(alph.path)
        path_alpha_10by10 = "{}alpha_10by10/".format(alph.path)
        if not os.path.exists(path_alpha):
            os.makedirs(path_alpha)
        if not os.path.exists(path_alpha_10by10):
            os.makedirs(path_alpha_10by10)

        start = alph.d_configs["start_date_alpha"]
        end = alph.d_configs["end_date_alpha"]
        for stamp in alph.mat_raw.ix[start:end].index:
            # Date part only, as YYYYMMDD; used as file name and row key.
            index = str(stamp).split()[0].replace('-', '')
            path_alpha_i = "{}{}".format(path_alpha, index)
            path_alpha_i_10by10 = "{}{}".format(path_alpha_10by10, index)
            both_exist = os.path.exists(path_alpha_i) and os.path.exists(path_alpha_i_10by10)
            if both_exist and not b_recreate_alpha:
                continue
            if not os.path.exists(path_alpha_i) and cfg.b_refresh_cache:
                solbasic.logger.debug("Refreshing {} ...".format(index))
            alph.mat.ix[index].fillna(0).to_csv(path_alpha_i, sep=' ')
            if index in base.close.index:
                # Positions scaled by book_size and divided by the close,
                # rounded to integers.
                np.round(
                    alph.mat.ix[index] * book_size / base.close.ix[index]
                ).fillna(0).astype(int).to_csv(path_alpha_i_10by10, sep=' ')

    # dump full alpha stats
    if not b_dump_pos_only:
        path_pnl = "{}pnl.csv".format(alph.path)
        path_tvr = "{}tvr.csv".format(alph.path)
        path_plot = "{}pnl.png".format(alph.path)
        path_config = "{}config.yml".format(alph.path)

        with open(path_config, 'w') as outfile:
            outfile.write(yaml.dump(alph.d_configs, default_flow_style=True))

        alph.v_pnl.to_csv(path_pnl)
        if hasattr(alph, "v_pnl25"):
            alph.v_pnl25.to_csv(path_pnl.replace("pnl.csv", "pnl25.csv"))
            alph.v_pnl50.to_csv(path_pnl.replace("pnl.csv", "pnl50.csv"))
            alph.v_pnl100.to_csv(path_pnl.replace("pnl.csv", "pnl100.csv"))
        (np.abs(alph.mat - alph.mat.shift(1))).sum(axis=1).to_csv(path_tvr)
        plot_pnl([alph], fig_name=path_plot, verbose=False)

        path_alph = "{}alp".format(alph.path)
        if tree:
            # alpha cache
            payload = dict(pop_init=[tree], l_alph=[alph])
        else:
            # combo cache
            payload = dict(l_alph=[alph])
        cp_alph = copy.deepcopy(payload)
        # Close the handle deterministically; the original passed a bare
        # open() to pickle.dump and never closed it.
        with open(path_alph, 'wb') as f_alph:
            pickle.dump(cp_alph, f_alph, -1)
def __init__(self, expr='', name='', d_configs=None, b_ops=True, b_stat=True, mat_raw=DataFrame()):
    """
    Build an alpha from an expression or from a ready-made matrix.

    expr: alpha expression
    name: alpha name
    d_configs: preset alpha configs
    b_ops: run a series of operations on alphas
    b_stat: calculate performance metrics
    mat_raw: define alpha with matrix instead of expr.

    Each entry of self.d_configs is taken from d_configs when present there
    (via solbasic.get_dict), otherwise from the cfg attribute of the same name.
    """
    d_configs = solbasic.sync_mod_attr(d_configs, cfg, "d_configs")
    # NOTE(review): eval(self.expr) below runs with this function's locals
    # visible, so the original local names (including the otherwise unused
    # `mat`) are kept and helper names are underscore-prefixed.
    mat = DataFrame()
    self.expr = expr
    self.name = name
    self.expr_raw = expr
    self.d_configs = {}

    # simulation properties, mutable
    _sim_keys = (
        "start_date", "end_date", "sim_days_start", "sim_days_is_vs",
        "sim_days_end", "load_backdays", "mode_pnl", "mode_tcost", "book_size",
    )
    # innate properties, immutable. Affects performance
    _innate_keys = (
        "dataset", "mode_neutral", "delay", "n_in_play", "f_wins_pctl",
        "f_trun_frac", "n_days_decay", "n_days_corr_pt", "n_days_corr_pnl",
    )
    self.d_configs.update(
        (_k, solbasic.get_dict(d_configs, _k, getattr(cfg, _k)))
        for _k in _sim_keys + _innate_keys
    )

    self.b_ops = b_ops
    self.b_stat = b_stat

    if self.expr:
        self.d_configs["d_fields_used"] = autils.get_fields_used(self.expr)
        # NOTE(review): eval on the expression string; expr must come from a
        # trusted source.
        try:
            self.mat_raw = eval(self.expr)
        except AttributeError:
            solbasic.logger.warn("Unable to evaluate {}, set mat_raw=1.".format(self.expr))
            self.mat_raw = 1
    elif not mat_raw.empty:
        self.mat_raw = mat_raw

    if hasattr(self, "mat_raw"):
        self.init_mat_params()
        # set valid inst again to make sure. Not enough to truncate in dmgr.
        # NOTE(review): when mat_raw was passed in, this in-place multiply
        # may also modify the caller's matrix; confirm.
        self.mat_raw *= univ.mat_in_play

    # apply ops and get stats
    if hasattr(self, "mat_raw") and not self.mat_raw.empty:
        if b_ops:
            self.ops()
        if b_stat:
            self.stats()

    # corner checks
    _start_days = self.d_configs["sim_days_start"]
    if _start_days != 0 and _start_days < self.d_configs["sim_days_end"]:
        if not hasattr(glb, "b_warn_end_prior_start"):
            # NOTE(review): the message labels look swapped relative to the
            # values passed (start first, end second); left as written.
            solbasic.logger.warn(
                "pid: {}, alpha end_date {} prior to start_date {}. Force end at data end.".format(
                    os.getpid(), self.d_configs["sim_days_start"], self.d_configs["sim_days_end"]
                )
            )
            # The glb flag suppresses repeats only when alphagen is on.
            if cfg.b_alphagen_on:
                glb.b_warn_end_prior_start = True
        self.d_configs["sim_days_end"] = 1
def normalize(mat_alpha, book_size=None):
    """
    normalize to book size

    mat_alpha: position matrix.
    book_size: target for each row's sum of absolute values, resolved
        through solbasic.sync_mod_attr with the cfg attribute "book_size".

    Each row is divided by its sum of absolute values and multiplied by
    book_size.
    """
    book_size = solbasic.sync_mod_attr(book_size, cfg, "book_size")
    gross_per_row = np.abs(mat_alpha).sum(axis=1)
    unit_gross = mat_alpha.div(gross_per_row, axis=0)
    return unit_gross * book_size
def gics_groups(self, mode_neutral=None, n_days_refresh=None):
    """
    prepare gics group classification

    mode_neutral: resolved through solbasic.sync_mod_attr with the cfg
        attribute "mode_neutral". Outside cache mode, 1 / 2 / 3 load only
        gsector / gind / gsubind; any other value loads all three.
    n_days_refresh: not used in this method.

    Fills attributes of the `gicsg` module, either from freshly loaded data
    (forward- then back-filled, then cached) or from the existing cache.
    Returns early with a warning if the loaded data is empty.
    """
    mode_neutral = solbasic.sync_mod_attr(mode_neutral, cfg, "mode_neutral")
    all_fields = (
        "gsector", "gind", "gsubind", "naicsh", "sich", "spcindcd", "spcseccd",
    )

    if cfg.b_recreate_cache or not cutils.exists_cache(gicsg):
        solbasic.logger.info("Building gics group classification...")
        df_gicsg = self.exec_top(self.data_mgr.load_gics_groups, b_union=True)
        test.df_gicsg = df_gicsg.copy()
        if df_gicsg.empty:
            solbasic.logger.warn("Gics groups skipped.")
            return
        solbasic.logger.debug(
            "Gics groups original dimensions: {} days, {} instruments.".format(
                df_gicsg.gind.shape[0], df_gicsg.gind.shape[1]))

        if cfg.sim_mode == "cache":
            fields = all_fields
        elif mode_neutral == 1:
            fields = ("gsector", )
        elif mode_neutral == 2:
            fields = ("gind", )
        elif mode_neutral == 3:
            fields = ("gsubind", )
        else:
            fields = ("gsector", "gind", "gsubind")

        for field in fields:
            filled = getattr(df_gicsg, field).fillna(method='ffill').fillna(method='bfill')
            setattr(gicsg, field, filled)
        cutils.create_cache(gicsg)
    else:
        solbasic.logger.info(
            "Loading gics group classification from cache...")
        for field in all_fields:
            setattr(gicsg, field, cutils.getattr_cache(gicsg, field))

    # NOTE(review): reads gicsg.gsubind even on paths above that did not set
    # it (mode_neutral 1 or 2 outside cache mode).
    solbasic.logger.info(
        "Gics groups loaded: {} days, {} instruments.".format(
            gicsg.gsubind.shape[0], gicsg.gsubind.shape[1]))