def before_trading(self, event):
    """
    By default, each flow is invoked explicitly here.
    """
    # Trading day: every flow takes the trading day as its input date.
    trade_date = event.trading_dt.strftime("%Y%m%d")
    user_log.info(f"{trade_date}")
    # If the store count is None, the store needs preheating: fill
    # store_flow over the whole rolling window (with silent=True).
    store_num = self.user_context.DM.get_tensor(
        self.data_source.trading_dates.get_previous_trading_date(trade_date),
        "store_flow.store.factorReturn").data
    if store_num is None:
        for _date in self.data_source.trading_dates.get_previous_trading_date(
                trade_date, self.user_context.rolling_window -
                np.arange(self.user_context.rolling_window)):
            self.user_context.DM.load_tensor(_date)
            self._store_flow.run(_date)
    # At prediction time, assume predictions are made before the open each day.
    self.user_context.DM.load_tensor(trade_date)
    self._store_flow.run(date=trade_date)
    self._prediction_stock.run(date=trade_date)
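# A quick sketch of the preheat date arithmetic above, assuming only numpy:
# `rolling_window - np.arange(rolling_window)` yields the offsets
# [window, window-1, ..., 1], i.e. the previous `window` trading days in
# chronological order, which get_previous_trading_date then maps to dates.
import numpy as np

rolling_window = 5  # hypothetical window size, for illustration only
offsets = rolling_window - np.arange(rolling_window)
print(offsets)  # -> [5 4 3 2 1]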
def before_trading(self, event):
    """
    By default, each flow is invoked explicitly here.
    """
    # Trading day: every flow takes the trading day as its input date.
    trade_date = event.trading_dt.strftime("%Y%m%d")
    user_log.info(f"{trade_date}")
    # fit
    self._optim_flow.run(trade_date)
def before_trading(self, event):
    """
    By default, each flow is invoked explicitly here.
    """
    # Trading day: every flow takes the trading day as its input date.
    trade_date = event.trading_dt.strftime("%Y%m%d")
    self.user_context.DM.load_tensor(trade_date)
    user_log.info(f"{trade_date}")
    # fit
    self._estimation_flow.run(trade_date)
def query(self, tablename, factors=None, features=None, **kwargs):
    """
    Query rows matching the given conditions.

    Returns a dataframe with columns:
    [factor, trade_date, feature1, feature2, feature3, ...]

    :param tablename:
    :param factors:
    :param features:
    :return:
    """
    factors = factors or []
    features = features or []
    startdate = kwargs.get("startdate", "20100101")
    enddate = kwargs.get("enddate", "22222222")
    sql = ("select trade_date,factor,item,value from " + tablename +
           " where trade_date >= %s and trade_date <= %s ")
    data = []
    for feature in features:
        sql1 = sql + " and item = %s"
        self._re_conn()
        try:
            user_log.info(sql1)
            cur = self.conn.cursor()
            cur.execute(sql1, (startdate, enddate, feature))
            results = cur.fetchall()
            self.conn.commit()
            results = pd.DataFrame(
                list(results),
                columns=['trade_date', 'factor', 'item', 'value'])
            results.index = [results["factor"], results["trade_date"]]
            if len(factors) > 0:
                results = results.loc[factors]
            results[feature] = results["value"]
            data.append(results[feature])
        except Exception as e:
            user_log.warning(e)
            return False
        finally:
            self.close()
    user_log.info("query done, begin merge!")
    df = pd.concat(data, axis=1)
    df = df.reset_index(level=["factor", "trade_date"])
    return df
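# Hedged usage sketch. `FeatureDB` and the factor names are hypothetical;
# "factor_main" is the table hardcoded in query_groupby below. Fetch the
# daily IC and IR series of two factors as a wide dataframe:
db = FeatureDB()  # hypothetical constructor for this DAO class
df = db.query("factor_main",
              factors=["mom20", "ep_ttm"],  # illustrative factor names
              features=["ic", "ir"],
              startdate="20180101", enddate="20181231")
# df columns: factor, trade_date, ic, ir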
def insert_feature(self, data, tablename, **kwargs):
    """
    data: a dataframe; only the four columns
    ["trade_date", "factor", "item", "value"] are used.

    :param data:
    :param tablename:
    :param kwargs:
    :return:
    """
    # Check the data format.
    try:
        check_data = data[["trade_date", "factor", "item", "value"]]
    except Exception:
        user_log.error("data columns incorrect, stop inserting into database")
        return False
    # Replace inf and nan with None so they become SQL NULL.
    check_data = check_data.replace([np.inf, -np.inf], np.nan)
    check_data = check_data.where(check_data.notnull(), None)
    force = kwargs.get("force", False)
    user_log.info("insert into table {}, data records {}".format(
        tablename, len(check_data)))
    sql = ("insert into " + tablename +
           "(trade_date,factor,item,value) values(%s,%s,%s,%s)")
    if force:
        # "replace into" overwrites rows with the same primary key.
        sql = sql.replace("insert", "replace")
    fill_values = check_data.values.tolist()
    self._re_conn()
    try:
        cur = self.conn.cursor()
        cur.executemany(sql, fill_values)
        self.conn.commit()
    except Exception as e:
        user_log.warning(e)
        return False
    finally:
        self.close()
    return True
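# Hedged sketch of preparing and inserting a feature frame (all names are
# illustrative; the long format matches the four columns checked above):
import pandas as pd

records = pd.DataFrame({
    "trade_date": ["20180102", "20180102"],
    "factor": ["mom20", "ep_ttm"],
    "item": ["ic", "ic"],
    "value": [0.031, -0.012],
})
ok = db.insert_feature(records, "factor_main", force=True)  # replace duplicates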
def delete_feature(self, tablename, factors=None, features=None, **kwargs):
    """
    Delete rows.

    If factors == [], records for all factors are deleted.
    If features == [], records for all features are deleted.
    If both are empty, nothing is deleted.

    :param tablename:
    :param factors:
    :param features:
    :param kwargs:
    :return:
    """
    factors = factors or []
    features = features or []
    if len(factors) == 0 and len(features) == 0:
        user_log.info("missing parameters")
        return False
    sql = "delete from " + tablename + " where "
    params = []
    if len(factors) > 0:
        sql += " factor in %s and"
        params.append(factors)
    if len(features) > 0:
        sql += " item in %s and"
        params.append(features)
    sql = sql[:-4]  # strip the trailing " and"
    self._re_conn()
    try:
        user_log.info("delete rows.....")
        cur = self.conn.cursor()
        cur.execute(sql, params)
        self.conn.commit()
        user_log.info("delete done")
    except Exception as e:
        user_log.warning(e)
        return False
    finally:
        self.close()
    return True
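# Hedged note on the "in %s" placeholders above: they depend on the DB
# driver expanding a Python sequence into a parenthesized value list.
# PyMySQL does this; the actual driver behind _re_conn is not shown here,
# so this is an assumption. Illustration (connection params are fake):
import pymysql

conn = pymysql.connect(host="localhost", user="root", db="features")
with conn.cursor() as cur:
    # mogrify renders the final statement without executing it
    print(cur.mogrify("delete from factor_main where factor in %s and item in %s",
                      (["mom20", "ep_ttm"], ["ic"])))
    # -> delete from factor_main where factor in ('mom20','ep_ttm') and item in ('ic')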
def _init_optim(self):
    prediction_flow = self.user_context.flow_config.get(
        "pred_flow_name", "prediction_stock")
    optim_flow = self.user_context.flow_config.get("optim_flow_name",
                                                   "optim_flow")
    forward_return_flow = self.user_context.flow_config.get(
        "forward_return_flow", "flow_forward_return")
    self._optim_flow = SensorFlow(name=optim_flow,
                                  data_manager=self.user_context.DM)
    # module 19. yesterday's holdings
    self._optim_flow.add_next_step(sensor=GetHolding,
                                   args=["holding", [], {}],
                                   kwds={"account": self.account})
    self._optim_flow.add_next_step(sensor=GetDate,
                                   args=["factor_as_of_date", [], {}],
                                   kwds={'offset': 1})
    # module 11. determine the universe on which Alpha/Risk data is cleaned
    self._optim_flow.add_next_step(
        sensor=GetFundamentalPool,
        args=[
            "stockCandidate",
            [
                f"{optim_flow}.holding.weight",
                f"{optim_flow}.factor_as_of_date.date"
            ], {}
        ],
        kwds={
            "pool_name": self.user_context.ff_name,
            "threshold": 0.3,
            # "benchmark_weight": "weight_index_500"
        },
        silent=False)
    factorList = {}
    for k in self.user_context.alphaFactorDataFrame.factor_dataFrame.factor:
        factorList[k + "_f1"] = FACTOR_STYLE.ALPHA
    # module 7. fetch alpha data
    self._optim_flow.add_next_step2(
        name="alphaPredData",
        sensor=GetFactorData,
        call=None,
        input_var=[f"{optim_flow}.factor_as_of_date.date"],
        kwds={"factorList": factorList})
    # module 8. fetch fitted_forward_return (this also uses future data)
    self._optim_flow.add_next_step2(
        name="fittedForwardReturnData",
        sensor=GetFactorData,
        call=None,
        input_var=[f"{optim_flow}.factor_as_of_date.date"],
        # kwds={"factorList": {'flow_estimation_fitted_f1': FACTOR_STYLE.ALPHA}}
        kwds={
            "factorList": {
                'fake_forward_return_f1': FACTOR_STYLE.ALPHA
            }
        })
    # module 20. stock weight optimization
    kwds = {}
    kwds.update(self.optim_options)
    user_log.info("check constraint:{}".format(kwds))
    self._optim_flow.add_next_step(
        sensor=OptimizationStockWeight,
        args=[
            "optimizationStockWeight",
            [
                "%s.fittedForwardReturnData.exposure" % optim_flow,
                "%s.predictionFactorCovariance.factorCovariance" % prediction_flow,
                "%s.alphaPredData.exposure" % optim_flow,
                "%s.alphaPredData.factorName" % optim_flow,
                "%s.riskFactorData.exposure" % forward_return_flow,
                "%s.riskFactorData.factorName" % forward_return_flow,
                "%s.stockCandidate.pool" % optim_flow,
                "%s.holding.weight" % optim_flow,
                "%s.factor_as_of_date.date" % optim_flow
            ],
            {
                "%s.alphaPredData.exposure" % optim_flow: "alphaExposure",
                "%s.alphaPredData.factorName" % optim_flow: "alphaName",
                "%s.riskFactorData.exposure" % forward_return_flow: "riskExposure",
                "%s.riskFactorData.factorName" % forward_return_flow: "riskName",
                "%s.fittedForwardReturnData.exposure" % optim_flow: "stockReturn"
            }
        ],
        kwds=kwds,
        silent=True)
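# The steps above address each other's outputs as "{flow}.{step}.{output}";
# the alias dict in the final step maps those addresses onto the keyword
# names OptimizationStockWeight expects (alphaExposure, stockReturn, ...).
# A minimal sketch of that resolution convention, with a plain dict standing
# in for the real SensorFlow/data manager (an assumption, not the actual API):
store = {
    "optim_flow.alphaPredData.exposure": [[0.1, 0.2]],
    "optim_flow.holding.weight": [0.0, 1.0],
}
alias = {"optim_flow.alphaPredData.exposure": "alphaExposure"}

def resolve(input_var, alias, store):
    # Unaliased names fall back to their last path component, e.g. "weight".
    return {alias.get(key, key.split(".")[-1]): store[key] for key in input_var}

print(resolve(["optim_flow.alphaPredData.exposure",
               "optim_flow.holding.weight"], alias, store))
# -> {'alphaExposure': [[0.1, 0.2]], 'weight': [0.0, 1.0]}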
def _init_estimation_flow(self):
    flow_name = self.user_context.flow_config.get("est_flow_name", "est_flow")
    self._estimation_flow = SensorFlow(name=flow_name,
                                       data_manager=self.user_context.DM)
    # factor date
    self._estimation_flow.add_next_step2(
        name="factor_as_of_date",
        sensor=GetDate,
        kwds={'offset': self.user_context.forward_period + 2})
    # module 4. determine the universe on which Risk data is cleaned
    self._estimation_flow.add_next_step2(
        name="riskPool",
        sensor=GetPool,
        call=None,
        # use factor_as_of_date here
        input_var=[f"{flow_name}.factor_as_of_date.date"],
        kwds={"pool_name": self.user_context.pool_name})
    factorList = {}
    for k in self.user_context.riskFactorDataFrame.factor_dataFrame.factor:
        factorList[k] = (FACTOR_STYLE.SECTOR
                         if k.startswith("industry") else FACTOR_STYLE.RISK)
    # module 6. fetch risk data
    self._estimation_flow.add_next_step2(
        name="riskFactorData",
        sensor=GetFactorData,
        call=None,
        input_var=[
            f"{flow_name}.riskPool.pool",
            f"{flow_name}.factor_as_of_date.date"
        ],
        kwds={
            "factorList": factorList,
            "data_process_methods": {
                FACTOR_STYLE.SECTOR: [],
                FACTOR_STYLE.RISK: [
                    DataProcessing.do_process_extremum_winsorize,
                    DataProcessing.do_z_score_processing
                ]
            }
        },
        silent=False)
    try:
        open_price_type = self.user_context.est_open_price
    except Exception:
        user_log.warning("no est_open_price in config file")
        open_price_type = "open_aft"
    try:
        close_price_type = self.user_context.est_close_price
    except Exception:
        user_log.warning("no est_close_price in config file")
        close_price_type = "open_aft"
    user_log.info("est open_price_type is: " + open_price_type)
    user_log.info("est close_price_type is: " + close_price_type)
    # module 8. fetch return data
    self._estimation_flow.add_next_step2(
        name="returnData",
        sensor=GetReturnData,
        call=None,
        input_var=[
            f"{flow_name}.riskFactorData.exposure",
            f"{flow_name}.riskPool.pool",
            f"{flow_name}.factor_as_of_date.date"
        ],
        alias={
            f"{flow_name}.riskFactorData.exposure": "neutralize_matrix",
        },
        kwds={
            "data_process_methods": [
                DataProcessing.do_process_extremum_winsorize,
                DataProcessing.neutrialize
            ],
            "n": self.user_context.forward_period,
            "open_price_type": open_price_type,
            "close_price_type": close_price_type
        },
        silent=True)
    self._estimation_flow.add_next_step2(
        name="saveToNpy_return",
        sensor=SaveToBundleSensor,
        call=None,
        input_var=[
            f"{flow_name}.factor_as_of_date.date",
            f"{flow_name}.returnData.stockReturn"
        ],
        kwds={
            'bundle': self.user_context.config.base.data_bundle_path,
            'suffix': 'f1',
            'type': "return",
            'name': "forward_return_%d" % self.user_context.forward_period
        })
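# The returnData step winsorizes forward returns and then neutralizes them
# against the risk exposures passed in as neutralize_matrix. A minimal sketch
# of that neutralization under the usual definition (OLS residuals of returns
# on the exposure matrix); the real DataProcessing.neutrialize may differ:
import numpy as np

def neutralize(returns, exposure):
    """Residualize returns against risk-factor exposures via least squares.

    returns:  (n_stocks,) forward returns, may contain nan
    exposure: (n_stocks, n_factors) risk exposure matrix
    """
    ok = ~np.isnan(returns) & ~np.isnan(exposure).any(axis=1)
    beta, *_ = np.linalg.lstsq(exposure[ok], returns[ok], rcond=None)
    out = np.full_like(returns, np.nan)
    out[ok] = returns[ok] - exposure[ok] @ beta
    return out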
def query_groupby(self, tablename, features=None, **kwargs):
    """
    Query with a group-by applied to the features, e.g. to get the
    average IC pass features={"ic": [SqlUtils.AVG]}.

    :param tablename:
    :param features:
    :param kwargs:
    :return:
    """
    features = features or {}
    startdate = kwargs.get("startdate", "20120101")
    enddate = kwargs.get("enddate", "20222222")
    user_log.info("query start - end : {} - {} ".format(startdate, enddate))
    if len(features) == 0:
        user_log.warning("no features in parameter")
        return None
    result = []
    for feature in features.keys():
        for func in features[feature]:
            if type(func) == str:
                # Nested aggregates such as "avg(abs" need one closing
                # parenthesis per "(" plus one more for "(value".
                count_close_parentheses = func.count("(") + 1
                temp = [')'] * count_close_parentheses
                sql = ("select factor," + func + "(value" + "".join(temp) +
                       " as " + feature + "_" + func.replace("(", "_") +
                       " from " + tablename)
                sql += " where trade_date >= %s and trade_date <= %s"
                sql += " and item = %s"
                sql += " group by factor"
                user_log.info(sql)
                self._re_conn()
                columns = ["factor", feature + "_" + func.replace("(", "_")]
                try:
                    cur = self.conn.cursor()
                    cur.execute(sql, (startdate, enddate, feature))
                    data = cur.fetchall()
                    self.conn.commit()
                    df = pd.DataFrame(list(data), columns=columns)
                    df.index = df["factor"]
                    result.append(df[columns[-1]])
                except Exception as e:
                    user_log.warning(e)
                    user_log.warning("error in query_groupby {}".format(feature))
                finally:
                    self.close()
            else:
                columns = ["factor", feature + "_abs_gt_" + str(SqlUtils.GT_ABS_2)]
                sql = "select factor, sum(if(abs(value) > %s,1,0))/count(value) "
                sql += " as " + columns[-1]
                sql += " from factor_main where "
                sql += " trade_date >= %s and trade_date <= %s and item = %s group by factor"
                user_log.info(sql)
                self._re_conn()
                try:
                    cur = self.conn.cursor()
                    cur.execute(sql, (SqlUtils.GT_ABS_2, startdate, enddate, feature))
                    data = cur.fetchall()
                    self.conn.commit()
                    df = pd.DataFrame(list(data), columns=columns)
                    df.index = df["factor"]
                    result.append(df[columns[-1]])
                except Exception as e:
                    user_log.warning(e)
                    user_log.warning("error in query_groupby {}".format(feature))
                finally:
                    self.close()
    df = pd.concat(result, axis=1)
    df = df.reset_index()
    return df
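# Hedged usage sketch (db is the hypothetical DAO handle from the query
# sketch above). Assumes SqlUtils.AVG == "avg" per the docstring, and that
# nested aggregates are passed as prefix strings like "avg(abs", which the
# parenthesis counting above completes to avg(abs(value)):
stats = db.query_groupby("factor_main",
                         features={"ic": ["avg", "avg(abs"]},
                         startdate="20180101", enddate="20181231")
# Generated SQL for "avg(abs":
#   select factor,avg(abs(value)) as ic_avg_abs from factor_main
#   where trade_date >= ... and trade_date <= ... and item = 'ic'
#   group by factor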
def do(self, date, mp, **kwargs):
    # region read in parameters corresponding to the inputs
    # optimizer parameters
    lambdax = kwargs.get("lambdax", 1)  # lambda = 0.5?
    tc_a = kwargs.get("tc_a", 0.5)  # transaction-cost penalty parameter
    tc_b = kwargs.get("tc_b", 1)  # transaction-cost penalty parameter
    tc_power = kwargs.get("tc_power", 1.5)  # transaction-cost penalty parameter
    tc_c = kwargs.get("tc_c", 0)  # transaction-cost penalty parameter
    n = kwargs.get("top", 200)  # top-n stocks enter the optimizer
    single_max = kwargs.get("single_max", 0.02)  # maximum weight per stock
    total_value = kwargs.get("total_value", 1000000)
    # benchmark weight
    weight_index = kwargs.get("benchmark_weight", "weight_index_500")
    # factor matrix
    column = mp.alphaName
    exog = mp.alphaExposure
    # industry/style matrix
    risk_column = mp.riskName
    risk_factor = mp.riskExposure
    # covariance matrix
    cov = mp.factorCovariance
    # specific (idiosyncratic) risk
    if hasattr(mp, "sp_risk"):
        sp = mp.sp_risk
    else:
        sp = np.zeros_like(mp.stockReturn)
    # suspended stocks; non_suspend is pure True/False, no nan
    is_suspend = kwargs.get("is_susp", np.full(mp.stockReturn.size, 0))
    non_suspend = is_suspend == 0
    # compute the benchmark's factor exposure
    benchmark_exposure = mp.data_manager.get_bar(
        date=mp.date, columns=[weight_index])[weight_index]
    benchmark_exposure = np.nan_to_num(benchmark_exposure) / np.nansum(
        benchmark_exposure)
    benchmark_expo = np.dot(benchmark_exposure, np.nan_to_num(risk_factor))
    # endregion

    # success is never set to True; the loop exits via return or break.
    success = False
    while (not success) and n < 1500:
        stock_return = mp.stockReturn.copy()
        stock_return[np.any(np.isnan(exog), axis=1)] = np.nan
        # region determine the stock set entering the optimizer:
        # 1. compute top_flag within mp.pool
        # 2. holding | top_flag
        # 3. factors not missing
        # step 1. compute top_flag within mp.pool
        # Recomputed on each while-loop iteration; redundant, but kept
        # inside the loop for readability.
        stock_return[~mp.pool] = np.nan
        non_nan_cnt = np.sum(~np.isnan(stock_return))
        if non_nan_cnt < n:
            self.logger.warning("non_nan_cnt(%s) < n(%s)" % (non_nan_cnt, n))
            n = non_nan_cnt
        return_ordered_index = np.argsort(-stock_return)[:non_nan_cnt]
        top_flag = np.full(stock_return.size, False, dtype=bool)
        top_flag[return_ordered_index[:n]] = True
        # Anything in top_flag has a predicted stock return, so its data
        # is guaranteed complete.
        candidates = top_flag.copy()
        # Drop the remaining nan cases from candidates:
        # case 1. special_risk is nan
        # candidates &= ~np.isnan(sp)
        # And add back the following case:
        # case 1. held positions that have a stock_return (i.e. complete data)
        candidates |= (mp.weight > 0) & (~np.isnan(stock_return))
        # to solve: the decision variable w
        w = cp.Variable(np.sum(candidates))
        # held, suspended, and with missing data
        holding_suspend = (mp.weight > 0) & (is_suspend == 1) & (
            np.isnan(stock_return))
        holding_suspend_sum = np.sum(mp.weight[holding_suspend])
        candidates_cnt = np.nansum(candidates)
        # Everything below operates on vectors of length candidates_cnt.
        # risk_matrix = risk_factor[candidates]
        x = exog[candidates]
        w0 = mp.weight[candidates]
        if any(holding_suspend):
            for ix, _ in enumerate(holding_suspend):
                if _:
                    if any(np.isnan(exog[ix])):
                        self.logger.warning(
                            "Holding %s have nan factors %s" %
                            (mp.data_manager.codes[ix],
                             column[np.isnan(exog[ix]).ravel()]))
                    if any(np.isnan(risk_factor[ix])):
                        self.logger.warning(
                            "Holding %s have nan factors %s" %
                            (mp.data_manager.codes[ix],
                             risk_column[np.isnan(risk_factor[ix]).ravel()]))
        # constraint: weights sum to 1 - holding_suspend_sum
        constraints = [cp.sum(w) == 1 - holding_suspend_sum]
        # constraint: suspended stocks are locked at their current weight
        weight_locked = (candidates & ~non_suspend)[candidates]
        if np.sum(weight_locked) >= 1:
            constraints += [w[weight_locked] == w0[weight_locked]]
        # constraint: for the non-suspended, single_max cap
        constraints += [w[~weight_locked] <= single_max]
        # constraint: for the non-suspended, weight > 0
        constraints += [w[~weight_locked] >= 0]
        # 3. industry/style exposure constraints, bounded relative to the benchmark
        risk_condition = kwargs.get("risk_condition", {"up": {}, "down": {}})
        # constraint: risk exposure control, ceiling (expo . w <= ceil, in >= form)
        for k, v in risk_condition['up'].items():
            col_index = risk_column == k
            expo = risk_factor[candidates][:, col_index]
            ceil = benchmark_expo[col_index] + v
            constraints += [cp.sum(cp.multiply(np.ravel(expo), -w)) >= -ceil]
        # constraint: risk exposure control, floor (expo . w >= floor)
        for k, v in risk_condition['down'].items():
            col_index = risk_column == k
            expo = risk_factor[candidates][:, col_index]
            floor = benchmark_expo[col_index] - v
            constraints += [cp.sum(cp.multiply(np.ravel(expo), w)) >= floor]
        try:
            # transaction cost terms
            as_of_date = mp.date
            z = w - w0
            all_spread = mp.data_manager.get_bar(
                date=as_of_date,
                columns=["trade_spread_0935_1000"],
                codes=mp.data_manager.codes)["trade_spread_0935_1000"]
            all_trade_price = mp.data_manager.get_bar(
                date=as_of_date,
                columns=["trade_price_0935_1000_n"],
                codes=mp.data_manager.codes)["trade_price_0935_1000_n"]
            all_amount = mp.data_manager.get_bar(
                date=as_of_date, columns=["amount"],
                codes=mp.data_manager.codes)["amount"]
            all_tcost_sigma = mp.data_manager.get_bar(
                date=as_of_date, columns=["pct_std22"],
                codes=mp.data_manager.codes)["pct_std22"]
            # transaction cost: first-term coefficient
            all_a = tc_a * all_spread / all_trade_price
            a = all_a[candidates]
            tcost_sigma = all_tcost_sigma[candidates]
            # transaction cost: second-term coefficient
            c1 = tcost_sigma / np.sqrt(all_amount[candidates] / total_value)
            # missing transaction cost, use default: 0.003
            ix = np.isnan(a) | np.isnan(c1) | np.isinf(c1)
            if ix.sum() > 0:
                self.logger.info("%s missing transaction cost" % ix.sum())
                a[ix] = 0.003
                c1[ix] = 0.0
            # transaction cost: first term
            exp1 = cp.multiply(a, cp.abs(z))
            # transaction cost: second term
            power = tc_power
            exp2 = tc_b * cp.multiply(c1, cp.abs(z)**power)
            # transaction cost: third term
            exp3 = tc_c * z
            tcost_expr = exp1 + exp2 + exp3
            tcost_expr = cp.sum(tcost_expr)
            # predicted return term
            pred_returnp_expr = cp.sum(cp.multiply(stock_return[candidates], w))
            assert pred_returnp_expr.is_concave()
            # risk term (reference formulation kept from the original source):
            """
            self.expression = cvx.sum_squares(cvx.multiply(
                np.sqrt(locator(self.idiosync, t).values), wplus)) + \
                cvx.quad_form((wplus.T * locator(self.exposures, t).values.T).T,
                              locator(self.factor_Sigma, t).values)
            """
            risk_expr = 2 * lambdax * cp.sum(cp.quad_form((w.T * x).T, cov))
            assert risk_expr.is_convex()
            for el in constraints:
                assert el.is_dcp()
            prob = cp.Problem(
                cp.Maximize(pred_returnp_expr - risk_expr - tcost_expr),
                constraints)
            prob.solve(solver=cp.ECOS)
            if prob.status == "optimal" or prob.status == "optimal_inaccurate":
                user_log.info("status:{}".format(prob.status))
                # user_log.info("w : {}".format(w.value))
                user_log.info("sum(w):{}".format(np.sum(w.value)))
                target_weight = np.full(stock_return.size, 0, dtype=np.double)
                target_weight[holding_suspend] = mp.weight[holding_suspend]
                target_weight[candidates] = np.round(w.value, 6)
                # check expo
                # user_log.info("max(w):{}", np.max(w.value))
                # user_log.info("min(w):{}", np.min(w.value))
                #
                # import pandas as pd
                # expo = pd.DataFrame()
                #
                # diff = risk_factor[candidates].T.dot(w.value) - benchmark_expo
                # expo["factor"] = risk_column
                # expo["diff"] = diff
                # expo["abs"] = expo["diff"].abs()
                # expo = expo.sort_values(by="abs", ascending=False)
                # user_log.info(expo.head(50))
                return target_weight,
            else:
                user_log.info("status: {}".format(prob.status))
                user_log.warning(
                    "optim failed at top n={}, continue n+300".format(n))
                n += 300
        except Exception as e:
            import traceback
            traceback.print_exc()
            break
    target_weight = mp.weight
    return target_weight,
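# The problem above, in one line: maximize predicted return minus a quadratic
# factor-risk penalty minus transaction costs, subject to budget, bound, lock,
# and exposure constraints. A stripped-down, self-contained sketch of the same
# objective shape on random data (assumes cvxpy with the ECOS solver installed;
# the real coefficients come from the data manager as shown above):
import cvxpy as cp
import numpy as np

rng = np.random.default_rng(0)
n_stk, n_fac = 100, 5
r = rng.normal(0.001, 0.01, n_stk)      # predicted stock returns
X = rng.normal(size=(n_stk, n_fac))     # alpha-factor exposures
cov = np.eye(n_fac) * 1e-4              # factor covariance (illustrative)
w0 = np.full(n_stk, 1.0 / n_stk)        # current holdings
a = np.full(n_stk, 0.003)               # linear transaction-cost coefficient

w = cp.Variable(n_stk)
z = w - w0                                       # trade vector
ret = r @ w                                      # predicted return term
risk = 2 * 1.0 * cp.quad_form(X.T @ w, cov)      # lambdax = 1, the default above
tcost = cp.sum(cp.multiply(a, cp.abs(z)))        # first cost term only
prob = cp.Problem(cp.Maximize(ret - risk - tcost),
                  [cp.sum(w) == 1, w >= 0, w <= 0.02])
prob.solve(solver=cp.ECOS)
print(prob.status, round(float(np.sum(w.value)), 6))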