def sync_sw_industry_index(start_t: date, end_t: date, force_reinit: bool = False):
    """Compute and store derived index data for every SW industry.

    For each row returned by ``index_classify(src="SW")`` the industry data is
    calculated with ``_calc_sw_industry_data`` and, when a result is returned,
    upserted into the ``DERIVED_TS_INDUSTRY_INDEX`` storage under the
    industry's index code.

    Args:
        start_t: first date of the period passed to the calculation.
        end_t: last date of the period passed to the calculation.
        force_reinit: forwarded to ``ts_upsert_arctic_storage``.
    """
    tushare = TuShareProData(use_l3_cache=False)
    req_freq_controller = TushareReqSleepController(tushare)

    # Every industry is calculated with the same (start, end) arguments, so the
    # two cross-section queries are wrapped in a small LRU cache and shared
    # across loop iterations.
    f_equity_quotation_daily = functools.lru_cache(maxsize=2)(tushare.cs_equity_quotation_daily)
    f_equity_basic_daily = functools.lru_cache(maxsize=2)(
        functools.partial(
            tushare.cs_equity_basic_daily,
            cols=["turnover_rate", "turnover_rate_f", "volume_ratio", "pe", "pe_ttm", "pb", "ps", "ps_ttm",
                  "dv_ratio", "dv_ttm", "total_share", "float_share", "free_share", "total_mv", "circ_mv"]))

    df_sw_index = tushare.index_classify(src="SW")
    df_sw_index = df_sw_index.reset_index(drop=True)

    for idx_num, row in df_sw_index.iterrows():
        lv = row["level"]
        ind_name = row["industry_name"]
        ind_code = row["index_code"]
        print(
            f"\r[sync_sw_industry_index {idx_num}] Calc {lv} industry '{ind_code}'({ind_name}) index data, reqs {tushare.query_orig_source_count}",
            end="")

        # The calculation is bracketed by the request-frequency controller.
        req_freq_controller.begin_internal_check()
        df_ind_index = _calc_sw_industry_data(tushare, ind_code, start_t, end_t,
                                              f_equity_quotation_daily, f_equity_basic_daily)
        req_freq_controller.end_internal_check()

        # Nothing is stored when the calculation returns None.
        if df_ind_index is not None:
            tushare.ts_upsert_arctic_storage(TuShareProData.DERIVED_TS_INDUSTRY_INDEX, ind_code, df_ind_index,
                                             force_reinit=force_reinit)
def __init__(self, tushare_pro: TuShareProData):
    """Build the symbol lookup table from the SSE and SZSE stock lists.

    Args:
        tushare_pro: data accessor whose ``stock_basic`` is queried once per
            exchange.
    """
    symbol_details = {}
    for exchange in ("SSE", "SZSE"):
        df_exchange = tushare_pro.stock_basic(exchange=exchange)
        # One entry per ts_code; the value is the remaining columns of the row.
        symbol_details.update(df_exchange.set_index("ts_code").to_dict("index"))
    # Detailed information of every symbol, keyed by ts_code.
    self._dict_all_symbols: Dict[str, Dict] = symbol_details
class SampleTushareDataGenerator:
    """Sample generator producing (x, y) windows from TuShare daily quotations.

    On construction the daily quotations of ``x_symbols`` and of ``y_symbol``
    are loaded and joined into one frame whose first column is the target.
    Calling the instance yields one ``(x_window, y_values)`` pair per position.
    """

    # The attributes below are hyper-parameters and may take part in tuning.
    lookback = 120
    predict_count = 5  # number of periods to predict
    feature = ["close", "change", "vol", "amount"]
    target_column = "close"

    # The attributes below normally do not take part in tuning.
    # Illustrative only; usually the stocks of one sector. The order of these
    # symbols is tightly coupled to the model.
    x_symbols = ["600000.SH", "600028.SH", "600050.SH"]
    # Original note: for now assume the predicted instrument is always a stock.
    # The data is nevertheless loaded through index_quotation_daily.
    y_symbol = "000001.SH"

    def __init__(self, start_t: date, end_t: date):
        """Load and join the x / y quotation data for ``[start_t, end_t]``.

        Args:
            start_t: start date passed to the quotation queries.
            end_t: end date passed to the quotation queries.
        """
        self.start_t = start_t
        # Reading from a pickle is handled by a different class.
        # The cache path is meant to be a hash of start_t, end_t and the
        # features; for now it depends on start / end only.
        self.tushare = TuShareProData()
        df_all_x = [
            self.tushare.equity_quotation_daily(x, start=start_t, end=end_t, cols=self.feature)
            for x in self.x_symbols
        ]
        df_y = self.tushare.index_quotation_daily(self.y_symbol, start=start_t, end=end_t,
                                                  cols=[self.target_column])
        # The target stays in column 0; each symbol's features are joined on
        # the index, with a per-symbol suffix applied to clashing column names.
        self.df_all_data = df_y
        for x_symbol, x_df in zip(self.x_symbols, df_all_x):
            self.df_all_data = self.df_all_data.join(x_df, rsuffix=f"_{x_symbol}")
        # ffill() replaces fillna(method="ffill"), deprecated in pandas 2.x.
        self.df_all_data.ffill(inplace=True)

    def x_shape(self) -> list:
        """Return ``[lookback, len(x_symbols) * len(feature)]`` as a plain list."""
        return tf.TensorShape(
            [self.lookback, len(self.x_symbols) * len(self.feature)]).as_list()

    def __call__(self):
        """Yield ``(x, y)`` numpy pairs for every window start position.

        ``x`` is ``lookback`` rows of all columns except the first; ``y`` is
        ``predict_count`` values of the first column.
        """
        # NOTE(review): the target slice starts at i, the same row as the input
        # window, not at i + lookback. The loop bound suggests the target was
        # meant to follow the window - confirm before relying on it.
        window_count = len(self.df_all_data) - self.lookback - self.predict_count - 1
        for i in range(window_count):
            yield (
                self.df_all_data.iloc[i:i + self.lookback, 1:].to_numpy(),
                self.df_all_data.iloc[i:i + self.predict_count, 0].to_numpy())
def upsert_chn_stock_info_in_yahoo():
    """Upsert Yahoo instrument info for every SSE and SZSE stock.

    The HTTP proxy is configured first. SSE symbols are processed before SZSE
    symbols; each symbol is logged and then upserted.
    """
    set_http_proxy()
    tushare = TuShareProData()

    def _upsert_exchange(exchange, to_yahoo_symbol):
        # Upsert every listed stock of one exchange under its Yahoo symbol.
        df_listing = tushare.stock_basic(exchange=exchange, cols=["ts_code", "name"])
        for idx, row in df_listing.iterrows():
            symbol = to_yahoo_symbol(row["ts_code"])
            logger.info(f"[{idx}] {symbol}({row['name']}) symbol data in yahoo")
            upsert_yahoo_financial_instrument_info(symbol, True)

    # Yahoo uses the suffix "SS" for the Shanghai market.
    _upsert_exchange("SSE", lambda ts_code: ts_code.replace("SH", "SS"))
    # The Shenzhen suffix is the same on both sides.
    _upsert_exchange("SZSE", lambda ts_code: ts_code)
def get_sdk(cls, wrapper_def: TSSDKWrapper, use_l3_cache: bool) -> ArcticAndLocalCacheBySymbol:
    """Return the SDK object for ``wrapper_def``, creating it on first use.

    Instances are kept in ``cls._all_sdk_wrapper`` under a key built from the
    data source, its auth value and ``use_l3_cache``.

    Raises:
        NotImplementedError: the data source is not ``"tushare_pro"`` and no
            instance is stored for the key.
    """
    cache_key = f"{wrapper_def.data_source}-{wrapper_def.data_source_auth}-{use_l3_cache}"
    if cache_key in cls._all_sdk_wrapper:
        return cls._all_sdk_wrapper[cache_key]

    if wrapper_def.data_source != "tushare_pro":
        raise NotImplementedError

    # Imported here, only when an instance actually has to be created.
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData

    # The auth value is passed positionally only when one is configured.
    auth = wrapper_def.data_source_auth
    positional_args = () if auth is None else (auth,)
    sdk_instance = TuShareProData(*positional_args, use_l3_cache=use_l3_cache)
    cls._all_sdk_wrapper[cache_key] = sdk_instance
    return sdk_instance
def upsert_chn_stock_name():
    """Upsert a ``FinancialInstrumentSymbol`` document for every SSE and SZSE stock.

    The symbol is stored with the Yahoo-style suffix and the stock name as
    ``full_name``. SSE is processed before SZSE.
    """
    tushare = TuShareProData()
    exchange_converters = (
        # Yahoo uses the suffix "SS" for the Shanghai market.
        ("SSE", lambda ts_code: ts_code.replace("SH", "SS")),
        # The Shenzhen suffix is the same on both sides.
        ("SZSE", lambda ts_code: ts_code),
    )
    for exchange, to_yahoo_symbol in exchange_converters:
        df_listing = tushare.stock_basic(exchange=exchange, cols=["ts_code", "name"])
        for _, row in df_listing.iterrows():
            document = FinancialInstrumentSymbol(symbol=to_yahoo_symbol(row["ts_code"]),
                                                 full_name=row['name'])
            upsert_document(document, False)
def run_category_prediction(model_name: str, model_inst_gid: str, pred_ds_cfg_path: str,
                            pred_name: str) -> pd.DataFrame:
    """Load a saved model, run the prediction and write the result to CSV.

    Args:
        model_name: model name, used for the model path and the output path.
        model_inst_gid: model instance id, used for the model path.
        pred_ds_cfg_path: path of the prediction-dataset workflow config file.
        pred_name: prediction name, used for the output path.

    Returns:
        The prediction frame, extended with the ``symbol_name``,
        ``y_true_label`` and ``y_pred_label`` columns.

    Raises:
        FileNotFoundError: ``pred_ds_cfg_path`` is not an existing file.
    """
    # Explicit check instead of `assert cond, logger.error(...)`: the assert
    # is stripped under -O and its message was always None.
    if not os.path.isfile(pred_ds_cfg_path):
        err_msg = f" cfg file {pred_ds_cfg_path} is not existed!"
        logger.error(err_msg)
        raise FileNotFoundError(err_msg)

    pred_ds_workflow_cfg, pred_ds_workflow_context = load_mapping_from_file(pred_ds_cfg_path)
    pred_ds = create_step_by_dict(pred_ds_workflow_cfg, pred_ds_workflow_context)

    model_inst_path = ModelPathGeneratorStep(model_name, model_inst_gid)
    model_with_weight_step = ModelWithWeightSaveLoadStep(_input_steps=[model_inst_path])

    # NOTE: the predict() input parameters are hard-coded for now; they may
    # later become parameters of the prediction workflow.
    df = model_with_weight_step.predict(
        pred_ds.tf_ds,
        y_true_col_index=1,
        additional_cols=[
            AdditionalColumnInDS(2, "symbol", TFDSSpecDataCodingType.utf8_str),
            AdditionalColumnInDS(3, "t", TFDSSpecDataCodingType.pd_timestamp),
        ])

    # Add the auxiliary columns.
    from gs_research_workflow.common.path_utilities import _is_colab_env
    tushare = TuShareProData(use_l3_cache=_is_colab_env())
    symbol_info_lookup = TushareSymbolToName(tushare)
    df["symbol_name"] = df.apply(partial(symbol_info_lookup, "name", "symbol"), axis=1)

    cat_label_mapping = CategoryIntToLabel(pred_ds)
    df["y_true_label"] = df.apply(partial(cat_label_mapping, "y_true"), axis=1)
    df["y_pred_label"] = df.apply(partial(cat_label_mapping, "y_pred"), axis=1)

    # The output file is named after the current timestamp.
    csv_path = os.path.join(get_prediction_output_path(model_name, pred_name),
                            datetime.now().strftime("%Y%m%d_%H%M%S") + ".csv")
    df.to_csv(csv_path, header=True, index=True, encoding="utf-8-sig", quotechar="\"")
    return df
def prepare_index_portfolio_data():
    """Probe which indexes of one market have constituent-weight data.

    Results are kept in ``/tmp/<mkt_code>_index_member.yml`` as
    ``{index_code: (index_name, member_count)}``. Codes already present in
    that file are skipped, so an interrupted run can be resumed.
    """
    import time
    import yaml

    ts_wrapper = TuShareProData(use_l3_cache=True)
    # Usable markets: "SSE" / "SZSE" / "CSI".
    # Markets without portfolio information: "CICC" / "MSCI" / "SW" / "OTH".
    ls_index_has_weight_data = []
    mkt_code = "OTH"
    # key: symbol code, value: (index_name, portfolio_count)
    dict_valid_index_code: Dict[str, Tuple[str, int]] = dict()
    yml_file_path = os.path.join("/tmp", f"{mkt_code}_index_member.yml")
    if os.path.isfile(yml_file_path):
        with open(yml_file_path, "r") as yaml_file:
            # yaml.load() requires an explicit Loader on PyYAML >= 6.
            # FullLoader is used because the dumped tuple values carry a
            # python/tuple tag that safe_load rejects. The file is written by
            # this function itself; do not point it at untrusted data.
            # An empty file loads as None, hence the fallback to an empty dict.
            dict_valid_index_code = yaml.load(yaml_file, Loader=yaml.FullLoader) or {}

    df_all_index = ts_wrapper.index_basic(market=mkt_code)
    print(f"{len(df_all_index)} indexes to query portfolio")
    for id_num, row in df_all_index.iterrows():
        idx_code = row["ts_code"]
        if idx_code in dict_valid_index_code:
            continue
        df_index_member = ts_wrapper.index_weight(symbol=idx_code)
        if len(df_index_member) > 0:
            ls_index_has_weight_data.append(idx_code)
            print(
                f"index[{id_num}]:{idx_code}-{row['name']}-{len(df_index_member)}"
            )
        else:
            # Pause whenever this counter reaches a multiple of five.
            if (id_num - len(ls_index_has_weight_data) + 1) % 5 == 0:
                print(
                    f"sleep for empty data {id_num} , count {id_num - len(ls_index_has_weight_data) + 1}"
                )
                time.sleep(5)
        dict_valid_index_code[idx_code] = (row['name'], len(df_index_member))
        # Kept simple: the whole mapping is dumped after every query.
        with open(yml_file_path, "w") as yaml_file:
            yaml.dump(dict_valid_index_code, yaml_file)
    print(f"total have index {len(ls_index_has_weight_data)}")
    print(ls_index_has_weight_data)
def cs_financial_statement_model_evaluate():
    """Predict the latest financial-statement period of one stock and print the error.

    The last 20 z-scored periods of the hard-coded symbol are fed to a
    pre-saved ``TSBertForMaskedCS`` model. The prediction is converted back
    from z-scores, and the relative difference to the true values of the last
    period is printed in ascending order.
    """
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep

    # Show all columns and all rows; limit the displayed cell width to 80.
    for option_name, option_value in (('display.max_columns', None),
                                      ('display.max_rows', None),
                                      ('max_colwidth', 80)):
        pd.set_option(option_name, option_value)

    symbol = "600315.SH"
    tushare = TuShareProData(use_l3_cache=True)

    # Model input: the last 20 z-scored periods.
    df_zscore = equity_all_financial_statement_zscore(tushare, symbol)
    comp_type = equity_comp_type(tushare, symbol)
    df_y_for_pred = df_zscore.iloc[-20:][:]
    df_y_true_original = equity_all_financial_statement_by_enddate(tushare, symbol)[-20:][:]
    model_inputs = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, False, True, False)
    input_ids, position_id, token_id, attention_mask_id = model_inputs

    # Load the model from the checkpoint path derived from its hyper-parameters.
    model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                           model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=12))
    model = TSBertForMaskedCS.from_pre_saved(model_hp.check_point_path)

    # Add the batch axis to every input before calling the model.
    batched_inputs = (input_ids[tf.newaxis, :], position_id[tf.newaxis, :],
                      token_id[tf.newaxis, :], attention_mask_id[tf.newaxis, :])
    y_pred = model(batched_inputs)

    # Drop the batch axis, then the COMP_TYPE row and the padded date columns.
    np_y_pred = y_pred[0].numpy()[0]
    np_y_pred = np_y_pred[1:, 0:df_y_for_pred.shape[1]]
    df_y_pred = pd.DataFrame(data=np_y_pred, index=df_y_for_pred.index, columns=df_y_for_pred.columns)

    # Convert the z-scores back to the original values.
    df_mean, df_std = equity_all_financial_statement_mean_and_std(tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)

    last_true = df_y_true_original.iloc[-1]
    last_pred = df_y_pred_orig_val.iloc[-1]
    delta_v = last_true - last_pred
    delta_percentage = delta_v / last_true
    print(f"delta_percentage:{delta_percentage.dropna().sort_values(ascending=True)}")
def equity_comp_type(tushare_sdk: TuShareProData, symbol: str) -> int:
    """Return the company type of ``symbol``.

    Per the original note: 1 general industry/commerce, 2 bank, 3 insurance,
    4 securities.

    The value is the first distinct ``comp_type`` of the income data queried
    with ``report_type=1``. If no value is available, or it cannot be
    converted to ``int``, the result is 1.
    """
    df_income = tushare_sdk.income(symbol=symbol, report_type=1)
    comp_type = df_income["comp_type"].unique()
    try:
        return int(comp_type[0])
    except (IndexError, TypeError, ValueError, OverflowError):
        # Empty result (IndexError) or a missing / non-numeric value.
        # Only conversion failures are caught; other errors propagate.
        return 1
def sync_equity_financial_statement(mkt_code: str = "SSE"):
    """Query the financial statements of every listed company of one exchange.

    For each stock the income, balance-sheet and cash-flow statements (by end
    date) are requested through the wrapper; the returned frames are not used
    here.

    Args:
        mkt_code: exchange code passed to ``stock_basic``.
    """
    os.environ["local_cache_expire_hours"] = "24"
    ts_wrapper = TuShareProData(use_l3_cache=False)
    df_stks = ts_wrapper.stock_basic(exchange=mkt_code, cols=["ts_code", "name"])
    req_freq_controller = TushareReqSleepController(ts_wrapper)

    # (query, extra keyword arguments), executed in this order for each symbol.
    statement_queries = (
        (ts_wrapper.income_by_enddate, {"to_single_period_val": True}),
        (ts_wrapper.balancesheet_by_enddate, {}),
        (ts_wrapper.cashflow_by_enddate, {"to_single_period_val": True}),
    )

    for id_num, row in df_stks.iterrows():
        print(f"\rsync_equity_financial_statement {id_num} : {row['ts_code']} - {row['name']} , total reqs {ts_wrapper.query_orig_source_count} ", end="")
        req_freq_controller.begin_internal_check()
        for query, extra_kwargs in statement_queries:
            query(symbol=row["ts_code"], **extra_kwargs)
        req_freq_controller.end_internal_check()
def sync_equity_to_sw_industry(sw_industry_lv: str, start_t: date, end_t: date, force_reinit: bool = False):
    """Store, per equity, the SW industry index it belongs to over time.

    The members of every SW industry of the given level are queried for the
    period, tagged with the industry's ``index_code`` and concatenated. The
    combined data is then split by symbol and upserted into the storage
    library of that level.

    Args:
        sw_industry_lv: one of ``"L1"``, ``"L2"``, ``"L3"``.
        start_t: first date of the membership period.
        end_t: last date of the membership period.
        force_reinit: forwarded to ``ts_upsert_arctic_storage``.
    """
    lv_to_lib_name = {"L1": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L1,
                      "L2": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L2,
                      "L3": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L3}
    assert sw_industry_lv in lv_to_lib_name.keys()
    lib_name = lv_to_lib_name[sw_industry_lv]

    tushare = TuShareProData(use_l3_cache=False)
    req_freq_controller = TushareReqSleepController(tushare)
    df_sw_index = tushare.index_classify(level=sw_industry_lv, src="SW")
    df_sw_index = df_sw_index.reset_index(drop=True)

    ls_df_index_equities = []
    for ind_code in df_sw_index["index_code"]:
        req_freq_controller.begin_internal_check()
        df_index_equities = tushare.period_index_member(index_code=ind_code, resample_freq="B",
                                                        start=start_t, end=end_t)
        req_freq_controller.end_internal_check()
        # period_index_member is treated as possibly returning None (another
        # caller guards for an industry with no members in the period). Such
        # industries are skipped rather than crashing on item assignment.
        if df_index_equities is None:
            continue
        df_index_equities["index_code"] = ind_code
        ls_df_index_equities.append(df_index_equities)

    # pd.concat raises on an empty list, so there is nothing to store.
    if not ls_df_index_equities:
        return

    df_all_index_equities = pd.concat(ls_df_index_equities)
    for i, symbol in enumerate(df_all_index_equities["symbol"].unique()):
        # Non in-place drop: the filtered frame is a slice of the combined frame.
        df_symbol_in_industry = df_all_index_equities[
            df_all_index_equities["symbol"] == symbol].drop(columns=["symbol"])
        print(f"\rsync_equity_to_sw_industry {i}:{symbol}-{df_symbol_in_industry.index.min()}-{df_symbol_in_industry.index.max()} ", end="")
        tushare.ts_upsert_arctic_storage(lib_name, symbol, df_symbol_in_industry, force_reinit=force_reinit)
def equity_all_financial_statement_by_enddate(
        tushare_sdk: TuShareProData,
        symbol: str,
        start_end_period: Tuple[Optional[date], Optional[date]] = (date(2008, 1, 1), date(2019, 12, 31)),
        to_single_period: bool = True) -> pd.DataFrame:
    """Return income, balance-sheet and cash-flow data of ``symbol`` joined in one frame.

    Args:
        tushare_sdk: data accessor used for the three statement queries.
        symbol: symbol whose statements are loaded.
        start_end_period: ``(start, end)`` passed to the period filter.
        to_single_period: forwarded as ``to_single_period_val`` to the income
            and cash-flow queries.

    Returns:
        The joined frame after ``_filter_df_by_start_end`` has been applied.
    """
    df_income = tushare_sdk.income_by_enddate(symbol=symbol, to_single_period_val=to_single_period)
    df_balance_sheet = tushare_sdk.balancesheet_by_enddate(symbol=symbol)
    df_cashflow = tushare_sdk.cashflow_by_enddate(symbol=symbol, to_single_period_val=to_single_period)

    # Left joins keep the income frame's index. Clashing column names get a
    # suffix for the statement they come from.
    df_income_and_bs = df_income.join(df_balance_sheet, how="left", lsuffix="_inc", rsuffix="_bs")
    df_joined = df_income_and_bs.join(df_cashflow, how="left", rsuffix="_cf")

    period_start, period_end = start_end_period
    return _filter_df_by_start_end(df_joined, period_start, period_end)
def __init__(self, start_t: date, end_t: date):
    """Load and join the x / y quotation data for ``[start_t, end_t]``.

    Args:
        start_t: start date passed to the quotation queries.
        end_t: end date passed to the quotation queries.
    """
    self.start_t = start_t
    # Reading from a pickle is handled by a different class.
    # The cache path is meant to be a hash of start_t, end_t and the features;
    # for now it depends on start / end only.
    self.tushare = TuShareProData()
    df_all_x = [
        self.tushare.equity_quotation_daily(x, start=start_t, end=end_t, cols=self.feature)
        for x in self.x_symbols
    ]
    df_y = self.tushare.index_quotation_daily(self.y_symbol, start=start_t, end=end_t,
                                              cols=[self.target_column])
    # The target stays in column 0; each symbol's features are joined on the
    # index, with a per-symbol suffix applied to clashing column names.
    self.df_all_data = df_y
    for x_symbol, x_df in zip(self.x_symbols, df_all_x):
        self.df_all_data = self.df_all_data.join(x_df, rsuffix=f"_{x_symbol}")
    # ffill() replaces fillna(method="ffill"), deprecated in pandas 2.x.
    self.df_all_data.ffill(inplace=True)
def sync_equity_cs_daily(start_t: date):
    """Query the equity cross-section daily data for every date since ``start_t``.

    The dates are taken from the index of the "000001.SH" daily quotation.
    For each date the cross-section quotation and basic data are requested;
    the returned frames are not used here.

    Args:
        start_t: first date of the index quotation used as the calendar.
    """
    os.environ["local_cache_expire_hours"] = "12"
    tushare = TuShareProData(use_l3_cache=False)
    df_index = tushare.index_quotation_daily("000001.SH", start=start_t, cols=["close"])
    req_freq_controller = TushareReqSleepController(tushare)

    for idx_date in df_index.index:
        # Reduce the index label to a plain date.
        cs_t = date(idx_date.year, idx_date.month, idx_date.day)
        print(f"\rEquity CS data @{cs_t} , total reqs {tushare.query_orig_source_count} ", end="")
        req_freq_controller.begin_internal_check()
        for cs_query in (tushare.cs_equity_quotation_daily, tushare.cs_equity_basic_daily):
            cs_query(start=cs_t, end=cs_t)
        req_freq_controller.end_internal_check()
def _calc_sw_industry_data(tushare: TuShareProData, sw_industry_code: str, start_t: date, end_t: date,
                           f_equity_quotation_daily=None, f_equity_basic_daily=None) -> pd.DataFrame:
    """Aggregate member-stock daily data into per-date data for one SW industry.

    The members of ``sw_industry_code`` in the period are joined with their
    daily quotation and daily basic data. Columns in ``mv_weighted_cols`` are
    aggregated per date as a ``total_mv``-weighted average; the remaining
    columns are aggregated by the plain group-by sum.

    Args:
        tushare: data accessor.
        sw_industry_code: industry index code.
        start_t: first date of the period.
        end_t: last date of the period.
        f_equity_quotation_daily: optional callable taking ``start`` / ``end``
            and returning the cross-section quotation data; defaults to
            ``tushare.cs_equity_quotation_daily``.
        f_equity_basic_daily: optional callable taking ``start`` / ``end`` and
            returning the cross-section basic data; defaults to
            ``tushare.cs_equity_basic_daily`` restricted to a fixed column list.

    Returns:
        The aggregated frame grouped by ``date``, or ``None`` when
        ``period_index_member`` returns ``None`` (despite the annotation).
    """
    # Holdings of the industry index on each date.
    df_stocks_in_industry = tushare.period_index_member(sw_industry_code, start=start_t, end=end_t, resample_freq="B")
    if df_stocks_in_industry is None:  # no stocks in the period: return immediately
        return
    # Index by (date, symbol) so the joins below match on both.
    # NOTE(review): assumes the two cross-section frames are indexed the same
    # way - confirm.
    df_stocks_in_industry = df_stocks_in_industry.reset_index().set_index(keys=["date", "symbol"])
    if f_equity_quotation_daily is None:
        f_equity_quotation_daily = tushare.cs_equity_quotation_daily
    if f_equity_basic_daily is None:
        f_equity_basic_daily = functools.partial(tushare.cs_equity_basic_daily,
                                                 cols=["turnover_rate", "turnover_rate_f", "volume_ratio", "pe",
                                                       "pe_ttm", "pb", "ps", "ps_ttm", "dv_ratio", "dv_ttm",
                                                       "total_share", "float_share", "free_share", "total_mv",
                                                       "circ_mv"])  # daily indicators
    mv_weighted_cols = ["open", "high", "low", "close", "pre_close", "change", "pct_chg", "turnover_rate",
                        "turnover_rate_f", "volume_ratio", "pe", "pe_ttm", "pb", "ps", "ps_ttm", "dv_ratio",
                        "dv_ttm"]  # columns weighted by total market value
    # sum_cols = ["vol", "amount", "total_share", "float_share", "free_share", "total_mv", "circ_mv"]
    # Get the period data of the individual stocks.
    df_cs_daily_industry_membership = df_stocks_in_industry.join(
        f_equity_quotation_daily(start=start_t, end=end_t), how="left").join(
        f_equity_basic_daily(start=start_t, end=end_t), how="left")
    # Weighted-sum step: value * total_mv is written to a temporary "_<col>"
    # column, the originals are dropped and the temporaries renamed back.
    for col in mv_weighted_cols:
        df_cs_daily_industry_membership[f"_{col}"] = df_cs_daily_industry_membership[col] * \
                                                     df_cs_daily_industry_membership["total_mv"]
    df_cs_daily_industry_membership.drop(columns=mv_weighted_cols, inplace=True)
    df_cs_daily_industry_membership.rename(columns={f"_{col}": col for col in mv_weighted_cols}, inplace=True)
    # Sum per date: weighted columns now hold sum(value * total_mv).
    df_cs_daily_industry_membership = df_cs_daily_industry_membership.reset_index().groupby("date").sum()
    # Divide by the summed total_mv to get the weighted average, using the
    # same temporary-column / drop / rename sequence.
    for col in mv_weighted_cols:
        df_cs_daily_industry_membership[f"_{col}"] = df_cs_daily_industry_membership[col] / \
                                                     df_cs_daily_industry_membership["total_mv"]
    df_cs_daily_industry_membership.drop(columns=mv_weighted_cols, inplace=True)
    df_cs_daily_industry_membership.rename(columns={f"_{col}": col for col in mv_weighted_cols}, inplace=True)
    return df_cs_daily_industry_membership
def prepare_fund_portfolio_data():
    """Probe which funds of market "E" have portfolio data.

    Funds whose ``invest_type`` matches one of the excluded kinds are skipped.
    Results are kept in ``/tmp/Fund_E_portfolio.yml`` as
    ``{fund_code: (fund_name, portfolio_row_count)}``. Codes already present
    in that file are skipped, so an interrupted run can be resumed.
    """
    import time
    import yaml

    # The unused `ts.pro_api(<token>)` client was removed: nothing in this
    # function read it, and it embedded an API token in source.
    ts_wrapper = TuShareProData(use_l3_cache=True)

    # Earlier exploration (removed commented-out code): the accumulated fund
    # NAV was considered as a by-value x feature, and the fund portfolio as
    # the membership data.

    # ---- Iterate over the funds, recording which have portfolio data ----
    ls_fund_has_weight_data = []
    mkt_code = "Fund_E"
    # key: symbol code, value: (fund_name, portfolio_count)
    dict_valid_fund_code: Dict[str, Tuple[str, int]] = dict()
    yml_file_path = os.path.join("/tmp", f"{mkt_code}_portfolio.yml")
    if os.path.isfile(yml_file_path):
        with open(yml_file_path, "r") as yaml_file:
            # yaml.load() requires an explicit Loader on PyYAML >= 6.
            # FullLoader is used because the dumped tuple values carry a
            # python/tuple tag that safe_load rejects. The file is written by
            # this function itself; do not point it at untrusted data.
            # An empty file loads as None, hence the fallback to an empty dict.
            dict_valid_fund_code = yaml.load(yaml_file, Loader=yaml.FullLoader) or {}

    df_all_fund = ts_wrapper.fund_basic(market="E")
    # Exclude funds whose invest_type matches any of these kinds.
    df_valid_fund = df_all_fund[~df_all_fund["invest_type"].str.contains("货币型|黄金现货|债券型|原油主题|期货型")]
    print(f"{len(df_valid_fund)} funds to query portfolio")
    for id_num, row in df_valid_fund.iterrows():
        fund_code = row["ts_code"]
        if fund_code in dict_valid_fund_code:
            continue
        df_fund_portfolio = ts_wrapper.fund_portfolio(symbol=fund_code)
        if len(df_fund_portfolio) > 0:
            ls_fund_has_weight_data.append(fund_code)
            print(
                f"fund [{id_num}]:{fund_code}-{row['name']}-{len(df_fund_portfolio)}"
            )
        else:
            # Pause whenever this counter reaches a multiple of five.
            if (id_num - len(ls_fund_has_weight_data) + 1) % 5 == 0:
                print(
                    f"sleep for empty data {id_num} , count {id_num - len(ls_fund_has_weight_data) + 1}"
                )
                time.sleep(5)
        dict_valid_fund_code[fund_code] = (row['name'], len(df_fund_portfolio))
        # Kept simple: the whole mapping is dumped after every query.
        with open(yml_file_path, "w") as yaml_file:
            yaml.dump(dict_valid_fund_code, yaml_file)
# -*- coding: UTF-8 -*- from datetime import date import pandas as pd import tushare as ts from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData from gs_research_workflow.time_series.data.utilities import val_convert_to_zscore ts_pro = ts.pro_api("8fe0d951588bf9b605de2cdce4a7b35a61c79ed3c6e128302dcca142") gs_ts_pro = TuShareProData(use_l3_cache=True) # 带有 t 的 category 的格式应该为:(date , category , Set[symbol]) # # # 方法一: 先只传 symbol , 在 loop 的时候 random 出几个 属于某个 category 的类 def test_equity_concept(): df_j1 = gs_ts_pro.cs_equity_quotation_daily(start=None, end=date(2019, 12, 31), look_period=10) df_j2 = gs_ts_pro.cs_equity_adj_factor(start=None, end=date(2019, 12, 31), look_period=10) dfI = gs_ts_pro.cs_equity_basic_daily(start=None, end=date(2019, 12, 31), look_period=10, cols=["pe"]) # df = gs_ts_pro.cs_equity_moneyflow(start=None, end=date(2019, 12, 31), look_period=10) # df = gs_ts_pro.cs_equity_margin_detail(start=None, end=date(2019, 12, 31), look_period=10) # df = gs_ts_pro.cs_equity_block_trade(start=None, end=date(2019, 12, 31), look_period=10) # df = gs_ts_pro.cs_equity_top_inst(start=None, end=date(2019, 12, 31), look_period=10) # print(df_j1) # print(df_j2) dfI = dfI[(dfI["pe"] < 20.0) & (dfI["pe"] > 10.0)] print(dfI) df_j = pd.DataFrame({"adj_open": df_j1["open"] * df_j2["adj_factor"], "adj_close": df_j1["close"] * df_j2["adj_factor"]
def __post_init__(self):
    """Create the TuShare data accessor and keep it on ``self._tushare``."""
    # TODO: init tushare query object
    self._tushare = TuShareProData()
def for_notebook_eval_cs_financial_statement_mask():
    """Notebook helper: predict the latest financial-statement period of one stock.

    A pre-saved ``TSBertForMaskedCS`` model is loaded from the directory
    derived from its hyper-parameters, and the last 20 z-scored periods of the
    hard-coded symbol are fed to it. The prediction is converted back from
    z-scores and a summary of true vs. predicted values of the last period is
    printed.

    Raises:
        RuntimeError: the model directory or the weight file does not exist.
    """
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS, TSBertName
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep
    from gs_research_workflow.common.serialization_utilities import cls_to_str
    from gs_research_workflow.time_series.data.utilities import de_zscore_to_val
    from gs_research_workflow.common.path_utilities import _DATA_ROOT
    import os
    import sys

    PRINT_HIGHLIGHT_STYLE = "\033[1;37;41m"

    # ---------- Only this part changes when another model is evaluated ----------
    # Per the original note, only num_attention_heads [6, 12] and
    # num_hidden_layers [8, 12, 16, 20] may be modified here.
    model_cls_str = cls_to_str(TSBertForMaskedCS)
    model_init_hp = TSBertForMaskedCS.HP(
        name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT,
        hidden_size=276,
        num_attention_heads=12)
    model_hp = TFModelStep(model_cls_str=model_cls_str, model_hp=model_init_hp)
    # ----------------------------------------------------------------------------

    # TFModelStep.check_point_path() must not be called here because it would
    # create the directory; the path is rebuilt by hand instead.
    checkpoint_path = os.path.join(_DATA_ROOT, "ModelData", model_hp.model_cls.__name__,
                                   model_hp.model_init_hp.get_hash_str())
    if not os.path.isdir(checkpoint_path):
        missing_dir_msg = f"model path '{checkpoint_path}' is not existed! please check the model hyper-parameters"
        print(PRINT_HIGHLIGHT_STYLE, missing_dir_msg)
        raise RuntimeError(missing_dir_msg)

    checkpoint_file = os.path.join(checkpoint_path, "tf_model.h5")
    if not os.path.exists(checkpoint_file):
        missing_file_msg = f"model weight file '{checkpoint_file}' is not existed"
        print(PRINT_HIGHLIGHT_STYLE, missing_file_msg)
        raise RuntimeError(missing_file_msg)

    model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)

    # ---- To evaluate another stock with the same model, change only this ----
    symbol = "600315.SH"  # the stock to predict
    # -------------------------------------------------------------------------

    # The code below needs no changes; re-run it after changing the parameters.
    # Prepare the data to display.
    import pandas as pd
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
    from gs_research_workflow.time_series.data.predefined_equity_apis import equity_all_financial_statement_zscore, \
        equity_comp_type, equity_all_financial_statement_mean_and_std, equity_all_financial_statement_by_enddate
    from gs_research_workflow.time_series.gs_steps.tf_ds_for_financial_statement import \
        FinancialStatementCSMaskedTFDatasetStep
    import tensorflow as tf

    # Show all columns and all rows; limit the displayed cell width to 80.
    for option_name, option_value in (('display.max_columns', None),
                                      ('display.max_rows', None),
                                      ('max_colwidth', 80)):
        pd.set_option(option_name, option_value)

    tushare = TuShareProData(use_l3_cache=True)
    df_zscore, series_mean, series_std = equity_all_financial_statement_zscore(
        tushare, symbol, ret_mean_and_std=True)
    comp_type = equity_comp_type(tushare, symbol)

    # For now only the last published period is predicted.
    df_y_for_pred = df_zscore.iloc[-20:][:]
    df_y_true_original = equity_all_financial_statement_by_enddate(tushare, symbol)[-20:][:]

    model_inputs = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, series_std * 100., False, True, False)
    input_ids, position_id, token_id, attention_mask_id = model_inputs

    # Add the batch axis to every input before calling the model.
    batched_inputs = (input_ids[tf.newaxis, :], position_id[tf.newaxis, :],
                      token_id[tf.newaxis, :], attention_mask_id[tf.newaxis, :])
    y_pred = model(batched_inputs)

    # Drop the batch axis, then the COMP_TYPE row and the padded date columns.
    np_y_pred = y_pred[0].numpy()[0]
    np_y_pred = np_y_pred[1:, 0:df_y_for_pred.shape[1]]
    df_y_pred = pd.DataFrame(data=np_y_pred,
                             index=df_y_for_pred.index,
                             columns=df_y_for_pred.columns)

    # Convert the z-scores back to the original values.
    df_mean, df_std = equity_all_financial_statement_mean_and_std(tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)

    last_true = df_y_true_original.iloc[-1]
    last_pred = df_y_pred_orig_val.iloc[-1]
    delta_v = last_true - last_pred
    delta_percentage = delta_v / last_true

    # Summary of the last period, restricted to rows where both values exist.
    df_pred_summary = pd.DataFrame({
        "true_val": last_true,
        "pred_val": last_pred
    }).dropna()
    df_pred_summary["delta_v"] = df_pred_summary["true_val"] - df_pred_summary["pred_val"]
    df_pred_summary["delta_percentage"] = (df_pred_summary["true_val"] - df_pred_summary["pred_val"]) * 100. / \
                                          df_pred_summary["true_val"]

    df_pred_zscore = pd.DataFrame({
        "true_val": df_zscore.iloc[-1],
        "pred_val": df_y_pred.iloc[-1]
    }).dropna()
    print(df_pred_summary)
if self._y_columns_count == 1: yield (self._x_df_data.iloc[i:i + self.x_loopback, :].to_numpy(), self._y_df_data[self._y_df_data.columns[0]] [i:i + self.y_predict_count].to_numpy()) else: yield ( self._x_df_data.iloc[i:i + self.x_loopback, :].to_numpy(), self._y_df_data.iloc[i:i + self.y_predict_count, :].to_numpy()) if __name__ == "__main__": from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData tushare = TuShareProData() gen = TSXYGenerator( x_symbols=["600000.SH", "600028.SH", "600050.SH"], y_symbol="000001.SH", x_get_data_objs=[ TSCallableData(tushare.equity_quotation_daily, ["close", "change", "vol", "amount"], "{symbol}_"), TSCallableData(tushare.equity_basic_daily, ["turnover_rate", "pe", "pb"], "{symbol}_") ], y_get_data_objs=[ TSCallableData(tushare.index_quotation_daily, ["close"], "{symbol}_") ], start_t=date(2010, 1, 1),
for symbol, s_category in self._all_samples: df = self._loop_get_data_and_join(symbol, self.x_get_data_objs, self.start_t, self.end_t) df = self.df_time_align.join(df) if self.f_fill_na: df = self.f_fill_na(df) # yield df[df.columns[1:]].to_numpy(), np.int(self.category_to_num(s_category)) yield df[df.columns[1:]].to_numpy(), np.array([int(self.category_to_num(s_category))]) if __name__ == "__main__": from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData, get_category_symbols, \ get_all_market_symbols import tensorflow as tf from tensorflow import keras tushare = TuShareProData() category_vocabs = ["密集调研", "高股息", "白马股", "养老金持股", "混改", "次新股", "回购", "兜底式增持", "筹码集中", "高送转", "社保重仓", "业绩预增", "股权争夺", "破发次新", "举牌股"] x_symbols_with_category, all_symbols_with_category = get_category_symbols(tushare, category_vocabs) all_stock_symbols = get_all_market_symbols(tushare) all_symbols_without_category = all_stock_symbols - all_symbols_with_category np.random.shuffle(x_symbols_with_category) split_ratio = 0.8 split_pos = int(len(x_symbols_with_category) * split_ratio) train_set, val_set = x_symbols_with_category[:split_pos], x_symbols_with_category[split_pos:] start_t = date(2018, 10, 1) end_t = date(2019, 10, 28) gen_kwargs = dict(x_symbols_with_category=train_set,