def add_one_equity_search_action(act_uuid: str, batch_action_uuid: str, generate_func: Callable,
                                 finished_triggered_func: Optional[Callable],
                                 equity_symbol: FinancialInstrumentSymbol, webapp_cfg: WebAppConfig,
                                 kw: str, additional_kw: str, category: str, sub_category: str,
                                 action_description: str) -> str:
    """Persist one equity-search browser action and append it to a dynamic batch.

    Returns the same ``act_uuid`` that was passed in, so callers can chain on it.
    """
    # Persist the browser-action document first so the batch entry refers to stored state.
    action_doc = GeneralBrowserActionInstance(
        uuid=act_uuid,
        from_workflow=TriggeredWebPagesCrawlWorkflow(uuid=batch_action_uuid),
        main_entity_type=EntityType.Equity.value,
        fin_instrument=equity_symbol,
        action_gen_func=cls_to_str(generate_func),
        gwa_cfg_name=webapp_cfg.full_cfg_name,
        gwa_kw=kw,
        gwa_additional_kw=additional_kw,
        action_category=category,
        action_sub_category=sub_category,
        action_description=action_description,
        ctime=datetime.now())
    upsert_document(action_doc, False)

    backend_action = general_desktop_browser_backend_action(
        action_doc.gwa_cfg_name, action_doc.gwa_kw, action_doc.gwa_additional_kw)
    # Serialize the optional completion callback; empty string means "no callback".
    finished_cb_str = cls_to_str(finished_triggered_func) if finished_triggered_func else ""
    append_actions_into_dynamic_batch_action(
        batch_action_uuid,
        cls_to_str(GeneralBrowserBackendProcess.process_action_result),
        finished_triggered_func=finished_cb_str,
        actions=[(backend_action, act_uuid, action_doc.action_description)])
    return act_uuid
def create_action_run_msg(self, action_doc: RPAActionDoc) -> StatefulObjectAndCommitStream:
    """Build the stateful run message for an RPA action and move its doc to Running state.

    Side effects: updates ``action_doc`` in MongoDB and records the action in the
    env state variable ``self._actions_in_running``.
    """
    act_uuid = action_doc.act_id
    act = create_stateful_object(act_uuid, WinRPAAction)
    act[WinRPAAction.action_uuid].VALUE = act_uuid
    # NOTE(review): pickle.loads on stored bytes — assumes action_doc.act is trusted data
    # written by this system; never feed it externally supplied payloads.
    act_obj: RPAAction = pickle.loads(action_doc.act)
    # Three fields must be filled in here: the creator's pk, its class, and the target PC.
    act_obj.creator_cls = cls_to_str(self.__class__)
    act_obj.creator_uuid = self.pk
    assigned_pc = next(iter(self.all_managed_pcs))  # NOTE: for now, always assign the first managed PC
    act_obj.action_executor_required_tags.append(f"pc_id:{assigned_pc}")
    act_obj.ctime = datetime.now()
    act[WinRPAAction.action].VALUE = act_obj
    # Sync the status information back into MongoDB.
    action_doc.status_flag = ActionStatusFlag.Running.value
    action_doc.add_to_exec_queue_t = datetime.now()
    action_doc.target_pc_id = assigned_pc
    upsert_document(action_doc, False)
    # Sync the env's state variable that tracks currently running actions.
    if self._actions_in_running.VALUE is None:
        self._actions_in_running.VALUE = dict()
    self._actions_in_running.VALUE[act_uuid] = datetime.now()
    self._actions_in_running.mark_changed()
    logger.info(f"start an RPA action {action_doc.act_id} - {action_doc.act_description}")
    return StatefulObjectAndCommitStream(act, self.uipath_action_stream)
def cs_bert_workflow():
    """Assemble the CS-BERT masked financial-statement training workflow, save it as a
    standard workflow yml under the samples package, and return the train step."""
    split_step = ChnEquityInputStep(train_val_split_ratio=0.9)
    # One masked-CS dataset step per split, each wired to the same equity split step.
    ds_specs = [("train", "lambda ds: ds.repeat().batch(50)", "train_ds"),
                ("validation", "lambda ds: ds.repeat().batch(50)", "val_ds"),
                ("evaluate", "lambda ds: ds.batch(10)", "test_ds")]
    input_steps = [
        (FinancialStatementCSMaskedTFDatasetStep(ds_pip=pip, _input_steps=[(split_step, split)]), role)
        for split, pip, role in ds_specs
    ]
    model_step = TFModelStep(
        model_cls_str=cls_to_str(TSBertForMaskedCS),
        model_hp=TSBertForMaskedCS.HP(
            name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT_CONST_MASK,
            hidden_size=FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT,
            num_attention_heads=12))
    input_steps.append(model_step)
    input_steps.append(get_compile_step(loss="ts_bert_mae", metrics=["ts_bert_mae", "ts_bert_mse"]))
    input_steps += get_recommend_fit_with_callback_steps(epochs=5, steps_per_epoch=700,
                                                         validation_steps=70)
    train_step = TFTrainStep(_input_steps=input_steps)
    # Save as a standard workflow definition file.
    from gs_research_workflow.samples import workflow_cfg
    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    f"{TSBertForMaskedCS.__name__}_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path, train_step.get_init_value_dict(True), None)
    return train_step
def test_multi_symbol_ts():
    """Smoke-test SymbolMultipleTSStep over three TuShare daily APIs for two symbols,
    printing the serialized init dict and basic stats of the first symbol's frame."""
    multi_symbol_ts = SymbolMultipleTSStep(
        data_query_class=cls_to_str(TuShareProData),
        apis_and_columns={
            "equity_basic_daily": ("fin_ind_",
                                   ["turnover_rate", "turnover_rate_f", "volume_ratio", "pe", "pe_ttm",
                                    "pb", "ps", "ps_ttm", "dv_ratio", "dv_ttm", "total_share",
                                    "free_share", "total_mv", "circ_mv"]),
            "equity_backward_adjust_daily": ("backward_adj_",
                                             ["open", "high", "low", "close", "pre_close", "change",
                                              "pct_chg", "vol", "amount"]),
            # BUG FIX: "sell_sm_vol" was listed twice; the duplicate is removed so the
            # column list matches the canonical moneyflow list used elsewhere in this project.
            "equity_moneyflow_daily": ("moneyflow_",
                                       ["buy_sm_vol", "buy_sm_amount", "sell_sm_vol", "sell_sm_amount",
                                        "buy_md_vol", "buy_md_amount", "sell_md_vol", "sell_md_amount",
                                        "buy_lg_vol", "buy_lg_amount", "sell_lg_vol", "sell_lg_amount",
                                        "buy_elg_vol", "buy_elg_amount", "sell_elg_vol", "sell_elg_amount",
                                        "net_mf_vol", "net_mf_amount"])
        },
        symbols=["600000.SH", "600050.SH"])
    print(json.dumps(multi_symbol_ts.get_init_value_dict(out_self_cls=True)))
    print(multi_symbol_ts.ts_data[0].info())
    print(multi_symbol_ts.ts_data[0].describe())
    print(multi_symbol_ts.ts_data[0].T)
def save_compiled_model_args(self, model_full_cfg_file_path: str):
    """Serialize (model class path, init hyper-params dict, compile kwargs) to a JSON file.

    NOTE: if compile_kwargs contains Metrics class instances they cannot be serialized
    to json or pickle, so for now we assume every metric in compile_kwargs is a string.
    """
    payload = (cls_to_str(self.model_cls),
               self.model_init_hp.get_init_value_dict(True),
               self.compile_kwargs)
    with open(model_full_cfg_file_path, "w") as cfg_file:
        cfg_file.write(json.dumps(payload))
def get_default_inception_model_classification_task_step(nb_classes: int) -> TFModelStep:
    """Build a TFModelStep carrying the default InceptionTime classification hyper-params.

    Parameters
    ----------
    nb_classes: int
        Number of output classes for the classification head.
    """
    # Build the HP tree bottom-up so each level is named and easy to tweak.
    block_hp = InceptionBlock.HP(stride=1, use_bottleneck=True)
    inception_time_hp = InceptionTimeBlock.HP(depth=6, use_residual=True,
                                              inception_block_hp=block_hp)
    return TFModelStep(
        model_cls_str=cls_to_str(InceptionTimeForClassification),
        model_hp=InceptionTimeForClassification.HP(nb_classes=nb_classes,
                                                   inception_time_hp=inception_time_hp))
def cs_financial_statement_model_evaluate():
    """Evaluate a pre-saved masked-CS BERT checkpoint on one equity and print the
    per-indicator relative error of the most recent reporting period."""
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep
    # Show all columns.
    pd.set_option('display.max_columns', None)
    # Show all rows.
    pd.set_option('display.max_rows', None)
    # Widen value display to 80 (default is 50).
    pd.set_option('max_colwidth', 80)

    # stks = ChnEquityInputStep()
    # tf_ds_step = FinancialStatementCSMaskedTFDatasetStep(df_equities=stks.train_items,
    #                                                      ds_pip="lambda ds: ds.repeat().batch(20)")
    # tf_ds_step._ds_generator_call()
    # for ele in tf_ds_step.tf_ds.take(10):
    #     print(ele)
    #     y = model(ele[0])
    #     loss = gs_mean_absolute_error(ele[1], y)
    #     print(loss)

    symbol = "600315.SH"
    tushare = TuShareProData(use_l3_cache=True)
    df_zscore = equity_all_financial_statement_zscore(tushare, symbol)
    comp_type = equity_comp_type(tushare, symbol)
    # Use the latest 20 reporting periods as model input / ground truth.
    df_y_for_pred = df_zscore.iloc[-20:][:]
    df_y_true_original = equity_all_financial_statement_by_enddate(tushare, symbol)[-20:][:]
    input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, False, True, False)
    # load model
    # model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
    #                        model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=6, num_hidden_layers=10))
    model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                           model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=12))
    checkpoint_path = model_hp.check_point_path
    model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)
    # add batch axis
    y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :], token_id[tf.newaxis, :],
                    attention_mask_id[tf.newaxis, :]))
    np_y_pred = y_pred[0].numpy()[0]  # drop the batch dimension
    np_y_pred = np_y_pred[1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE row and the padded date entries
    df_y_pred = pd.DataFrame(data=np_y_pred, index=df_y_for_pred.index, columns=df_y_for_pred.columns)
    # De-zscore back to the original values.
    df_mean, df_std = equity_all_financial_statement_mean_and_std(tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)
    # df_y_pred_orig_val = (df_y_for_pred/df_y_for_pred) *df_y_pred_orig_val
    delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
    delta_percentage = (df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]
    # print(f"y_true:{df_y_true_original.iloc[-1]}")
    # print(f"y_pred:{df_y_pred_orig_val.iloc[-1]}")
    # print(f"delta_v:{delta_v}")
    print(f"delta_percentage:{delta_percentage.dropna().sort_values(ascending=True)}")
def __post_init__(self):
    """Split the full China A-share universe (SSE + SZSE) into evaluation, training and
    validation frames, using the configured sample sizes and random state."""
    self._tushare = SDKWrapperContainer.get_sdk_by_cls_name(cls_to_str(TuShareProData),
                                                            {"use_l3_cache": _is_colab_env()})
    # Query both exchanges in the same order as before and stack the results.
    exchange_frames = [self._tushare.stock_basic(exchange=exch, cols=["ts_code", "name"])
                       for exch in ("SSE", "SZSE")]
    all_equities = pd.concat(exchange_frames)
    # Hold out a fixed-size evaluation sample first, then split the remainder into
    # train/validation by ratio. Same random_state keeps the split reproducible.
    self._df_eval = all_equities.sample(self.evaluate_items_count,
                                        random_state=self.random_state,
                                        axis=0).reset_index(drop=True)
    remaining = all_equities[~all_equities["ts_code"].isin(self._df_eval["ts_code"].to_list())]
    self._df_train = remaining.sample(frac=self.train_val_split_ratio,
                                      random_state=self.random_state,
                                      axis=0).reset_index(drop=True)
    self._df_val = remaining[~remaining["ts_code"].isin(
        self._df_train["ts_code"].to_list())].reset_index(drop=True)
def get_default_inception_with_attention_model_weight_prediction_task_step() -> TFModelStep:
    """Build a TFModelStep with the default InceptionTime-with-attention hyper-params
    for the weight-prediction task."""
    # Assemble the HP tree bottom-up: inception block -> attention block -> model HP.
    inner_block_hp = InceptionBlock.HP(stride=1, use_bottleneck=True)
    attention_block_hp = InceptionTimeWithAttentionBlock.HP(
        depth=6,
        use_residual=True,
        use_attention_at_input=True,
        use_attention_at_each_inception=True,
        use_attention_after_residual=True,
        inception_block_hp=inner_block_hp)
    return TFModelStep(
        model_cls_str=cls_to_str(InceptionTimeWithAttentionForWeightPrediction),
        model_hp=InceptionTimeWithAttentionForWeightPrediction.HP(
            inception_attention_hp=attention_block_hp))
def run_colab_experiment(template_file_name: str, experiment_name: str, trial_concurrency: int,
                         max_trial_num: int, tuner: Tuner, cls_hp_alias, hps_to_display: List[str],
                         search_space: List[Choice], vm=ExecTrialMachineType.Colab.value,
                         port: int = 8080):
    """Configure and launch an NNI HPO experiment whose trials run on remote VMs.

    Parameters
    ----------
    template_file_name: str
        Workflow yml file name under ``samples/workflow_cfg`` (relative to project root).
    hps_to_display: List[str]
        Hyper-parameter paths shown in NNI for display only (single-Choice, not tuned).
    search_space: List[Choice]
        The actual tunable search-space entries.
    """
    import os
    cfg_path = f"samples/workflow_cfg/{template_file_name}"  # path counted from the project root
    yml_abs_path = os.path.join(os.path.dirname(__file__), "../../..", cfg_path)
    workflow_cfg, workflow_context = load_mapping_from_file(yml_abs_path)
    experiment = LocalExperiment(authorName="GS_GROUP",
                                 experimentName=experiment_name,
                                 trialConcurrency=trial_concurrency,
                                 maxExecDuration="168h",
                                 tuner=tuner,
                                 maxTrialNum=max_trial_num
                                 )
    experiment.set_trial_module_and_args(gs_research_workflow.auto_ml.nni.hpo.trial_main_pod_side,
                                         cfg=cfg_path,
                                         pool="gs_google_acct_pool_1",
                                         name=experiment_name,
                                         vm=vm,
                                         cfg_alias=cls_to_str(cls_hp_alias))
    if vm == ExecTrialMachineType.Colab.value:
        prepare_gdrive_folder(experiment_name)
    ls_tune_paras = []
    # The entries below are important hps added for display only; they do not take part in
    # tuning (each keeps a single Choice item). Placing the non-tuned hyper-parameters
    # first makes NNI's visualisation look nicer.
    if hps_to_display:
        ls_tune_paras += [cfg_value_to_choice(workflow_cfg, workflow_context, cls_hp_alias, p)
                          for p in hps_to_display]
    if search_space:
        ls_tune_paras += search_space
    exp_search_space = SearchSpace(parameters=ls_tune_paras)
    experiment.set_search_space(exp_search_space)
    print("-" * 50)
    # NOTE: the VIRTUAL_ENV environment variable must be set so that nnictl can run
    # correctly inside the venv.
    # NOTE(review): `venv_path` is not defined in this chunk — presumably a module-level
    # setting; confirm it exists before relying on this branch.
    nnictl_folder = ""
    if venv_path:
        os.environ["VIRTUAL_ENV"] = venv_path
        nnictl_folder = f"{venv_path}bin/"
    experiment.run(nnictl_folder=nnictl_folder, port=port)
def __post_init__(self):
    """Build the tf.data pipeline yielding masked financial-statement samples.

    Each element is ``((input_ids, position_ids, token_ids, attention_mask), y_true)``
    with the shapes declared in ``output_shapes`` below.
    """
    self._tushare = SDKWrapperContainer.get_sdk_by_cls_name(cls_to_str(TuShareProData),
                                                            {"use_l3_cache": _is_colab_env()})
    self._tf_ds = tf.data.Dataset.from_generator(
        self._ds_generator_call,
        output_types=((tf.float32, tf.int32, tf.int32, tf.int32), tf.float32),
        output_shapes=(
            (
                # input_ids (with mask)
                tf.TensorShape(
                    [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH,
                     FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT]),
                # position_ids
                tf.TensorShape(
                    [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH]),
                # token_ids
                tf.TensorShape(
                    [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH]),
                # attention_mask
                tf.TensorShape(
                    [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH])
            ),
            # y_true
            tf.TensorShape(
                [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH,
                 FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT]),
        )
    )
    # Optionally wrap the raw dataset with the user-supplied pipeline expression string.
    if self.ds_pip:
        self._tf_ds_with_pip = FuncStrStep(func_body=self.ds_pip, single_input=self._tf_ds).func_result
    # Memoize the per-symbol queries so repeated generator passes avoid refetching.
    self._f_financial_statement = functools.lru_cache(maxsize=2500)(
        functools.partial(equity_all_financial_statement_zscore, tushare_sdk=self._tushare,
                          mean_base_t=None,
                          start_end_period=(date(2008, 1, 1), date(2019, 12, 31)),
                          ret_mean_and_std=True))
    self._f_comp_type = functools.lru_cache(maxsize=2500)(
        functools.partial(equity_comp_type, tushare_sdk=self._tushare))
def get_init_value_dict(self, out_self_cls: bool = False) -> Mapping[str, Any]:
    """Return the dictionary of init values describing this step and, recursively,
    its input steps.

    Notes: not exposed as a property, to avoid adding a property with no business
    meaning to the dataclass definition.
    Notes: the recursively nested dict relations are produced lazily; during init
    only the relevant data-link relations are kept.

    Parameters
    ----------
    out_self_cls: bool
        Whether to wrap the result in one extra layer keyed by the current class.
    """
    import copy
    from gs_research_workflow.core.gs_step_mapping import GlobalGSStepMapping

    # TODO: the composition case still needs handling here
    init_dict_rlt = dict()
    if self._direct_init_field_value:
        for k, v in self._direct_init_field_value.items():
            # Nested GSStep values serialize recursively; plain values are deep-copied so
            # later mutation of this step cannot leak into the returned mapping.
            if isinstance(v, GSStep):
                init_dict_rlt[k] = v.get_init_value_dict(out_self_cls)
            else:
                init_dict_rlt[k] = copy.deepcopy(v)
        # init_dict_rlt = copy.deepcopy(self._direct_init_field_value)
    if self._ls_input_steps:
        for curr_step in self._ls_input_steps:
            field_mapping = GlobalGSStepMapping.get_registered(
                curr_step[0].__class__, self.__class__, curr_step[1])
            # The pipeline option uses the format "#field1,field2# rule_name".
            key = "#" + ",".join(field_mapping.field_names) + "#"
            init_dict_rlt[key] = curr_step[0].get_init_value_dict(True)
            init_dict_rlt[key][_KEY_PROPERTIES] = ",".join(
                field_mapping.property_names)
            if curr_step[1] is not None:
                init_dict_rlt[key][_KEY_RULE_NAME] = curr_step[1]
    if out_self_cls:
        return {cls_to_str(self.__class__): init_dict_rlt}
    else:
        return init_dict_rlt
def create_equity_workflow(req: WorkflowRequest):
    """Create a triggered web-pages crawl workflow (and its dynamic batch action) for one equity.

    Flow:
    1. Unknown entity string -> persist an error-flagged workflow document and return.
    2. A workflow for the same symbol/account already ran within the current refresh
       period -> persist an error-flagged document and return.
    3. Otherwise persist a WaitToRun workflow plus a dynamic RPABatchAction, then invoke
       every action generator registered for the workflow name.
    """
    assert req.workflow_name in GSPredefinedWorkflow._value2member_map_
    equity_entity = find_equity(req.entity_str)
    if equity_entity is None:
        wf_batch_uuid = md5_str(
            f"{req.request_from_account}-{req.ctime.isoformat()}-{req.entity_str}")
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            # BUG FIX: was `req.para_begin` (copy/paste), which silently dropped the end bound.
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=f"Can't find equity symbol or name by '{req.entity_str}'")
        upsert_document(doc_wf, False)
        return

    # Entity found: generate the workflow content.
    wf_batch_uuid = md5_str(
        f"{equity_entity.symbol}-{req.workflow_name}-{req.para_begin}-{req.para_end}-{req.request_from_account}-{req.ctime.isoformat()}")
    # Look up the workflow's predefined refresh frequency.
    # wf_freq = "D"
    wf_freq = "1s"
    workflow_def = PredefinedWorkflow.objects(
        workflow_name=req.workflow_name).first()
    if workflow_def is not None:
        wf_freq = workflow_def.refresh_freq
    # Find the latest execution of this workflow for the symbol
    # (assumed per symbol + per account).
    latest_workflow_inst = TriggeredWebPagesCrawlWorkflow.objects(
        fin_instrument=equity_entity.symbol,
        workflow=req.workflow_name,
        submit_account=req.request_from_account,
        finish_or_error_flag__in=[
            WorkflowStatusFlag.WaitToRun.value,
            WorkflowStatusFlag.SuccessFinished.value
        ]).order_by("-submit_time").first()
    # If it falls within the same period, just record an error document.
    if latest_workflow_inst is not None and is_same_period(
            latest_workflow_inst.submit_time, req.ctime, wf_freq):
        logger.error(
            f"Workflow(uuid={latest_workflow_inst.uuid},ctime='{latest_workflow_inst.submit_time}') in the same period is existed."
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            main_entity_type=EntityType.Equity.value,
            fin_instrument=FinancialInstrumentSymbol(
                symbol=equity_entity.symbol),
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,  # BUG FIX: was req.para_begin
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=
            f"workflow '{req.workflow_name}'({equity_entity.symbol}) is executed at {latest_workflow_inst.submit_time} . No need to rerun now."
        )
        upsert_document(doc_wf, False)
        return

    # Create the workflow document.
    doc_wf = TriggeredWebPagesCrawlWorkflow(
        uuid=wf_batch_uuid,
        main_entity_type=EntityType.Equity.value,
        fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
        workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
        para_begin=req.para_begin,
        para_end=req.para_end,  # BUG FIX: was req.para_begin
        submit_account=req.request_from_account,
        submit_type=WorkflowSubmitType.HotKey.value,
        submit_time=req.ctime,
        finish_or_error_flag=WorkflowStatusFlag.WaitToRun.value)
    upsert_document(doc_wf, False)
    # Create the batch action.
    doc_batch_action = RPABatchAction(
        batch_id=wf_batch_uuid,
        is_dynamic_batch=True,
        from_function=cls_to_str(create_equity_workflow),
        ctime=req.ctime,
        status=ActionStatusFlag.WaitingForRun.value)
    upsert_document(doc_batch_action, False)
    # Invoke the registered action generators in order.
    # NOTE: direct dict access for now; switching to a function call later will
    # enable a proper registration mechanism.
    for func in WORKFLOW_NAME_TO_ACTION_GENERATORS.get(req.workflow_name, []):
        func(equity_entity, wf_batch_uuid)
    logger.info(f"Batch action '{wf_batch_uuid}' is created.")
def cs_bert_equity_daily_workflow():
    """Assemble the CS-BERT equity-daily training workflow (pool sampled across pe/pb/
    market-cap buckets), save it as a workflow yml, and return the train step."""
    start_t = date(2019, 1, 1)
    end_t = date(2019, 12, 31)
    # Sample instrument pools every 2 weeks; stop 92 days before end_t so forward
    # returns remain available inside the [start_t, end_t] window.
    i_t = IByTGeneratorStep(start_t=start_t, end_t=end_t - timedelta(days=92), sample_freq="2w",
                            train_val_split_ratio=0.95, evaluate_items_count=1,
                            use_concept_blocks=False,
                            ls_i_by_condition=[("low_pe", "pe > 3.0 and pe < 8.0"),
                                               ("mid_pe", "pe > 15.0 and pe < 30.0"),
                                               ("high_pe", "pe > 30.0 and pe < 80.0"),
                                               ("low_pb", "pb >= 0.6 and pb <= 0.8"),
                                               ("mid_pb", "pb >= 0.9 and pb <= 1.1"),
                                               ("high_pb", "pb >= 1.3 and pb <= 1.8"),
                                               ("sml_cap", "total_mv >= 5.0e5 and total_mv < 5.0e6"),
                                               ("mid_cap", "total_mv >= 8.0e6 and total_mv < 2.0e7"),
                                               ("large_cap", "total_mv >= 2.0e7")
                                               ])
    # EquityPoolTSDatasetStep(df_i_by_t=i_t.pool_by_t, i_start_t=start_t, i_end_t=end_t,
    #                         ds_pip="lambda ds: ds.repeat().batch(8)")
    train_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.repeat().batch(5)",
                                                i_start_t=start_t, i_end_t=end_t,
                                                _input_steps=[(i_t, "train")])
    val_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.repeat().batch(5)",
                                              i_start_t=start_t, i_end_t=end_t,
                                              _input_steps=[(i_t, "validation")])
    eval_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.batch(3)",
                                               i_start_t=start_t, i_end_t=end_t,
                                               _input_steps=[(i_t, "evaluate")])
    train_input_steps = [(train_ds_with_pip, "train_ds"), (val_ds_with_pip, "val_ds"),
                         (eval_ds_with_pip, "test_ds")]
    # model = TSBertForMaskedCS(
    #     hp=TSBertForMaskedCS.HP(hidden_size=EquityPoolTSDatasetStep.MAX_INDICATORS,
    #                             # one extra position reserved for padding 0
    #                             max_position_embeddings=EquityPoolTSDatasetStep.LOOK_PERIOD_ITEMS + 1,
    #                             type_vocab_size=EquityPoolTSDatasetStep.MAX_ENTITIES_PER_INST + 1,
    #                             num_attention_heads=12))
    model_name = TSBertName.CHN_EQUITY_DAILY_PREDICT_RETURN_LESS_INDICATORS
    model_step = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                             model_hp=TSBertForMaskedCS.HP(
                                 name=model_name,
                                 hidden_size=EquityPoolTSDatasetStep.MAX_INDICATORS,
                                 # "+ 1": one extra slot reserved as the padding 0
                                 max_position_embeddings=EquityPoolTSDatasetStep.LOOK_PERIOD_ITEMS + 1,
                                 type_vocab_size=EquityPoolTSDatasetStep.MAX_ENTITIES_PER_INST + 1,
                                 num_attention_heads=12)
                             )
    train_input_steps += [model_step]
    train_input_steps += [
        get_compile_step(loss="mae_align_to_y_true",
                         metrics=["mae_align_to_y_true", "mse_align_to_y_true"])]
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=2, steps_per_epoch=10000, validation_steps=150)
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=1, steps_per_epoch=14000, validation_steps=200)
    train_input_steps += get_recommend_fit_with_callback_steps(epochs=3, steps_per_epoch=5000,
                                                               validation_steps=200)
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=2, steps_per_epoch=20, validation_steps=4)
    train_step = TFTrainStep(_input_steps=train_input_steps)
    # Save as a standard workflow definition file.
    from gs_research_workflow.samples import workflow_cfg
    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    f"{model_name}_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path, train_step.get_init_value_dict(True), None)
    return train_step
def get_default_ts_bert_for_weight_prediction_task_step() -> TFModelStep:
    """Build a TFModelStep for TSBertForWeightPrediction with the default hyper-params."""
    # hidden_size must equal the lookback_period.
    default_hp = TSBertForWeightPrediction.HP(hidden_size=72)
    return TFModelStep(model_cls_str=cls_to_str(TSBertForWeightPrediction),
                       model_hp=default_hp)
def for_notebook_eval_cs_financial_statement_mask():
    """Notebook-style evaluation of the masked financial-statement BERT model.

    Loads a pre-saved checkpoint selected by hyper-parameters, predicts the most recent
    reporting periods of one symbol, and prints a true-vs-predicted summary frame.
    """
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS, TSBertName
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep
    from gs_research_workflow.common.serialization_utilities import cls_to_str
    from gs_research_workflow.time_series.data.utilities import de_zscore_to_val
    from gs_research_workflow.common.path_utilities import _DATA_ROOT
    import os
    import sys
    PRINT_HIGHLIGHT_STYLE = "\033[1;37;41m"
    # ---------- For different content, only the parameters in this section change ---------
    model_hp = TFModelStep(
        model_cls_str=cls_to_str(TSBertForMaskedCS),
        model_hp=TSBertForMaskedCS.HP(
            name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT,
            hidden_size=276,
            num_attention_heads=12)
    )
    # In model hp only num_attention_heads:[6,12] and num_hidden_layers[8,12,16,20] may be changed
    # ---------------------------------------------------------
    checkpoint_path = os.path.join(
        _DATA_ROOT, "ModelData", model_hp.model_cls.__name__,
        model_hp.model_init_hp.get_hash_str(
        ))  # must not call TFModelStep.check_point_path() here — it would create the directory
    if not os.path.isdir(checkpoint_path):
        print(
            PRINT_HIGHLIGHT_STYLE,
            f"model path '{checkpoint_path}' is not existed! please check the model hyper-parameters"
        )
        raise RuntimeError(
            f"model path '{checkpoint_path}' is not existed! please check the model hyper-parameters"
        )
    checkpoint_file = os.path.join(checkpoint_path, "tf_model.h5")
    if not os.path.exists(checkpoint_file):
        print(PRINT_HIGHLIGHT_STYLE,
              f"model weight file '{checkpoint_file}' is not existed")
        raise RuntimeError(
            f"model weight file '{checkpoint_file}' is not existed")
    model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)
    # -------------------------------------------------
    # If only the stock changes (not the model), only this cell needs adjusting
    symbol = "600315.SH"  # the stock to predict
    # -------------------------------------------------
    # The code below needs no modification; re-run it after changing the parameters above.
    # Prepare the data used for display.
    import pandas as pd
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
    from gs_research_workflow.time_series.data.predefined_equity_apis import equity_all_financial_statement_zscore, \
        equity_comp_type, equity_all_financial_statement_mean_and_std, equity_all_financial_statement_by_enddate
    from gs_research_workflow.time_series.gs_steps.tf_ds_for_financial_statement import \
        FinancialStatementCSMaskedTFDatasetStep
    import tensorflow as tf
    pd.set_option('display.max_columns', None)  # show all columns
    pd.set_option('display.max_rows', None)  # show all rows
    pd.set_option('max_colwidth', 80)
    tushare = TuShareProData(use_l3_cache=True)
    df_zscore, series_mean, series_std = equity_all_financial_statement_zscore(
        tushare, symbol, ret_mean_and_std=True)
    comp_type = equity_comp_type(tushare, symbol)
    df_y_for_pred = df_zscore.iloc[-20:][:]  # for now only predict against the last published periods
    df_y_true_original = equity_all_financial_statement_by_enddate(
        tushare, symbol)[-20:][:]
    input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, series_std * 100., False, True, False)
    y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :],
                    token_id[tf.newaxis, :],
                    attention_mask_id[tf.newaxis, :]))  # add batch axis
    np_y_pred = y_pred[0].numpy()[0]  # drop the batch dimension
    np_y_pred = np_y_pred[
        1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE row and padded date entries
    df_y_pred = pd.DataFrame(data=np_y_pred,
                             index=df_y_for_pred.index,
                             columns=df_y_for_pred.columns)
    # De-zscore back to the original values.
    df_mean, df_std = equity_all_financial_statement_mean_and_std(
        tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)
    delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
    delta_percentage = (
        df_y_true_original.iloc[-1] -
        df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]
    df_pred_summary = pd.DataFrame({
        "true_val": df_y_true_original.iloc[-1],
        "pred_val": df_y_pred_orig_val.iloc[-1]
    }).dropna()
    df_pred_summary[
        "delta_v"] = df_pred_summary["true_val"] - df_pred_summary["pred_val"]
    df_pred_summary["delta_percentage"] = (df_pred_summary["true_val"] -
                                           df_pred_summary["pred_val"]) * 100. / \
                                          df_pred_summary["true_val"]
    df_pred_zscore = pd.DataFrame({
        "true_val": df_zscore.iloc[-1],
        "pred_val": df_y_pred.iloc[-1]
    }).dropna()
    print(df_pred_summary)
# NOTE(review): fragment — the opening of the first statement (apparently a
# `SymbolTSStep(api="index_quotation_daily", ...)` call assigned to a time-align
# step) lies outside this chunk; tokens are kept exactly as found.
                              symbols="000001.SH", cols=["close"])
# Per-symbol x-feature time series: backward-adjusted daily quotation columns.
x_ts_data_step = SymbolTSStep(api="equity_backward_adjust_daily",
                              cols=[
                                  "open", "high", "low", "close", "pre_close",
                                  "change", "pct_chg", "vol", "amount"
                              ])
# Dataset built from the train/val split, the time-align series and the x-data callable.
# NOTE(review): `train_val_set` and `time_align_step` are defined in the part of this
# function that is not visible here.
train_val_tf_ds_step = DELTSCategoryMultiPeriodDatasetStep(_input_steps=[
    train_val_set, (time_align_step, "time_align"), (x_ts_data_step, "x_data_callable")
])
# Pipeline expressions applied to the train / validation datasets.
train_ds_step = FuncStrStep(func_body="lambda ds: ds.repeat().batch(10)")
val_ds_step = FuncStrStep(func_body="lambda ds: ds.repeat().batch(10)")
# InceptionTime classifier with 3 output classes.
model_step = TFModelStep(model_cls_str=cls_to_str(InceptionTime),
                         model_hp=InceptionTime.HP(nb_classes=3))
compile_step = CompileStep(loss="categorical_crossentropy", optimizer="Adam",
                           metrics=['accuracy'])
fit_step = FitStep(epochs=10, steps_per_epoch=4500, validation_steps=110)
checkpoint_step = ModelCheckPointStep(save_best_only=True, verbose=1)
tensor_board_step = TensorBoardStep(write_graph=False)
# Final training step wiring datasets, model, compile/fit configs and callbacks together.
train_step = TFTrainStep(_input_steps=[
    train_val_tf_ds_step, (train_ds_step, "train_pip_line"),
    (val_ds_step, "val_pip_line"), model_step, compile_step, fit_step,
    checkpoint_step, tensor_board_step
])
def category_by_membership_data(workflow_context: Dict = MARKET_CAP_INDEX_MEMBERSHIP_WORKFLOW_DEFAULT_CONTEXT,
                                category_api: str = "index_weight",
                                category_symbol_cols: List[str] = ["con_code"],
                                train_start_t: date = date(2014, 1, 1),
                                train_end_t: date = date(2019, 10, 31),
                                test_start_t: date = date(2019, 11, 1),
                                test_end_t: date = date(2019, 12, 1),
                                train_val_split_ratio: float = 0.85,
                                random_state: Optional[int] = 100) \
        -> Tuple[TSCategoryDatasetPreparingStep, TSCategoryDatasetPreparingStep, TSCategoryDatasetPreparingStep]:
    """Build train/val/test dataset-preparing steps that label equities by index membership.

    Returns
    -------
    - train_ds_step , val_ds_step , test_ds_step
    """
    # NOTE(review): `category_symbol_cols` uses a mutable default — harmless only as long
    # as no caller mutates it; consider `None` + in-body default if that ever changes.
    assert "LOCAL" in workflow_context
    assert "category_labels" in workflow_context["LOCAL"]
    assert "category_by_index_membership" in workflow_context["LOCAL"]
    assert "x_feature_query_class" in workflow_context["LOCAL"]
    assert "x_features_per_symbol" in workflow_context["LOCAL"]
    # Wrap every LOCAL context key in a GetContextStep bound to this workflow context.
    all_context_step = {
        k: GetContextStep(k)
        for k in workflow_context["LOCAL"].keys()
    }
    for k, v in all_context_step.items():
        v.SET_CONTEXT(workflow_context)
    # Map category labels onto the index symbols whose membership defines them.
    kv_convert_step = KeyValueListToMappingStep(
        _input_steps=[(all_context_step["category_labels"], "key_list"),
                      (all_context_step["category_by_index_membership"],
                       "value_list")])
    # The datasets used for training and for testing are separated by time period.
    # Roughly 28K data points.
    train_index_membership_ts_step = SymbolTSStep(api=category_api,
                                                  cols=category_symbol_cols,
                                                  _input_steps=[
                                                      (kv_convert_step, "symbols")
                                                  ],
                                                  start_t=train_start_t,
                                                  end_t=train_end_t)
    test_index_membership_ts_step = SymbolTSStep(api=category_api,
                                                 cols=category_symbol_cols,
                                                 _input_steps=[
                                                     (kv_convert_step, "symbols")
                                                 ],
                                                 start_t=test_start_t,
                                                 end_t=test_end_t)
    train_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                       _input_steps=[
                                           (train_index_membership_ts_step, "ts_process")
                                       ])
    test_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                      _input_steps=[
                                          (test_index_membership_ts_step, "ts_process")
                                      ])
    train_val_set = TrainValSpiltStep(_input_steps=[(train_concat_df_step, "train_val_orig_data")],
                                      split_ratio=train_val_split_ratio,
                                      random_state=random_state)
    # Hard-coded: the SSE Composite Index serves as the time-align series; relax this
    # only when supporting other markets.
    time_align_step = SymbolTSStep(api="index_quotation_daily",
                                   symbols="000001.SH",
                                   cols=["close"])
    x_orig_data_step = SymbolMultipleTSStep(_input_steps=[(
        all_context_step["x_feature_query_class"], "data_query_class"
    ), (all_context_step["x_features_per_symbol"], "apis_and_columns")])
    x_feature_callable_step = TSPeriodTSByLookbackStep(
        _input_steps=[(x_orig_data_step, "period_ts_callable"),
                      (time_align_step, "time_align")])
    train_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, (train_val_set, "train_set"),
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["train_val_ds_pip"], "ds_pip")
        ])
    val_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, (train_val_set, "val_set"),
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["train_val_ds_pip"], "ds_pip")
        ])
    test_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, test_concat_df_step,
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["test_ds_pip"], "ds_pip")
        ])
    return train_ds_with_pip, val_ds_with_pip, test_ds_with_pip
from gs_research_workflow.time_series.gs_steps.func_steps import FuncStrStep
from gs_research_workflow.time_series.gs_steps.ts_data_steps import SymbolTSStep, SymbolMultipleTSStep
from gs_research_workflow.time_series.gs_steps.data_structure_utility_steps import KeyValueListToMappingStep
from gs_research_workflow.time_series.gs_steps.local_context_step import GetContextStep

# Default workflow context: classify equities into market-cap buckets by index membership.
# NOTE(review): this mapping is truncated in the visible chunk (it ends mid-list);
# the remainder lies outside this range.
MARKET_CAP_INDEX_MEMBERSHIP_WORKFLOW_DEFAULT_CONTEXT = {
    "LOCAL": {
        "category_labels": ["BigCap", "MidCap", "SmlCap"],
        "category_by_index_membership": ["000043.SH", "000044.SH", "000045.SH"],
        # Which class the x-feature data is queried from
        "x_feature_query_class": cls_to_str(TuShareProData),
        # Features directly tied to each equity symbol
        "x_features_per_symbol": {
            "equity_basic_daily": ("fin_ind_", [
                "turnover_rate", "turnover_rate_f", "volume_ratio", "pe",
                "pe_ttm", "pb", "ps", "ps_ttm", "dv_ratio", "dv_ttm",
                "total_share", "free_share", "total_mv", "circ_mv"
            ]),
            "equity_backward_adjust_daily": ("backward_adj_", [
                "open", "high", "low", "close", "pre_close", "change",
                "pct_chg", "vol", "amount"
            ]),
            "equity_moneyflow_daily": ("moneyflow_", [
                "buy_sm_vol", "buy_sm_amount", "sell_sm_vol", "sell_sm_amount",
                "buy_md_vol", "buy_md_amount", "sell_md_vol", "sell_md_amount",
                "buy_lg_vol", "buy_lg_amount", "sell_lg_vol", "sell_lg_amount",
def category_prediction_workflow():
    """Build and save the prediction-dataset workflow for market-cap category prediction,
    then run a local verification pass with a pre-saved model."""
    WORKFLOW_CONTEXT = {
        "LOCAL": {
            "category_labels": ["BigCap", "MidCap", "SmlCap"],
            "category_by_index_membership":
            ["000043.SH", "000044.SH", "000045.SH"],
            "x_feature_from_api": "equity_backward_adjust_daily",
            "x_feature_columns": [
                "open", "high", "low", "close", "pre_close", "change",
                "pct_chg", "vol", "amount"
            ],
            "pred_ds_pip": "lambda ds: ds.batch(10)",
            "y_start_t": date(2019, 11, 1),
            "y_end_t": date(2019, 12, 1),
        }
    }
    # Wrap every LOCAL context key in a GetContextStep bound to this context.
    all_context_step = {
        k: GetContextStep(k)
        for k in WORKFLOW_CONTEXT["LOCAL"].keys()
    }
    for k, v in all_context_step.items():
        v.SET_CONTEXT(WORKFLOW_CONTEXT)
    # Map category labels onto the index symbols whose membership defines them.
    kv_convert_step = KeyValueListToMappingStep(
        _input_steps=[(all_context_step["category_labels"], "key_list"),
                      (all_context_step["category_by_index_membership"],
                       "value_list")])
    test_index_membership_ts_step = SymbolTSStep(
        api="index_weight",
        cols=["con_code"],
        _input_steps=[(kv_convert_step, "symbols"),
                      (all_context_step["y_start_t"], "start_t"),
                      (all_context_step["y_end_t"], "end_t")])
    # SSE Composite Index close prices serve as the time-align series.
    time_align_step = SymbolTSStep(api="index_quotation_daily",
                                   symbols="000001.SH",
                                   cols=["close"])
    test_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                      _input_steps=[
                                          (test_index_membership_ts_step, "ts_process")
                                      ])
    x_orig_data_step = SymbolTSStep(
        _input_steps=[(all_context_step["x_feature_from_api"], "api"),
                      (all_context_step["x_feature_columns"], "cols")])
    x_feature_callable_step = TSPeriodTSByLookbackStep(
        _input_steps=[(x_orig_data_step, "period_ts_callable"),
                      (time_align_step, "time_align")])
    pred_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=True,
        export_t_in_ds=True,
        _input_steps=[
            x_feature_callable_step, test_concat_df_step,
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["pred_ds_pip"], "ds_pip")
        ])
    # save test ds pip
    # NOTE: only the prediction-ds workflow needs saving here; the model can be defined
    # directly in the run env.
    from gs_research_workflow.samples import workflow_cfg
    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    "category_prediction_ds_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path,
                                   pred_ds_with_pip.get_init_value_dict(True),
                                   WORKFLOW_CONTEXT)
    # The code below is used for verification
    #
    # UUID="808CCED2DF57AE1BC7030C9B57F9A23A" for debug-73
    model_inst_path = ModelPathGeneratorStep(
        InceptionTime.__name__, "F24E10E3C3C556FC3FDC0C4B18EFA3C5")
    model_with_weight_step = ModelWithWeightSaveLoadStep(
        _input_steps=[model_inst_path])
    df = model_with_weight_step.predict(
        pred_ds_with_pip.tf_ds,
        y_true_col_index=1,
        additional_cols=[
            AdditionalColumnInDS(2, "symbol", TFDSSpecDataCodingType.utf8_str),
            AdditionalColumnInDS(3, "t", TFDSSpecDataCodingType.pd_timestamp)
        ])
    print(df)
# 适用于 function 搭建 pip line GlobalGSStepMapping.register( FuncStrStep, FuncStrStep, rule_name="single_ret_pip", diff_name={FuncStrStep.func_result: FuncStrStep.single_input}) GlobalGSStepMapping.register( FuncStrStep, FuncStrStep, rule_name="args_ret_pip", diff_name={FuncStrStep.func_result: FuncStrStep.args}) GlobalGSStepMapping.register( FuncStrStep, FuncStrStep, rule_name="kwargs_ret_pip", diff_name={FuncStrStep.func_result: FuncStrStep.kwargs}) reg_fields_from_local_step(FuncStrStep) if __name__ == "__main__": def print_x(x): print(x) # f_step = FuncStrStep(func_body="lambda x: print(x)") f_step = FuncStrStep(func_obj_str=cls_to_str(print_x)) f_step.func("abc")