Example #1
def add_one_equity_search_action(act_uuid: str, batch_action_uuid: str,
                                 generate_func: Callable,
                                 finished_triggered_func: Optional[Callable],
                                 equity_symbol: FinancialInstrumentSymbol,
                                 webapp_cfg: WebAppConfig, kw: str,
                                 additional_kw: str, category: str,
                                 sub_category: str,
                                 action_description: str) -> str:
    browser_action = GeneralBrowserActionInstance(
        uuid=act_uuid,
        from_workflow=TriggeredWebPagesCrawlWorkflow(uuid=batch_action_uuid),
        main_entity_type=EntityType.Equity.value,
        fin_instrument=equity_symbol,
        action_gen_func=cls_to_str(generate_func),
        gwa_cfg_name=webapp_cfg.full_cfg_name,
        gwa_kw=kw,
        gwa_additional_kw=additional_kw,
        action_category=category,
        action_sub_category=sub_category,
        action_description=action_description,
        ctime=datetime.now())
    upsert_document(browser_action, False)

    act_obj = general_desktop_browser_backend_action(
        browser_action.gwa_cfg_name, browser_action.gwa_kw,
        browser_action.gwa_additional_kw)
    str_finished_triggered_func = ""
    if finished_triggered_func:
        str_finished_triggered_func = cls_to_str(finished_triggered_func)
    append_actions_into_dynamic_batch_action(
        batch_action_uuid,
        cls_to_str(GeneralBrowserBackendProcess.process_action_result),
        finished_triggered_func=str_finished_triggered_func,
        actions=[(act_obj, act_uuid, browser_action.action_description)])
    return act_uuid
Example #2
    def create_action_run_msg(self, action_doc: RPAActionDoc) -> StatefulObjectAndCommitStream:
        act_uuid = action_doc.act_id
        act = create_stateful_object(act_uuid, WinRPAAction)
        act[WinRPAAction.action_uuid].VALUE = act_uuid

        act_obj: RPAAction = pickle.loads(action_doc.act)
        # Three things need to be filled in: the creator's pk, class, and the target pc
        act_obj.creator_cls = cls_to_str(self.__class__)
        act_obj.creator_uuid = self.pk
        assigned_pc = next(iter(self.all_managed_pcs))  # NOTE: assign a fixed machine for now
        act_obj.action_executor_required_tags.append(f"pc_id:{assigned_pc}")
        act_obj.ctime = datetime.now()
        act[WinRPAAction.action].VALUE = act_obj

        # Sync the info stored in mongo
        action_doc.status_flag = ActionStatusFlag.Running.value
        action_doc.add_to_exec_queue_t = datetime.now()
        action_doc.target_pc_id = assigned_pc
        upsert_document(action_doc, False)

        # Sync the env's state variables
        if self._actions_in_running.VALUE is None:
            self._actions_in_running.VALUE = dict()
        self._actions_in_running.VALUE[act_uuid] = datetime.now()
        self._actions_in_running.mark_changed()

        logger.info(f"start an RPA action {action_doc.act_id} - {action_doc.act_description}")

        return StatefulObjectAndCommitStream(act, self.uipath_action_stream)
Example #3
def cs_bert_workflow():
    equity_split_step = ChnEquityInputStep(train_val_split_ratio=0.9)
    train_ds_with_pip = FinancialStatementCSMaskedTFDatasetStep(ds_pip="lambda ds: ds.repeat().batch(50)",
                                                                _input_steps=[(equity_split_step, "train")])
    val_ds_with_pip = FinancialStatementCSMaskedTFDatasetStep(ds_pip="lambda ds: ds.repeat().batch(50)",
                                                              _input_steps=[(equity_split_step, "validation")])
    eval_ds_with_pip = FinancialStatementCSMaskedTFDatasetStep(ds_pip="lambda ds: ds.batch(10)",
                                                               _input_steps=[(equity_split_step, "evaluate")])
    train_input_steps = [(train_ds_with_pip, "train_ds"), (val_ds_with_pip, "val_ds"), (eval_ds_with_pip, "test_ds")]

    model_step = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                             model_hp=TSBertForMaskedCS.HP(
                                 name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT_CONST_MASK,
                                 hidden_size=FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT,
                                 num_attention_heads=12))
    train_input_steps += [model_step]
    train_input_steps += [get_compile_step(loss="ts_bert_mae", metrics=["ts_bert_mae", "ts_bert_mse"])]
    train_input_steps += get_recommend_fit_with_callback_steps(epochs=5, steps_per_epoch=700, validation_steps=70)

    train_step = TFTrainStep(_input_steps=train_input_steps)
    # Save it as a standard workflow
    from gs_research_workflow.samples import workflow_cfg

    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    f"{TSBertForMaskedCS.__name__}_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path, train_step.get_init_value_dict(True), None)
    return train_step
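The YAML written above can be read back with load_mapping_from_file, the same loader run_colab_experiment uses further down; a minimal round-trip sketch:

# loaded_cfg, loaded_context = load_mapping_from_file(sample_file_path)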
Example #4
 def test_multi_symbol_ts():
     multi_symbol_ts = SymbolMultipleTSStep(
         data_query_class=cls_to_str(TuShareProData),
         apis_and_columns={
             "equity_basic_daily": ("fin_ind_", ["turnover_rate", "turnover_rate_f",
                                                 "volume_ratio", "pe", "pe_ttm",
                                                 "pb", "ps",
                                                 "ps_ttm", "dv_ratio", "dv_ttm",
                                                 "total_share", "free_share",
                                                 "total_mv", "circ_mv"]),
             "equity_backward_adjust_daily": (
                 "backward_adj_", ["open", "high", "low", "close", "pre_close",
                                   "change", "pct_chg", "vol", "amount"]),
             "equity_moneyflow_daily": ("moneyflow_", ["buy_sm_vol", "buy_sm_amount",
                                                       "sell_sm_vol", "sell_sm_vol",
                                                       "sell_sm_amount",
                                                       "buy_md_vol", "buy_md_amount",
                                                       "sell_md_vol",
                                                       "sell_md_amount",
                                                       "buy_lg_vol",
                                                       "buy_lg_amount",
                                                       "sell_lg_vol",
                                                       "sell_lg_amount",
                                                       "buy_elg_vol",
                                                       "buy_elg_amount",
                                                       "sell_elg_vol",
                                                       "sell_elg_amount",
                                                       "net_mf_vol",
                                                       "net_mf_amount"])
         },
         symbols=["600000.SH", "600050.SH"])
     print(json.dumps(multi_symbol_ts.get_init_value_dict(out_self_cls=True)))
     print(multi_symbol_ts.ts_data[0].info())
     print(multi_symbol_ts.ts_data[0].describe())
     print(multi_symbol_ts.ts_data[0].T)
Example #5
 def save_compiled_model_args(self, model_full_cfg_file_path: str):
     # NOTE: if the metrics in compile_kwargs are Metrics classes, they cannot be
     #  serialized to json or pickle, so for now we assume every metric in compile_kwargs is a plain string
     obj_to_dump = (cls_to_str(self.model_cls),
                    self.model_init_hp.get_init_value_dict(True),
                    self.compile_kwargs)
     with open(model_full_cfg_file_path, "w") as f:
         json.dump(obj_to_dump, f)
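A possible counterpart for reading the dumped tuple back (a minimal sketch: load_compiled_model_args and str_to_cls are hypothetical helpers, and the "package.module.ClassName" string format is an assumption about what cls_to_str produces):

import importlib
import json


def str_to_cls(cls_path: str):
    # Hypothetical inverse of cls_to_str; assumes a "package.module.ClassName" string.
    module_path, _, cls_name = cls_path.rpartition(".")
    return getattr(importlib.import_module(module_path), cls_name)


def load_compiled_model_args(model_full_cfg_file_path: str):
    # Hypothetical inverse of save_compiled_model_args above: json.dump turned the
    # 3-tuple into a JSON array, so json.load yields a 3-element list.
    with open(model_full_cfg_file_path) as f:
        cls_path, hp_init_dict, compile_kwargs = json.load(f)
    return str_to_cls(cls_path), hp_init_dict, compile_kwargs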
Example #6
def get_default_inception_model_classification_task_step(nb_classes: int) -> TFModelStep:
    model_step = TFModelStep(model_cls_str=cls_to_str(InceptionTimeForClassification),
                             model_hp=InceptionTimeForClassification.HP(nb_classes=nb_classes,
                                                                        inception_time_hp=InceptionTimeBlock.HP(
                                                                            depth=6, use_residual=True,
                                                                            inception_block_hp=InceptionBlock.HP(
                                                                                stride=1,
                                                                                use_bottleneck=True))))
    return model_step
Example #7
    def cs_financial_statement_model_evaluate():
        from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS
        from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep

        # show all columns
        pd.set_option('display.max_columns', None)
        # show all rows
        pd.set_option('display.max_rows', None)
        # set the display width for values to 80 (the default is 50)
        pd.set_option('max_colwidth', 80)

        # stks = ChnEquityInputStep()
        # tf_ds_step = FinancialStatementCSMaskedTFDatasetStep(df_equities=stks.train_items,
        #                                                      ds_pip="lambda ds: ds.repeat().batch(20)")
        # tf_ds_step._ds_generator_call()
        # for ele in tf_ds_step.tf_ds.take(10):
        #     print(ele)
            # y = model(ele[0])
            # loss = gs_mean_absolute_error(ele[1], y)
            # print(loss)
        symbol = "600315.SH"
        tushare = TuShareProData(use_l3_cache=True)

        df_zscore = equity_all_financial_statement_zscore(tushare, symbol)
        comp_type = equity_comp_type(tushare, symbol)

        df_y_for_pred = df_zscore.iloc[-20:][:]
        df_y_true_original = equity_all_financial_statement_by_enddate(tushare, symbol)[-20:][:]
        input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
            df_y_for_pred, comp_type, False, True, False)
        # load model
        # model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
        #                        model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=6, num_hidden_layers=10))
        model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                               model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=12))

        checkpoint_path = model_hp.check_point_path
        model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)
        # add batch axis
        y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :], token_id[tf.newaxis, :],
                        attention_mask_id[tf.newaxis, :]))
        np_y_pred = y_pred[0].numpy()[0]  # drop the batch axis
        np_y_pred = np_y_pred[1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE dimension and the padded date values
        df_y_pred = pd.DataFrame(data=np_y_pred, index=df_y_for_pred.index, columns=df_y_for_pred.columns)

        # de-zscore back to the original values
        df_mean, df_std = equity_all_financial_statement_mean_and_std(tushare, symbol)
        df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)
        # df_y_pred_orig_val = (df_y_for_pred/df_y_for_pred) *df_y_pred_orig_val
        delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
        delta_percentage = (df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]

        # print(f"y_true:{df_y_true_original.iloc[-1]}")
        # print(f"y_pred:{df_y_pred_orig_val.iloc[-1]}")
        # print(f"delta_v:{delta_v}")
        print(f"delta_percentage:{delta_percentage.dropna().sort_values(ascending=True)}")
Example #8
 def __post_init__(self):
     self._tushare = SDKWrapperContainer.get_sdk_by_cls_name(cls_to_str(TuShareProData),
                                                             {"use_l3_cache": _is_colab_env()})
     df_all_equities = pd.concat([self._tushare.stock_basic(exchange="SSE", cols=["ts_code", "name"]),
                                  self._tushare.stock_basic(exchange="SZSE", cols=["ts_code", "name"])])
     self._df_eval = df_all_equities.sample(self.evaluate_items_count, random_state=self.random_state,
                                            axis=0).reset_index(drop=True)
     df_remain = df_all_equities[~df_all_equities["ts_code"].isin(self._df_eval["ts_code"].to_list())]
     self._df_train = df_remain.sample(frac=self.train_val_split_ratio, random_state=self.random_state, axis=0).reset_index(drop=True)
     self._df_val = df_remain[~df_remain["ts_code"].isin(self._df_train["ts_code"].to_list())].reset_index(drop=True)
Example #9
def get_default_inception_with_attention_model_weight_prediction_task_step() -> TFModelStep:
    model_step = TFModelStep(model_cls_str=cls_to_str(InceptionTimeWithAttentionForWeightPrediction),
                             model_hp=InceptionTimeWithAttentionForWeightPrediction.HP(
                                 inception_attention_hp=InceptionTimeWithAttentionBlock.HP(
                                     depth=6, use_residual=True,
                                     use_attention_at_input=True,
                                     use_attention_at_each_inception=True,
                                     use_attention_after_residual=True,
                                     inception_block_hp=InceptionBlock.HP(stride=1, use_bottleneck=True)
                                 )
                             ))
    return model_step
Example #10
def run_colab_experiment(template_file_name: str, experiment_name: str,
                         trial_concurrency: int, max_trial_num: int,
                         tuner: Tuner, cls_hp_alias, hps_to_display: List[str], search_space: List[Choice],
                         vm=ExecTrialMachineType.Colab.value,
                         port: int = 8080):
    import os
    cfg_path = f"samples/workflow_cfg/{template_file_name}"  # path counted from the project root
    yml_abs_path = os.path.join(os.path.dirname(__file__), "../../..", cfg_path)
    workflow_cfg, workflow_context = load_mapping_from_file(yml_abs_path)

    experiment = LocalExperiment(authorName="GS_GROUP",
                                 experimentName=experiment_name,
                                 trialConcurrency=trial_concurrency,
                                 maxExecDuration="168h",
                                 tuner=tuner,
                                 maxTrialNum=max_trial_num
                                 )
    experiment.set_trial_module_and_args(gs_research_workflow.auto_ml.nni.hpo.trial_main_pod_side,
                                         cfg=cfg_path,
                                         pool="gs_google_acct_pool_1",
                                         name=experiment_name,
                                         vm=vm,
                                         cfg_alias=cls_to_str(cls_hp_alias))

    if vm == ExecTrialMachineType.Colab.value:
        prepare_gdrive_folder(experiment_name)

    ls_tune_paras = []
    # The following are important hps included for display only; they take no part in
    # tuning and each keeps a single Choice entry.
    # Putting the non-tuned hyper-parameters first makes nni's charts look nicer.
    if hps_to_display:
        ls_tune_paras += [cfg_value_to_choice(workflow_cfg, workflow_context, cls_hp_alias, p) for p in
                          hps_to_display]
    if search_space:
        ls_tune_paras += search_space

    exp_search_space = SearchSpace(parameters=ls_tune_paras)
    experiment.set_search_space(exp_search_space)

    print("-" * 50)
    # NOTE: the VIRTUAL_ENV environment variable must be set so that nnictl can run in the venv
    # (venv_path is presumably a module-level setting pointing at the virtualenv root)

    nnictl_folder = ""
    if venv_path:
        os.environ["VIRTUAL_ENV"] = venv_path
        nnictl_folder = f"{venv_path}bin/"
    experiment.run(nnictl_folder=nnictl_folder, port=port)
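A hedged usage sketch for run_colab_experiment; every argument value below is a hypothetical placeholder (the template name matches the file written by cs_bert_workflow above):

# run_colab_experiment(
#     template_file_name="TSBertForMaskedCS_workflow_v1.yml",
#     experiment_name="cs_bert_hpo_demo",
#     trial_concurrency=2,
#     max_trial_num=50,
#     tuner=my_tuner,                    # an nni Tuner configuration object
#     cls_hp_alias=TSBertForMaskedCS.HP,
#     hps_to_display=["hidden_size"],    # shown in nni's charts, not tuned
#     search_space=[...],                # Choice entries that are actually tuned
#     port=8080)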
Example #11
    def __post_init__(self):
        self._tushare = SDKWrapperContainer.get_sdk_by_cls_name(cls_to_str(TuShareProData),
                                                                {"use_l3_cache": _is_colab_env()})

        self._tf_ds = tf.data.Dataset.from_generator(self._ds_generator_call,
                                                     output_types=(
                                                         (tf.float32, tf.int32, tf.int32, tf.int32),
                                                         tf.float32),
                                                     output_shapes=(
                                                         (
                                                             # input_ids(with mask)
                                                             tf.TensorShape(
                                                                 [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH,
                                                                  FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT]),
                                                             # position_ids
                                                             tf.TensorShape(
                                                                 [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH]),
                                                             # token_ids
                                                             tf.TensorShape(
                                                                 [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH]),
                                                             # attention_mask
                                                             tf.TensorShape(
                                                                 [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH])
                                                         ),

                                                         # y_true
                                                         tf.TensorShape(
                                                             [FinancialStatementCSBertConst.MAX_CS_PERIOD_LENGTH,
                                                              FinancialStatementCSBertConst.FIN_STATEMENT_INDICATORS_COUNT]),
                                                        )
                                                     )
        if self.ds_pip:
            self._tf_ds_with_pip = FuncStrStep(func_body=self.ds_pip, single_input=self._tf_ds).func_result

        self._f_financial_statement = functools.lru_cache(maxsize=2500)(
            functools.partial(equity_all_financial_statement_zscore, tushare_sdk=self._tushare,
                              mean_base_t=None, start_end_period=(date(2008, 1, 1), date(2019, 12, 31)),
                              ret_mean_and_std=True))
        self._f_comp_type = functools.lru_cache(maxsize=2500)(
            functools.partial(equity_comp_type, tushare_sdk=self._tushare))
Example #12
    def get_init_value_dict(self,
                            out_self_cls: bool = False) -> Mapping[str, Any]:
        """获取 init 的 dictionay 对象
        Notes : 这里不作为 property , 避免产生一个与 dataclass 定义业务意义无关的 property 内容
        Notes : 递归嵌套的 dict 关系是 lazy 产生的,init 过程中仅保留相关的数据链路关系

        Parameters
        ----------
        out_self_cls:bool
            是否多输出一层当前 class 的内容
        """
        import copy
        from gs_research_workflow.core.gs_step_mapping import GlobalGSStepMapping

        # TODO: handle the composition case here
        init_dict_rlt = dict()
        if self._direct_init_field_value:
            for k, v in self._direct_init_field_value.items():
                if isinstance(v, GSStep):
                    init_dict_rlt[k] = v.get_init_value_dict(out_self_cls)
                else:
                    init_dict_rlt[k] = copy.deepcopy(v)
        # init_dict_rlt = copy.deepcopy(self._direct_init_field_value)

        if self._ls_input_steps:
            for curr_step in self._ls_input_steps:
                field_mapping = GlobalGSStepMapping.get_registered(
                    curr_step[0].__class__, self.__class__, curr_step[1])
                # pip option; the format used is "#field1,field2#" + rule_name
                key = "#" + ",".join(field_mapping.field_names) + "#"
                init_dict_rlt[key] = curr_step[0].get_init_value_dict(True)
                init_dict_rlt[key][_KEY_PROPERTIES] = ",".join(
                    field_mapping.property_names)
                if curr_step[1] is not None:
                    init_dict_rlt[key][_KEY_RULE_NAME] = curr_step[1]

        if out_self_cls:
            return {cls_to_str(self.__class__): init_dict_rlt}
        else:
            return init_dict_rlt
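To make the recursive shape concrete, here is a generic, stdlib-only re-implementation of the same idea with toy dataclasses (not the GSStep API; the real version additionally encodes input-step links under "#field1,field2#" keys):

from dataclasses import dataclass, field


@dataclass
class ToyInner:
    size: int = 8


@dataclass
class ToyOuter:
    name: str = "demo"
    inner: ToyInner = field(default_factory=ToyInner)


def toy_init_value_dict(obj, out_self_cls: bool = False):
    # Recurse into nested dataclasses, mirroring how get_init_value_dict
    # recurses into nested GSStep values.
    rlt = {k: toy_init_value_dict(v, out_self_cls)
           if hasattr(v, "__dataclass_fields__") else v
           for k, v in vars(obj).items()}
    if out_self_cls:
        cls = type(obj)
        return {f"{cls.__module__}.{cls.__qualname__}": rlt}
    return rlt


# toy_init_value_dict(ToyOuter(), True) ->
# {"__main__.ToyOuter": {"name": "demo",
#                        "inner": {"__main__.ToyInner": {"size": 8}}}}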
Example #13
def create_equity_workflow(req: WorkflowRequest):
    assert req.workflow_name in GSPredefinedWorkflow._value2member_map_

    equity_entity = find_equity(req.entity_str)
    if equity_entity is None:
        wf_batch_uuid = md5_str(
            f"{req.request_from_account}-{req.ctime.isoformat()}-{req.entity_str}"
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=f"Can't find equity symbol or name by '{req.entity_str}'"
        )
        upsert_document(doc_wf, False)
        return

    # Entity found; build the workflow content
    wf_batch_uuid = md5_str(
        f"{equity_entity.symbol}-{req.workflow_name}-{req.para_begin}-{req.para_end}-{req.request_from_account}-{req.ctime.isoformat()}"
    )

    # Look up the workflow's preset refresh frequency
    # wf_freq = "D"
    wf_freq = "1s"
    workflow_def = PredefinedWorkflow.objects(
        workflow_name=req.workflow_name).first()
    if workflow_def is not None:
        wf_freq = workflow_def.refresh_freq
    # Find the latest execution time of this symbol's workflow (assumed per symbol + per account)
    latest_workflow_inst = TriggeredWebPagesCrawlWorkflow.objects(
        fin_instrument=equity_entity.symbol,
        workflow=req.workflow_name,
        submit_account=req.request_from_account,
        finish_or_error_flag__in=[
            WorkflowStatusFlag.WaitToRun.value,
            WorkflowStatusFlag.SuccessFinished.value
        ]).order_by("-submit_time").first()
    # If it falls within the same period, just record an error entry
    if latest_workflow_inst is not None and is_same_period(
            latest_workflow_inst.submit_time, req.ctime, wf_freq):
        logger.error(
            f"Workflow(uuid={latest_workflow_inst.uuid},ctime='{latest_workflow_inst.submit_time}') already exists in the same period."
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            main_entity_type=EntityType.Equity.value,
            fin_instrument=FinancialInstrumentSymbol(
                symbol=equity_entity.symbol),
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=
            f"workflow '{req.workflow_name}'({equity_entity.symbol}) was executed at {latest_workflow_inst.submit_time}; no need to rerun now."
        )
        upsert_document(doc_wf, False)
        return

    # Create a workflow
    doc_wf = TriggeredWebPagesCrawlWorkflow(
        uuid=wf_batch_uuid,
        main_entity_type=EntityType.Equity.value,
        fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
        workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
        para_begin=req.para_begin,
        para_end=req.para_end,
        submit_account=req.request_from_account,
        submit_type=WorkflowSubmitType.HotKey.value,
        submit_time=req.ctime,
        finish_or_error_flag=WorkflowStatusFlag.WaitToRun.value)
    upsert_document(doc_wf, False)

    # Create the batch action
    doc_batch_action = RPABatchAction(
        batch_id=wf_batch_uuid,
        is_dynamic_batch=True,
        from_function=cls_to_str(create_equity_workflow),
        ctime=req.ctime,
        status=ActionStatusFlag.WaitingForRun.value)
    upsert_document(doc_batch_action, False)

    # Call the action generator functions one by one
    # NOTE : this accesses the dict directly; switching to a function call later would allow register-style extension
    for func in WORKFLOW_NAME_TO_ACTION_GENERATORS.get(req.workflow_name, []):
        func(equity_entity, wf_batch_uuid)
    logger.info(f"Batch action '{wf_batch_uuid}' is created.")
Example #14
def cs_bert_equity_daily_workflow():
    start_t = date(2019, 1, 1)
    end_t = date(2019, 12, 31)

    i_t = IByTGeneratorStep(start_t=start_t, end_t=end_t - timedelta(days=92), sample_freq="2w",
                            train_val_split_ratio=0.95, evaluate_items_count=1,
                            use_concept_blocks=False, ls_i_by_condition=[("low_pe", "pe > 3.0 and pe < 8.0"),
                                                                         ("mid_pe", "pe > 15.0 and pe < 30.0"),
                                                                         ("high_pe", "pe > 30.0 and pe < 80.0"),
                                                                         ("low_pb", "pb >= 0.6 and pb <= 0.8"),
                                                                         ("mid_pb", "pb >= 0.9 and pb <= 1.1"),
                                                                         ("high_pb", "pb >= 1.3 and pb <= 1.8"),
                                                                         ("sml_cap",
                                                                          "total_mv >= 5.0e5 and total_mv < 5.0e6"),
                                                                         ("mid_cap",
                                                                          "total_mv >= 8.0e6 and total_mv < 2.0e7"),
                                                                         ("large_cap", "total_mv >= 2.0e7")
                                                                         ])

    # EquityPoolTSDatasetStep(df_i_by_t=i_t.pool_by_t, i_start_t=start_t, i_end_t=end_t,
    #                         ds_pip="lambda ds: ds.repeat().batch(8)")

    train_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.repeat().batch(5)",
                                                i_start_t=start_t, i_end_t=end_t,
                                                _input_steps=[(i_t, "train")])
    val_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.repeat().batch(5)",
                                              i_start_t=start_t, i_end_t=end_t,
                                              _input_steps=[(i_t, "validation")])
    eval_ds_with_pip = EquityPoolTSDatasetStep(ds_pip="lambda ds: ds.batch(3)",
                                               i_start_t=start_t, i_end_t=end_t,
                                               _input_steps=[(i_t, "evaluate")])

    train_input_steps = [(train_ds_with_pip, "train_ds"), (val_ds_with_pip, "val_ds"), (eval_ds_with_pip, "test_ds")]

    # model = TSBertForMaskedCS(
    #     hp=TSBertForMaskedCS.HP(hidden_size=EquityPoolTSDatasetStep.MAX_INDICATORS,
    #                             # one extra 0 slot used for padding
    #                             max_position_embeddings=EquityPoolTSDatasetStep.LOOK_PERIOD_ITEMS + 1,
    #                             type_vocab_size=EquityPoolTSDatasetStep.MAX_ENTITIES_PER_INST + 1,
    #                             num_attention_heads=12))

    model_name = TSBertName.CHN_EQUITY_DAILY_PREDICT_RETURN_LESS_INDICATORS
    model_step = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                             model_hp=TSBertForMaskedCS.HP(
                                 name=model_name,
                                 hidden_size=EquityPoolTSDatasetStep.MAX_INDICATORS,
                                 max_position_embeddings=EquityPoolTSDatasetStep.LOOK_PERIOD_ITEMS + 1,
                                 type_vocab_size=EquityPoolTSDatasetStep.MAX_ENTITIES_PER_INST + 1,
                                 num_attention_heads=12)
                             )
    train_input_steps += [model_step]
    train_input_steps += [
        get_compile_step(loss="mae_align_to_y_true", metrics=["mae_align_to_y_true", "mse_align_to_y_true"])]
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=2, steps_per_epoch=10000, validation_steps=150)
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=1, steps_per_epoch=14000, validation_steps=200)
    train_input_steps += get_recommend_fit_with_callback_steps(epochs=3, steps_per_epoch=5000, validation_steps=200)
    # train_input_steps += get_recommend_fit_with_callback_steps(epochs=2, steps_per_epoch=20, validation_steps=4)

    train_step = TFTrainStep(_input_steps=train_input_steps)
    # Save it as a standard workflow
    from gs_research_workflow.samples import workflow_cfg

    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    f"{model_name}_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path, train_step.get_init_value_dict(True), None)
    return train_step
Example #15
def get_default_ts_bert_for_weight_prediction_task_step() -> TFModelStep:
    model_step = TFModelStep(model_cls_str=cls_to_str(TSBertForWeightPrediction),
                             model_hp=TSBertForWeightPrediction.HP(hidden_size=72))  # hidden_size must equal lookback_period
    return model_step
Example #16
def for_notebook_eval_cs_financial_statement_mask():
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS, TSBertName
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep
    from gs_research_workflow.common.serialization_utilities import cls_to_str
    from gs_research_workflow.time_series.data.utilities import de_zscore_to_val
    from gs_research_workflow.common.path_utilities import _DATA_ROOT
    import os
    import sys

    PRINT_HIGHLIGHT_STYLE = "\033[1;37;41m"
    #  ---------- For a different run, only the parameters in this section need changing  ---------
    model_hp = TFModelStep(
        model_cls_str=cls_to_str(TSBertForMaskedCS),
        model_hp=TSBertForMaskedCS.HP(
            name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT,
            hidden_size=276,
            num_attention_heads=12)
    )  # model hp: only num_attention_heads [6, 12] and num_hidden_layers [8, 12, 16, 20] may be changed here
    # ---------------------------------------------------------

    checkpoint_path = os.path.join(
        _DATA_ROOT, "ModelData", model_hp.model_cls.__name__,
        model_hp.model_init_hp.get_hash_str(
        ))  # don't call TFModelStep.check_point_path() here; it would create the directory
    if not os.path.isdir(checkpoint_path):
        print(
            PRINT_HIGHLIGHT_STYLE,
            f"model path '{checkpoint_path}' does not exist! please check the model hyper-parameters"
        )
        raise RuntimeError(
            f"model path '{checkpoint_path}' does not exist! please check the model hyper-parameters"
        )
    checkpoint_file = os.path.join(checkpoint_path, "tf_model.h5")
    if not os.path.exists(checkpoint_file):
        print(PRINT_HIGHLIGHT_STYLE,
              f"model weight file '{checkpoint_file}' does not exist")
        raise RuntimeError(
            f"model weight file '{checkpoint_file}' does not exist")
    model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)

    # -------------------------------------------------

    # If you only want to change the stock rather than the model, adjust this cell only
    symbol = "600315.SH"  # the stock to predict

    # -------------------------------------------------

    # This part of the code needs no changes; just re-run it after changing the parameters above
    # Prepare the data for display

    import pandas as pd
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
    from gs_research_workflow.time_series.data.predefined_equity_apis import equity_all_financial_statement_zscore, \
        equity_comp_type, equity_all_financial_statement_mean_and_std, equity_all_financial_statement_by_enddate
    from gs_research_workflow.time_series.gs_steps.tf_ds_for_financial_statement import \
        FinancialStatementCSMaskedTFDatasetStep
    import tensorflow as tf

    pd.set_option('display.max_columns', None)  # show all columns
    pd.set_option('display.max_rows', None)  # show all rows
    pd.set_option('max_colwidth', 80)

    tushare = TuShareProData(use_l3_cache=True)

    df_zscore, series_mean, series_std = equity_all_financial_statement_zscore(
        tushare, symbol, ret_mean_and_std=True)
    comp_type = equity_comp_type(tushare, symbol)

    df_y_for_pred = df_zscore.iloc[-20:][:]  # for now, only the last period of already-published data is predicted
    df_y_true_original = equity_all_financial_statement_by_enddate(
        tushare, symbol)[-20:][:]
    input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, series_std * 100., False, True, False)

    y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :],
                    token_id[tf.newaxis, :],
                    attention_mask_id[tf.newaxis, :]))  # add batch axis
    np_y_pred = y_pred[0].numpy()[0]  # drop the batch axis
    np_y_pred = np_y_pred[
        1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE dimension and the padded date values
    df_y_pred = pd.DataFrame(data=np_y_pred,
                             index=df_y_for_pred.index,
                             columns=df_y_for_pred.columns)

    # de-zscore back to the original values
    df_mean, df_std = equity_all_financial_statement_mean_and_std(
        tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)

    delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
    delta_percentage = (
        df_y_true_original.iloc[-1] -
        df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]

    df_pred_summary = pd.DataFrame({
        "true_val": df_y_true_original.iloc[-1],
        "pred_val": df_y_pred_orig_val.iloc[-1]
    }).dropna()
    df_pred_summary[
        "delta_v"] = df_pred_summary["true_val"] - df_pred_summary["pred_val"]
    df_pred_summary["delta_percentage"] = (df_pred_summary["true_val"] - df_pred_summary["pred_val"]) * 100. / \
                                          df_pred_summary["true_val"]

    df_pred_zscore = pd.DataFrame({
        "true_val": df_zscore.iloc[-1],
        "pred_val": df_y_pred.iloc[-1]
    }).dropna()

    print(df_pred_summary)
                                   symbols="000001.SH",
                                   cols=["close"])
    x_ts_data_step = SymbolTSStep(api="equity_backward_adjust_daily",
                                  cols=[
                                      "open", "high", "low", "close",
                                      "pre_close", "change", "pct_chg", "vol",
                                      "amount"
                                  ])

    train_val_tf_ds_step = DELTSCategoryMultiPeriodDatasetStep(_input_steps=[
        train_val_set, (time_align_step,
                        "time_align"), (x_ts_data_step, "x_data_callable")
    ])
    train_ds_step = FuncStrStep(func_body="lambda ds: ds.repeat().batch(10)")
    val_ds_step = FuncStrStep(func_body="lambda ds: ds.repeat().batch(10)")
    model_step = TFModelStep(model_cls_str=cls_to_str(InceptionTime),
                             model_hp=InceptionTime.HP(nb_classes=3))
    compile_step = CompileStep(loss="categorical_crossentropy",
                               optimizer="Adam",
                               metrics=['accuracy'])
    fit_step = FitStep(epochs=10, steps_per_epoch=4500, validation_steps=110)
    checkpoint_step = ModelCheckPointStep(save_best_only=True, verbose=1)
    tensor_board_step = TensorBoardStep(write_graph=False)

    train_step = TFTrainStep(_input_steps=[
        train_val_tf_ds_step, (train_ds_step,
                               "train_pip_line"), (val_ds_step,
                                                   "val_pip_line"), model_step,
        compile_step, fit_step, checkpoint_step, tensor_board_step
    ])
Example #18
def category_by_membership_data(workflow_context: Dict = MARKET_CAP_INDEX_MEMBERSHIP_WORKFLOW_DEFAULT_CONTEXT,
                                category_api: str = "index_weight",
                                category_symbol_cols: List[str] = ["con_code"],
                                train_start_t: date = date(2014, 1, 1), train_end_t: date = date(2019, 10, 31),
                                test_start_t: date = date(2019, 11, 1), test_end_t: date = date(2019, 12, 1),
                                train_val_split_ratio: float = 0.85, random_state: Optional[int] = 100) \
        -> Tuple[TSCategoryDatasetPreparingStep, TSCategoryDatasetPreparingStep, TSCategoryDatasetPreparingStep]:
    """
    Returns
    -------
    -
        train_ds_step , val_ds_step , test_ds_step
    """
    assert "LOCAL" in workflow_context
    assert "category_labels" in workflow_context["LOCAL"]
    assert "category_by_index_membership" in workflow_context["LOCAL"]
    assert "x_feature_query_class" in workflow_context["LOCAL"]
    assert "x_features_per_symbol" in workflow_context["LOCAL"]

    all_context_step = {
        k: GetContextStep(k)
        for k in workflow_context["LOCAL"].keys()
    }
    for k, v in all_context_step.items():
        v.SET_CONTEXT(workflow_context)

    kv_convert_step = KeyValueListToMappingStep(
        _input_steps=[(all_context_step["category_labels"], "key_list"),
                      (all_context_step["category_by_index_membership"],
                       "value_list")])

    # The datasets used for training and those used for testing are separated by different time ranges
    # (roughly 28K data points)
    train_index_membership_ts_step = SymbolTSStep(api=category_api,
                                                  cols=category_symbol_cols,
                                                  _input_steps=[
                                                      (kv_convert_step,
                                                       "symbols")
                                                  ],
                                                  start_t=train_start_t,
                                                  end_t=train_end_t)

    test_index_membership_ts_step = SymbolTSStep(api=category_api,
                                                 cols=category_symbol_cols,
                                                 _input_steps=[
                                                     (kv_convert_step,
                                                      "symbols")
                                                 ],
                                                 start_t=test_start_t,
                                                 end_t=test_end_t)

    train_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                       _input_steps=[
                                           (train_index_membership_ts_step,
                                            "ts_process")
                                       ])

    test_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                      _input_steps=[
                                          (test_index_membership_ts_step,
                                           "ts_process")
                                      ])

    train_val_set = TrainValSpiltStep(_input_steps=[(train_concat_df_step,
                                                     "train_val_orig_data")],
                                      split_ratio=train_val_split_ratio,
                                      random_state=random_state)

    # Hardcoded to use the SSE Composite Index as the time-align target; this will only be made configurable when other markets are supported
    time_align_step = SymbolTSStep(api="index_quotation_daily",
                                   symbols="000001.SH",
                                   cols=["close"])

    x_orig_data_step = SymbolMultipleTSStep(_input_steps=[(
        all_context_step["x_feature_query_class"], "data_query_class"
    ), (all_context_step["x_features_per_symbol"], "apis_and_columns")])

    x_feature_callable_step = TSPeriodTSByLookbackStep(
        _input_steps=[(x_orig_data_step,
                       "period_ts_callable"), (time_align_step, "time_align")])

    train_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, (train_val_set, "train_set"),
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["train_val_ds_pip"], "ds_pip")
        ])

    val_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, (train_val_set, "val_set"),
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["train_val_ds_pip"], "ds_pip")
        ])

    test_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=False,
        export_t_in_ds=False,
        _input_steps=[
            x_feature_callable_step, test_concat_df_step,
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["test_ds_pip"], "ds_pip")
        ])
    return train_ds_with_pip, val_ds_with_pip, test_ds_with_pip
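A hedged usage sketch: with all defaults, the three dataset-preparing steps come straight from the module-level default context (defined in the next example) and can then be wired into a TFTrainStep via _input_steps like the other workflow examples above:

# train_ds_step, val_ds_step, test_ds_step = category_by_membership_data()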
Example #19
from gs_research_workflow.time_series.gs_steps.func_steps import FuncStrStep

from gs_research_workflow.time_series.gs_steps.ts_data_steps import SymbolTSStep, SymbolMultipleTSStep

from gs_research_workflow.time_series.gs_steps.data_structure_utility_steps import KeyValueListToMappingStep

from gs_research_workflow.time_series.gs_steps.local_context_step import GetContextStep

MARKET_CAP_INDEX_MEMBERSHIP_WORKFLOW_DEFAULT_CONTEXT = {
    "LOCAL": {
        "category_labels": ["BigCap", "MidCap", "SmlCap"],
        "category_by_index_membership":
        ["000043.SH", "000044.SH", "000045.SH"],

        # which class the x-feature data comes from
        "x_feature_query_class": cls_to_str(TuShareProData),
        # features directly related to each stock
        "x_features_per_symbol": {
            "equity_basic_daily": ("fin_ind_", [
                "turnover_rate", "turnover_rate_f", "volume_ratio", "pe",
                "pe_ttm", "pb", "ps", "ps_ttm", "dv_ratio", "dv_ttm",
                "total_share", "free_share", "total_mv", "circ_mv"
            ]),
            "equity_backward_adjust_daily": ("backward_adj_", [
                "open", "high", "low", "close", "pre_close", "change",
                "pct_chg", "vol", "amount"
            ]),
            "equity_moneyflow_daily": ("moneyflow_", [
                "buy_sm_vol", "buy_sm_amount", "sell_sm_vol", "sell_sm_amount",
                "buy_md_vol", "buy_md_amount", "sell_md_vol", "sell_md_amount",
                "buy_lg_vol", "buy_lg_amount", "sell_lg_vol", "sell_lg_amount",
def category_prediction_workflow():
    WORKFLOW_CONTEXT = {
        "LOCAL": {
            "category_labels": ["BigCap", "MidCap", "SmlCap"],
            "category_by_index_membership":
            ["000043.SH", "000044.SH", "000045.SH"],
            "x_feature_from_api":
            "equity_backward_adjust_daily",
            "x_feature_columns": [
                "open", "high", "low", "close", "pre_close", "change",
                "pct_chg", "vol", "amount"
            ],
            "pred_ds_pip":
            "lambda ds: ds.batch(10)",
            "y_start_t":
            date(2019, 11, 1),
            "y_end_t":
            date(2019, 12, 1),
        }
    }
    all_context_step = {
        k: GetContextStep(k)
        for k in WORKFLOW_CONTEXT["LOCAL"].keys()
    }
    for k, v in all_context_step.items():
        v.SET_CONTEXT(WORKFLOW_CONTEXT)

    kv_convert_step = KeyValueListToMappingStep(
        _input_steps=[(all_context_step["category_labels"], "key_list"),
                      (all_context_step["category_by_index_membership"],
                       "value_list")])

    test_index_membership_ts_step = SymbolTSStep(
        api="index_weight",
        cols=["con_code"],
        _input_steps=[(kv_convert_step, "symbols"),
                      (all_context_step["y_start_t"], "start_t"),
                      (all_context_step["y_end_t"], "end_t")])

    time_align_step = SymbolTSStep(api="index_quotation_daily",
                                   symbols="000001.SH",
                                   cols=["close"])

    test_concat_df_step = FuncStrStep(func_obj_str=cls_to_str(dfs_concat),
                                      _input_steps=[
                                          (test_index_membership_ts_step,
                                           "ts_process")
                                      ])

    x_orig_data_step = SymbolTSStep(
        _input_steps=[(all_context_step["x_feature_from_api"],
                       "api"), (all_context_step["x_feature_columns"],
                                "cols")])

    x_feature_callable_step = TSPeriodTSByLookbackStep(
        _input_steps=[(x_orig_data_step,
                       "period_ts_callable"), (time_align_step, "time_align")])

    pred_ds_with_pip = TSCategoryDatasetPreparingStep(
        export_symbol_in_ds=True,
        export_t_in_ds=True,
        _input_steps=[
            x_feature_callable_step, test_concat_df_step,
            (all_context_step["category_labels"], "category_labels"),
            (all_context_step["pred_ds_pip"], "ds_pip")
        ])

    # save the test ds pip
    # NOTE: only the prediction_ds workflow needs to be saved here; the model can be defined directly in the run env
    from gs_research_workflow.samples import workflow_cfg
    sample_file_path = os.path.join(os.path.dirname(workflow_cfg.__file__),
                                    "category_prediction_ds_workflow_v1.yml")
    print(sample_file_path)
    save_mapping_to_file_or_stream(sample_file_path,
                                   pred_ds_with_pip.get_init_value_dict(True),
                                   WORKFLOW_CONTEXT)

    # The following is used for verification
    # # UUID="808CCED2DF57AE1BC7030C9B57F9A23A" for debug-73
    model_inst_path = ModelPathGeneratorStep(
        InceptionTime.__name__, "F24E10E3C3C556FC3FDC0C4B18EFA3C5")
    model_with_weight_step = ModelWithWeightSaveLoadStep(
        _input_steps=[model_inst_path])
    df = model_with_weight_step.predict(
        pred_ds_with_pip.tf_ds,
        y_true_col_index=1,
        additional_cols=[
            AdditionalColumnInDS(2, "symbol", TFDSSpecDataCodingType.utf8_str),
            AdditionalColumnInDS(3, "t", TFDSSpecDataCodingType.pd_timestamp)
        ])

    print(df)
Example #20
# Applicable when using functions to build a pipeline
GlobalGSStepMapping.register(
    FuncStrStep,
    FuncStrStep,
    rule_name="single_ret_pip",
    diff_name={FuncStrStep.func_result: FuncStrStep.single_input})

GlobalGSStepMapping.register(
    FuncStrStep,
    FuncStrStep,
    rule_name="args_ret_pip",
    diff_name={FuncStrStep.func_result: FuncStrStep.args})

GlobalGSStepMapping.register(
    FuncStrStep,
    FuncStrStep,
    rule_name="kwargs_ret_pip",
    diff_name={FuncStrStep.func_result: FuncStrStep.kwargs})

reg_fields_from_local_step(FuncStrStep)
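The three registrations above map the upstream step's func_result output onto the downstream step's single_input, args, or kwargs field, so FuncStrSteps can be chained. A hedged sketch of what "single_ret_pip" chaining might look like, inferred from the (step, rule_name) _input_steps pattern used elsewhere in these examples:

# step_a = FuncStrStep(func_body="lambda x: x + 1")
# step_b = FuncStrStep(func_body="lambda x: x * 2",
#                      _input_steps=[(step_a, "single_ret_pip")])
# step_b.func_result then behaves like (lambda x: x * 2)((lambda x: x + 1)(...))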

if __name__ == "__main__":

    def print_x(x):
        print(x)

    # f_step = FuncStrStep(func_body="lambda x: print(x)")
    f_step = FuncStrStep(func_obj_str=cls_to_str(print_x))
    f_step.func("abc")