Example #1
def sync_sw_industry_index(start_t: date, end_t: date, force_reinit: bool = False):
    tushare = TuShareProData(use_l3_cache=False)
    req_freq_controller = TushareReqSleepController(tushare)
    f_equity_quotation_daily = functools.lru_cache(maxsize=2)(tushare.cs_equity_quotation_daily)
    f_equity_basic_daily = functools.lru_cache(maxsize=2)(functools.partial(tushare.cs_equity_basic_daily,
                                                                            cols=["turnover_rate", "turnover_rate_f",
                                                                                  "volume_ratio", "pe", "pe_ttm", "pb",
                                                                                  "ps", "ps_ttm", "dv_ratio", "dv_ttm",
                                                                                  "total_share", "float_share",
                                                                                  "free_share", "total_mv", "circ_mv"]))

    df_sw_index = tushare.index_classify(src="SW")
    df_sw_index = df_sw_index.reset_index(drop=True)
    ls_df_index_equities = []
    for idx_num, row in df_sw_index.iterrows():
        lv = row["level"]
        ind_name = row["industry_name"]
        ind_code = row["index_code"]
        print(
            f"\r[sync_sw_industry_index {idx_num}] Calc {lv} industry '{ind_code}'({ind_name}) index data, reqs {tushare.query_orig_source_count}",
            end="")

        req_freq_controller.begin_internal_check()
        df_ind_index = _calc_sw_industry_data(tushare, ind_code, start_t, end_t, f_equity_quotation_daily,
                                              f_equity_basic_daily)
        req_freq_controller.end_internal_check()
        if df_ind_index is not None:
            tushare.ts_upsert_arctic_storage(TuShareProData.DERIVED_TS_INDUSTRY_INDEX, ind_code, df_ind_index,
                                             force_reinit=force_reinit)
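
# Standalone sketch (illustrative only, not part of the original code): the pattern above
# wraps a bound data-fetching method with functools.lru_cache(maxsize=2) so that repeated
# cross-sectional queries for the same (start, end) window reuse the previous result instead
# of hitting the data source again. _FakeSource below is a stand-in for TuShareProData.
import functools
from datetime import date


class _FakeSource:
    def __init__(self):
        self.calls = 0

    def cs_equity_basic_daily(self, start: date, end: date, cols=None):
        self.calls += 1  # count how often the "expensive" query actually runs
        return f"cross-section {start}..{end}, cols={cols}"


_src = _FakeSource()
_f_basic = functools.lru_cache(maxsize=2)(
    functools.partial(_src.cs_equity_basic_daily, cols=("pe", "pb")))

_f_basic(start=date(2020, 1, 2), end=date(2020, 1, 2))
_f_basic(start=date(2020, 1, 2), end=date(2020, 1, 2))  # identical call, served from the cache
assert _src.calls == 1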
Example #2
 def __init__(self, tushare_pro: TuShareProData):
     dict_all_symbol = tushare_pro.stock_basic(
         exchange="SSE").set_index("ts_code").to_dict("index")
     dict_all_symbol.update(
         tushare_pro.stock_basic(
             exchange="SZSE").set_index("ts_code").to_dict("index"))
     self._dict_all_symbols: Dict[str, Dict] = dict_all_symbol
     """ symbol 的详细信息内容"""
Example #3
class SampleTushareDataGenerator:
    # The following are hyperparameters and can take part in tuning.
    lookback = 120
    predict_count = 5  # number of future periods to predict
    feature = ["close", "change", "vol", "amount"]
    target_column = "close"

    # The following generally do not take part in tuning.
    x_symbols = ["600000.SH", "600028.SH",
                 "600050.SH"]  # for illustration; usually the stocks of one sector. The symbol order is tightly coupled to the model.
    y_symbol = "000001.SH"  # for now, assume that what this class predicts is always a stock

    def __init__(self, start_t: date, end_t: date):
        self.start_t = start_t

        # Reading data from pkl is handled by another class.
        # The cache file path will be hashed from start_t, end_t, and FEATURES.
        # For now the save path is keyed by start/end; later switch to a single pickle holding all the data.
        self.tushare = TuShareProData()
        df_all_x = [
            self.tushare.equity_quotation_daily(x,
                                                start=start_t,
                                                end=end_t,
                                                cols=self.feature)
            for x in self.x_symbols
        ]
        df_y = self.tushare.index_quotation_daily(self.y_symbol,
                                                  start=start_t,
                                                  end=end_t,
                                                  cols=[self.target_column])
        self.df_all_data = df_y
        for x_symbol, x_df in zip(self.x_symbols, df_all_x):
            self.df_all_data = self.df_all_data.join(x_df,
                                                     rsuffix=f"_{x_symbol}")
        self.df_all_data = self.df_all_data.ffill()  # forward-fill gaps; fillna(method="ffill") is deprecated in recent pandas

    def x_shape(self) -> tf.TensorShape:
        return tf.TensorShape(
            [self.lookback,
             len(self.x_symbols) * len(self.feature)]).as_list()

    def __call__(self):

        for i in range(
                len(self.df_all_data) - self.lookback - self.predict_count -
                1):
            yield (
                self.df_all_data.iloc[i:i + self.lookback,
                                      1:].to_numpy(),  # shape: [lookback, len(x_symbols) * len(feature)]
                self.df_all_data[self.df_all_data.columns[0]]
                [i:i + self.predict_count].to_numpy())
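
# Usage sketch (an assumption about intended wiring, not code from the original file): the
# generator above can be handed to tf.data.Dataset.from_generator and then batched for
# training. The joined DataFrame columns are float64, so float64 specs are declared here;
# cast to float32 in a later map step if the model prefers it.
import tensorflow as tf
from datetime import date

gen = SampleTushareDataGenerator(date(2018, 1, 1), date(2019, 12, 31))
ds = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=gen.x_shape(), dtype=tf.float64),         # x window
        tf.TensorSpec(shape=[gen.predict_count], dtype=tf.float64)))  # y targets
ds = ds.shuffle(256).batch(32)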
Example #4
def upsert_chn_stock_info_in_yahoo():
    set_http_proxy()
    tushare = TuShareProData()
    df = tushare.stock_basic(exchange="SSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"].replace("SH", "SS")  # YAHOO 上海股市的后缀是 SS
        logger.info(f"[{idx}] {symbol}({row['name']}) symbol data in yahoo")
        upsert_yahoo_financial_instrument_info(symbol, True)

    df = tushare.stock_basic(exchange="SZSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"]  # 深圳股市的后缀是相同的
        logger.info(f"[{idx}] {symbol}({row['name']}) symbol data in yahoo")
        upsert_yahoo_financial_instrument_info(symbol, True)
 def get_sdk(cls, wrapper_def: TSSDKWrapper, use_l3_cache: bool) -> ArcticAndLocalCacheBySymbol:
     cache_key = f"{wrapper_def.data_source}-{wrapper_def.data_source_auth}-{use_l3_cache}"
     if cache_key in cls._all_sdk_wrapper:
         return cls._all_sdk_wrapper[cache_key]
     if wrapper_def.data_source == "tushare_pro":
         from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
         if wrapper_def.data_source_auth is None:
             _sdk = TuShareProData(use_l3_cache=use_l3_cache)
         else:
             _sdk = TuShareProData(wrapper_def.data_source_auth, use_l3_cache=use_l3_cache)
         cls._all_sdk_wrapper[cache_key] = _sdk
         return _sdk
     else:
         raise NotImplementedError
Example #6
def upsert_chn_stock_name():
    tushare = TuShareProData()
    df = tushare.stock_basic(exchange="SSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"].replace("SH", "SS")  # YAHOO 上海股市的后缀是 SS
        symbol_obj = FinancialInstrumentSymbol(symbol=symbol,
                                               full_name=row['name'])
        upsert_document(symbol_obj, False)

    df = tushare.stock_basic(exchange="SZSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"]  # 深圳股市的后缀是相同的
        symbol_obj = FinancialInstrumentSymbol(symbol=symbol,
                                               full_name=row['name'])
        upsert_document(symbol_obj, False)
Example #7
def run_category_prediction(model_name: str, model_inst_gid: str, pred_ds_cfg_path: str, pred_name: str) -> pd.DataFrame:
    """ load model 并且执行 Prediction 的操作 """
    assert os.path.isfile(pred_ds_cfg_path), logger.error(f" cfg file {pred_ds_cfg_path} is not existed!")
    pred_ds_workflow_cfg, pred_ds_workflow_context = load_mapping_from_file(pred_ds_cfg_path)
    pred_ds = create_step_by_dict(pred_ds_workflow_cfg, pred_ds_workflow_context)

    model_inst_path = ModelPathGeneratorStep(model_name, model_inst_gid)
    model_with_weight_step = ModelWithWeightSaveLoadStep(_input_steps=[model_inst_path])
    # NOTE: the predict input parameters are hard-coded for now; later consider moving them into the prediction workflow's other parameter items.
    df = model_with_weight_step.predict(pred_ds.tf_ds, y_true_col_index=1,
                                        additional_cols=[
                                            AdditionalColumnInDS(2, "symbol", TFDSSpecDataCodingType.utf8_str),
                                            AdditionalColumnInDS(3, "t", TFDSSpecDataCodingType.pd_timestamp)])

    # append the extra auxiliary columns
    from gs_research_workflow.common.path_utilities import _is_colab_env
    tushare = TuShareProData(use_l3_cache=_is_colab_env())
    symbol_info_lookup = TushareSymbolToName(tushare)
    df["symbol_name"] = df.apply(partial(symbol_info_lookup, "name", "symbol"), axis=1)

    cat_label_mapping = CategoryIntToLabel(pred_ds)
    df["y_true_label"] = df.apply(partial(cat_label_mapping, "y_true"), axis=1)
    df["y_pred_label"] = df.apply(partial(cat_label_mapping, "y_pred"), axis=1)

    csv_path = os.path.join(get_prediction_output_path(model_name, pred_name),
                            datetime.now().strftime("%Y%m%d_%H%M%S") + ".csv")

    df.to_csv(csv_path, header=True, index=True, encoding="utf-8-sig", quotechar="\"")
    return df
Example #8
    def prepare_index_portfolio_data():
        import tushare as ts
        import time
        import yaml

        ts_wrapper = TuShareProData(use_l3_cache=True)

        # Markets that can be used: "SSE" / "SZSE" / "CSI"

        # Markets without portfolio info: "CICC" / "MSCI" / "SW" / "OTH"

        ls_index_has_weight_data = []
        mkt_code = "OTH"
        dict_valid_index_code: Dict[str, Tuple[str, int]] = {}  # key: symbol code, value: (index_name, portfolio_count)
        yml_file_path = os.path.join("/tmp", f"{mkt_code}_index_member.yml")
        if os.path.isfile(yml_file_path):
            with open(yml_file_path, "r") as yaml_file:
                dict_valid_index_code = yaml.load(yaml_file, Loader=yaml.Loader)  # explicit Loader; values contain tuples, so safe_load would not work

        df_all_index = ts_wrapper.index_basic(market=mkt_code)
        print(f"{len(df_all_index)} indexes to query portfolio")
        for id_num, row in df_all_index.iterrows():
            idx_code = row["ts_code"]
            if idx_code in dict_valid_index_code:
                continue

            df_index_member = ts_wrapper.index_weight(symbol=idx_code)
            if len(df_index_member) > 0:
                ls_index_has_weight_data.append(idx_code)
                print(
                    f"index[{id_num}]:{idx_code}-{row['name']}-{len(df_index_member)}"
                )
            else:
                if (id_num - len(ls_index_has_weight_data) + 1) % 5 == 0:
                    print(
                        f"sleep for empty data {id_num} , count {id_num - len(ls_index_has_weight_data) + 1}"
                    )
                    time.sleep(5)
            dict_valid_index_code[idx_code] = (row['name'],
                                               len(df_index_member))
            # keep the code simple for now: dump the data on every iteration
            with open(yml_file_path, "w") as yaml_file:
                yaml.dump(dict_valid_index_code, yaml_file)

        print(f"total have index {len(ls_index_has_weight_data)}")
        print(ls_index_has_weight_data)
Example #9
    def cs_financial_statement_model_evaluate():
        from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS
        from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep

        pd.set_option('display.max_columns', None)  # show all columns
        pd.set_option('display.max_rows', None)  # show all rows
        pd.set_option('display.max_colwidth', 80)  # widen the column display (default is 50)

        # stks = ChnEquityInputStep()
        # tf_ds_step = FinancialStatementCSMaskedTFDatasetStep(df_equities=stks.train_items,
        #                                                      ds_pip="lambda ds: ds.repeat().batch(20)")
        # tf_ds_step._ds_generator_call()
        # for ele in tf_ds_step.tf_ds.take(10):
        #     print(ele)
            # y = model(ele[0])
            # loss = gs_mean_absolute_error(ele[1], y)
            # print(loss)
        symbol = "600315.SH"
        tushare = TuShareProData(use_l3_cache=True)

        df_zscore = equity_all_financial_statement_zscore(tushare, symbol)
        comp_type = equity_comp_type(tushare, symbol)

        df_y_for_pred = df_zscore.iloc[-20:][:]
        df_y_true_original = equity_all_financial_statement_by_enddate(tushare, symbol)[-20:][:]
        input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
            df_y_for_pred, comp_type, False, True, False)
        # load model
        # model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
        #                        model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=6, num_hidden_layers=10))
        model_hp = TFModelStep(model_cls_str=cls_to_str(TSBertForMaskedCS),
                               model_hp=TSBertForMaskedCS.HP(hidden_size=276, num_attention_heads=12))

        checkpoint_path = model_hp.check_point_path
        model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)
        # add batch axis
        y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :], token_id[tf.newaxis, :],
                        attention_mask_id[tf.newaxis, :]))
        np_y_pred = y_pred[0].numpy()[0]  # drop the batch axis
        np_y_pred = np_y_pred[1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE row and the padded dates
        df_y_pred = pd.DataFrame(data=np_y_pred, index=df_y_for_pred.index, columns=df_y_for_pred.columns)

        # de-zscore back to the original values
        df_mean, df_std = equity_all_financial_statement_mean_and_std(tushare, symbol)
        df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)
        # df_y_pred_orig_val = (df_y_for_pred/df_y_for_pred) *df_y_pred_orig_val
        delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
        delta_percentage = (df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]

        # print(f"y_true:{df_y_true_original.iloc[-1]}")
        # print(f"y_pred:{df_y_pred_orig_val.iloc[-1]}")
        # print(f"delta_v:{delta_v}")
        print(f"delta_percentage:{delta_percentage.dropna().sort_values(ascending=True)}")
def equity_comp_type(tushare_sdk: TuShareProData, symbol: str) -> int:
    """获取公司的类型,  1一般工商业 2银行 3保险 4证券 """
    df_income = tushare_sdk.income(symbol=symbol, report_type=1)
    comp_type = df_income["comp_type"].unique()
    rlt = 1
    try:
        rlt = int(comp_type[0])
    except (IndexError, TypeError, ValueError):
        pass  # fall back to the default type when comp_type is missing or not numeric
    return rlt
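
# Illustrative sketch (an assumption about the utility's semantics, not its actual code):
# de_zscore_to_val, used in the evaluation above, reverses a column-wise z-score transform,
# i.e. value = z * std + mean with per-column mean and std.
import pandas as pd

def de_zscore_sketch(df_z: pd.DataFrame, mean: pd.Series, std: pd.Series) -> pd.DataFrame:
    """Map z-scored values back to the original scale, column by column."""
    return df_z * std + mean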
Example #11
def sync_equity_financial_statement(mkt_code: str = "SSE"):
    """上市公司的财务数据等"""
    os.environ["local_cache_expire_hours"] = "24"

    ts_wrapper = TuShareProData(use_l3_cache=False)
    df_stks = ts_wrapper.stock_basic(exchange=mkt_code, cols=["ts_code", "name"])
    req_freq_controller = TushareReqSleepController(ts_wrapper)

    for id_num, row in df_stks.iterrows():
        print(f"\rsync_equity_financial_statement {id_num} : {row['ts_code']} - {row['name']} , total reqs {ts_wrapper.query_orig_source_count} ", end="")
        symbol = row["ts_code"]
        req_freq_controller.begin_internal_check()
        ts_wrapper.income_by_enddate(symbol=symbol, to_single_period_val=True)
        ts_wrapper.balancesheet_by_enddate(symbol=symbol)
        ts_wrapper.cashflow_by_enddate(symbol=symbol, to_single_period_val=True)
        req_freq_controller.end_internal_check()
Example #12
def sync_equity_to_sw_industry(sw_industry_lv: str, start_t: date, end_t: date, force_reinit: bool = False):
    lv_to_lib_name = {"L1": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L1,
                      "L2": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L2,
                      "L3": TuShareProData.DERIVED_TS_EQUITY_SW_INDUSTRY_L3}
    assert sw_industry_lv in lv_to_lib_name.keys()

    tushare = TuShareProData(use_l3_cache=False)
    req_freq_controller = TushareReqSleepController(tushare)

    df_sw_index = tushare.index_classify(level=sw_industry_lv, src="SW")
    df_sw_index = df_sw_index.reset_index(drop=True)
    ls_df_index_equities = []
    for idx_num, row in df_sw_index.iterrows():
        lv = row["level"]
        ind_name = row["industry_name"]
        ind_code = row["index_code"]

        req_freq_controller.begin_internal_check()
        df_index_equities = tushare.period_index_member(index_code=ind_code, resample_freq="B", start=start_t,
                                                        end=end_t)
        req_freq_controller.end_internal_check()

        df_index_equities["index_code"] = ind_code
        ls_df_index_equities.append(df_index_equities)

    df_all_index_equities = pd.concat(ls_df_index_equities)
    for i, symbol in enumerate(df_all_index_equities["symbol"].unique()):
        df_symbol_in_industry = df_all_index_equities[df_all_index_equities["symbol"] == symbol]
        df_symbol_in_industry = df_symbol_in_industry.drop(columns=["symbol"])  # avoid modifying a filtered slice in place
        print(f"\rsync_equity_to_sw_industry {i}:{symbol}-{df_symbol_in_industry.index.min()}-{df_symbol_in_industry.index.max()} ", end="")
        tushare.ts_upsert_arctic_storage(lv_to_lib_name[sw_industry_lv], symbol, df_symbol_in_industry,
                                         force_reinit=force_reinit)
def equity_all_financial_statement_by_enddate(
        tushare_sdk: TuShareProData,
        symbol: str,
        start_end_period: Tuple[Optional[date], Optional[date]] = (date(2008, 1, 1),
                                                                   date(2019, 12, 31)),
        to_single_period: bool = True) -> pd.DataFrame:
    df_income = tushare_sdk.income_by_enddate(
        symbol=symbol, to_single_period_val=to_single_period)
    df_balance_sheet = tushare_sdk.balancesheet_by_enddate(symbol=symbol)
    df_cashflow = tushare_sdk.cashflow_by_enddate(
        symbol=symbol, to_single_period_val=to_single_period)
    df_all = df_income.join(df_balance_sheet,
                            how="left",
                            lsuffix="_inc",
                            rsuffix="_bs").join(df_cashflow,
                                                how="left",
                                                rsuffix="_cf")
    start, end = start_end_period
    df_all = _filter_df_by_start_end(df_all, start, end)
    return df_all
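
# Minimal sketch (hypothetical; the project's real _filter_df_by_start_end is not shown in
# these examples) of the filtering helper called above, assuming df is indexed by the
# statement end date.
from datetime import date
from typing import Optional
import pandas as pd

def _filter_df_by_start_end_sketch(df: pd.DataFrame,
                                   start: Optional[date],
                                   end: Optional[date]) -> pd.DataFrame:
    """Keep only the rows whose datetime index falls inside [start, end]."""
    if start is not None:
        df = df[df.index >= pd.Timestamp(start)]
    if end is not None:
        df = df[df.index <= pd.Timestamp(end)]
    return df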
Example #14
    def __init__(self, start_t: date, end_t: date):
        self.start_t = start_t

        # Reading data from pkl is handled by another class.
        # The cache file path will be hashed from start_t, end_t, and FEATURES.
        # For now the save path is keyed by start/end; later switch to a single pickle holding all the data.
        self.tushare = TuShareProData()
        df_all_x = [
            self.tushare.equity_quotation_daily(x,
                                                start=start_t,
                                                end=end_t,
                                                cols=self.feature)
            for x in self.x_symbols
        ]
        df_y = self.tushare.index_quotation_daily(self.y_symbol,
                                                  start=start_t,
                                                  end=end_t,
                                                  cols=[self.target_column])
        self.df_all_data = df_y
        for x_symbol, x_df in zip(self.x_symbols, df_all_x):
            self.df_all_data = self.df_all_data.join(x_df,
                                                     rsuffix=f"_{x_symbol}")
        self.df_all_data = self.df_all_data.ffill()  # forward-fill gaps; fillna(method="ffill") is deprecated in recent pandas
Example #15
def sync_equity_cs_daily(start_t: date):
    os.environ["local_cache_expire_hours"] = "12"
    tushare = TuShareProData(use_l3_cache=False)
    df_index = tushare.index_quotation_daily("000001.SH", start=start_t, cols=["close"])
    req_freq_controller = TushareReqSleepController(tushare)

    for idx_date, row in df_index.iterrows():
        cs_t = date(idx_date.year, idx_date.month, idx_date.day)
        # for now, fetch only the two cross-sectional datasets for this date
        print(f"\rEquity CS data @{cs_t} , total reqs {tushare.query_orig_source_count} ", end="")

        req_freq_controller.begin_internal_check()
        tushare.cs_equity_quotation_daily(start=cs_t, end=cs_t)
        tushare.cs_equity_basic_daily(start=cs_t, end=cs_t)
        req_freq_controller.end_internal_check()
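
# Hypothetical sketch of a request throttler with the same two-call interface as
# TushareReqSleepController (the real class is not shown in these examples). It measures how
# many upstream requests ran between begin_internal_check() and end_internal_check() via the
# sdk's query_orig_source_count counter and backs off when a per-minute budget is exceeded.
import time

class SimpleReqSleepController:
    def __init__(self, sdk, max_reqs_per_minute: int = 300):
        self._sdk = sdk
        self._budget = max_reqs_per_minute
        self._count_at_begin = 0

    def begin_internal_check(self) -> None:
        self._count_at_begin = self._sdk.query_orig_source_count

    def end_internal_check(self) -> None:
        used = self._sdk.query_orig_source_count - self._count_at_begin
        if used >= self._budget:
            time.sleep(60.0)  # crude back-off so the next iteration stays under the limit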
Example #16
def _calc_sw_industry_data(tushare: TuShareProData, sw_industry_code: str, start_t: date, end_t: date,
                           f_equity_quotation_daily=None, f_equity_basic_daily=None) -> Optional[pd.DataFrame]:
    # industry index constituents on each date
    df_stocks_in_industry = tushare.period_index_member(sw_industry_code, start=start_t, end=end_t,
                                                        resample_freq="B")
    if df_stocks_in_industry is None:  # no constituents in the period; return immediately
        return None
    df_stocks_in_industry = df_stocks_in_industry.reset_index().set_index(keys=["date", "symbol"])

    if f_equity_quotation_daily is None:
        f_equity_quotation_daily = tushare.cs_equity_quotation_daily
    if f_equity_basic_daily is None:
        f_equity_basic_daily = functools.partial(tushare.cs_equity_basic_daily,
                                                 cols=["turnover_rate", "turnover_rate_f",
                                                       "volume_ratio", "pe", "pe_ttm", "pb",
                                                       "ps", "ps_ttm", "dv_ratio", "dv_ttm",
                                                       "total_share", "float_share",
                                                       "free_share", "total_mv", "circ_mv"])
    # daily indicators
    mv_weighted_cols = ["open", "high", "low", "close", "pre_close", "change", "pct_chg", "turnover_rate",
                        "turnover_rate_f", "volume_ratio", "pe", "pe_ttm", "pb", "ps", "ps_ttm", "dv_ratio",
                        "dv_ttm"]  # columns weighted by total market cap
    # sum_cols = ["vol", "amount", "total_share", "float_share", "free_share", "total_mv", "circ_mv"]
    # fetch per-stock data over the period
    df_cs_daily_industry_membership = df_stocks_in_industry.join(
        f_equity_quotation_daily(start=start_t, end=end_t), how="left").join(
        f_equity_basic_daily(start=start_t, end=end_t), how="left")
    # aggregate the market-cap-weighted columns
    for col in mv_weighted_cols:
        df_cs_daily_industry_membership[f"_{col}"] = df_cs_daily_industry_membership[col] * \
                                                     df_cs_daily_industry_membership["total_mv"]
    df_cs_daily_industry_membership.drop(columns=mv_weighted_cols, inplace=True)
    df_cs_daily_industry_membership.rename(columns={f"_{col}": col for col in mv_weighted_cols}, inplace=True)

    df_cs_daily_industry_membership = df_cs_daily_industry_membership.reset_index().groupby("date").sum()
    for col in mv_weighted_cols:
        df_cs_daily_industry_membership[f"_{col}"] = df_cs_daily_industry_membership[col] / \
                                                     df_cs_daily_industry_membership["total_mv"]
    df_cs_daily_industry_membership.drop(columns=mv_weighted_cols, inplace=True)
    df_cs_daily_industry_membership.rename(columns={f"_{col}": col for col in mv_weighted_cols}, inplace=True)
    return df_cs_daily_industry_membership
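
# Toy check of the market-cap weighting performed above (standalone, made-up numbers): each
# weighted column is multiplied by total_mv per stock, summed per date, then divided by the
# per-date sum of total_mv, which yields a value-weighted average.
import pandas as pd

_df = pd.DataFrame({
    "date": ["2020-01-02", "2020-01-02"],
    "symbol": ["A", "B"],
    "close": [10.0, 20.0],
    "total_mv": [100.0, 300.0],
})
_df["_close"] = _df["close"] * _df["total_mv"]
_agg = _df.groupby("date")[["_close", "total_mv"]].sum()
_agg["close"] = _agg["_close"] / _agg["total_mv"]
print(_agg["close"])  # 2020-01-02 -> (10 * 100 + 20 * 300) / 400 = 17.5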
Example #17
    def prepare_fund_portfolio_data():
        import tushare as ts
        import yaml
        import time
        ts_pro = ts.pro_api(
            "8fe0d951588bf9b605de2cdce4a7b35a61c79ed3c6e128302dcca142")
        ts_wrapper = TuShareProData(use_l3_cache=True)

        # print(ts_wrapper.fund_nav("001753.OF", re_init=True))

        # print(ts_wrapper.cs_fund_nav(end=date(2020, 1, 23), look_period=1))

        # print(ts_pro.fund_nav(ts_code="001753.OF"))  # cumulative NAV of public funds; usable as the by-value part of x_feature
        # print(ts_pro.fund_portfolio(ts_code="008140.OF"))  # fund holdings, used as the membership data
        # df = ts_pro.fund_basic(market="O")

        # --- list of the open-end funds of interest ----
        # df = ts_wrapper.fund_basic(market="E")
        # df = df[~df["invest_type"].str.contains("货币型|黄金现货|债券型|原油主题|期货型")]
        # print(df["invest_type"].unique())
        # print(df)

        # print(ts_wrapper.fund_portfolio("515070.SH"))
        # -------

        # ---- iterate over the funds that have portfolio weight data ------
        ls_fund_has_weight_data = []
        mkt_code = "Fund_E"
        dict_valid_fund_code: Dict[str, Tuple[str, int]] = {}  # key: symbol code, value: (fund_name, portfolio_count)
        yml_file_path = os.path.join("/tmp", f"{mkt_code}_portfolio.yml")
        if os.path.isfile(yml_file_path):
            with open(yml_file_path, "r") as yaml_file:
                dict_valid_fund_code = yaml.load(yaml_file, Loader=yaml.Loader)  # explicit Loader; values contain tuples, so safe_load would not work

        df_all_fund = ts_wrapper.fund_basic(market="E")
        df_valid_fund = df_all_fund[~df_all_fund["invest_type"].str.
                                    contains("货币型|黄金现货|债券型|原油主题|期货型")]

        print(f"{len(df_valid_fund)} funds to query portfolio")
        for id_num, row in df_valid_fund.iterrows():
            fund_code = row["ts_code"]
            if fund_code in dict_valid_fund_code:
                continue

            df_fund_portfolio = ts_wrapper.fund_portfolio(symbol=fund_code)
            if len(df_fund_portfolio) > 0:
                ls_fund_has_weight_data.append(fund_code)
                print(
                    f"fund [{id_num}]:{fund_code}-{row['name']}-{len(df_fund_portfolio)}"
                )
            else:
                if (id_num - len(ls_fund_has_weight_data) + 1) % 5 == 0:
                    print(
                        f"sleep for empty data {id_num} , count {id_num - len(ls_fund_has_weight_data) + 1}"
                    )
                    time.sleep(5)
            dict_valid_fund_code[fund_code] = (row['name'],
                                               len(df_fund_portfolio))
            # keep the code simple for now: dump the data on every iteration
            with open(yml_file_path, "w") as yaml_file:
                yaml.dump(dict_valid_fund_code, yaml_file)
# -*- coding: UTF-8 -*-
from datetime import date

import pandas as pd
import tushare as ts

from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
from gs_research_workflow.time_series.data.utilities import val_convert_to_zscore

ts_pro = ts.pro_api("8fe0d951588bf9b605de2cdce4a7b35a61c79ed3c6e128302dcca142")
gs_ts_pro = TuShareProData(use_l3_cache=True)

# A time-aware category should have the format: (date, category, Set[symbol])
#
# Approach 1: pass only the symbols first, then randomly draw a few symbols belonging to a given category inside the loop

def test_equity_concept():
    df_j1 = gs_ts_pro.cs_equity_quotation_daily(start=None, end=date(2019, 12, 31), look_period=10)
    df_j2 = gs_ts_pro.cs_equity_adj_factor(start=None, end=date(2019, 12, 31), look_period=10)
    dfI = gs_ts_pro.cs_equity_basic_daily(start=None, end=date(2019, 12, 31), look_period=10, cols=["pe"])
    # df = gs_ts_pro.cs_equity_moneyflow(start=None, end=date(2019, 12, 31), look_period=10)
    # df = gs_ts_pro.cs_equity_margin_detail(start=None, end=date(2019, 12, 31), look_period=10)
    # df = gs_ts_pro.cs_equity_block_trade(start=None, end=date(2019, 12, 31), look_period=10)
    # df = gs_ts_pro.cs_equity_top_inst(start=None, end=date(2019, 12, 31), look_period=10)
    # print(df_j1)
    # print(df_j2)
    dfI = dfI[(dfI["pe"] < 20.0) & (dfI["pe"] > 10.0)]
    print(dfI)
    df_j = pd.DataFrame({"adj_open": df_j1["open"] * df_j2["adj_factor"],
                         "adj_close": df_j1["close"] * df_j2["adj_factor"]
Example #19
 def __post_init__(self):
     # todo: init tushare query object
     self._tushare = TuShareProData()
Example #20
def for_notebook_eval_cs_financial_statement_mask():
    from gs_research_workflow.time_series.models.ts_bert import TSBertForMaskedCS, TSBertName
    from gs_research_workflow.time_series.gs_steps.model_steps import TFModelStep
    from gs_research_workflow.common.serialization_utilities import cls_to_str
    from gs_research_workflow.time_series.data.utilities import de_zscore_to_val
    from gs_research_workflow.common.path_utilities import _DATA_ROOT
    import os
    import sys

    PRINT_HIGHLIGHT_STYLE = "\033[1;37;41m"
    #  ---------- For different content, only the parameter items in this block need changing  ---------
    model_hp = TFModelStep(
        model_cls_str=cls_to_str(TSBertForMaskedCS),
        model_hp=TSBertForMaskedCS.HP(
            name=TSBertName.CHN_EQUITY_FINANCIAL_STATEMENT,
            hidden_size=276,
            num_attention_heads=12)
    )  # for the model hp, only num_attention_heads [6, 12] and num_hidden_layers [8, 12, 16, 20] may be changed here
    # ---------------------------------------------------------

    checkpoint_path = os.path.join(
        _DATA_ROOT, "ModelData", model_hp.model_cls.__name__,
        model_hp.model_init_hp.get_hash_str(
        ))  # do not call TFModelStep.check_point_path() here; it would create the directory
    if not os.path.isdir(checkpoint_path):
        print(
            PRINT_HIGHLIGHT_STYLE,
            f"model path '{checkpoint_path}' does not exist! please check the model hyper-parameters"
        )
        raise RuntimeError(
            f"model path '{checkpoint_path}' does not exist! please check the model hyper-parameters"
        )
    checkpoint_file = os.path.join(checkpoint_path, "tf_model.h5")
    if not os.path.exists(checkpoint_file):
        print(PRINT_HIGHLIGHT_STYLE,
              f"model weight file '{checkpoint_file}' does not exist")
        raise RuntimeError(
            f"model weight file '{checkpoint_file}' does not exist")
    model = TSBertForMaskedCS.from_pre_saved(checkpoint_path)

    # -------------------------------------------------

    # If you only want to switch the stock (not the model), only this cell needs adjusting.
    symbol = "600315.SH"  # the stock to predict

    # -------------------------------------------------

    # This part needs no modification; just re-run it after the parameters above have been changed.
    # Prepare the data used for display.

    import pandas as pd
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData
    from gs_research_workflow.time_series.data.predefined_equity_apis import equity_all_financial_statement_zscore, \
        equity_comp_type, equity_all_financial_statement_mean_and_std, equity_all_financial_statement_by_enddate
    from gs_research_workflow.time_series.gs_steps.tf_ds_for_financial_statement import \
        FinancialStatementCSMaskedTFDatasetStep
    import tensorflow as tf

    pd.set_option('display.max_columns', None)  # show all columns
    pd.set_option('display.max_rows', None)  # show all rows
    pd.set_option('display.max_colwidth', 80)

    tushare = TuShareProData(use_l3_cache=True)

    df_zscore, series_mean, series_std = equity_all_financial_statement_zscore(
        tushare, symbol, ret_mean_and_std=True)
    comp_type = equity_comp_type(tushare, symbol)

    df_y_for_pred = df_zscore.iloc[-20:][:]  # for now, only the last period of already-published data is predicted
    df_y_true_original = equity_all_financial_statement_by_enddate(
        tushare, symbol)[-20:][:]
    input_ids, position_id, token_id, attention_mask_id = FinancialStatementCSMaskedTFDatasetStep.df_to_model_input(
        df_y_for_pred, comp_type, series_std * 100., False, True, False)

    y_pred = model((input_ids[tf.newaxis, :], position_id[tf.newaxis, :],
                    token_id[tf.newaxis, :],
                    attention_mask_id[tf.newaxis, :]))  # add batch axis
    np_y_pred = y_pred[0].numpy()[0]  # drop the batch axis
    np_y_pred = np_y_pred[
        1:, 0:df_y_for_pred.shape[1]]  # drop the COMP_TYPE row and the padded dates
    df_y_pred = pd.DataFrame(data=np_y_pred,
                             index=df_y_for_pred.index,
                             columns=df_y_for_pred.columns)

    # de-zscore back to the original values
    df_mean, df_std = equity_all_financial_statement_mean_and_std(
        tushare, symbol)
    df_y_pred_orig_val = de_zscore_to_val(df_y_pred, df_mean, df_std)

    delta_v = df_y_true_original.iloc[-1] - df_y_pred_orig_val.iloc[-1]
    delta_percentage = (
        df_y_true_original.iloc[-1] -
        df_y_pred_orig_val.iloc[-1]) / df_y_true_original.iloc[-1]

    df_pred_summary = pd.DataFrame({
        "true_val": df_y_true_original.iloc[-1],
        "pred_val": df_y_pred_orig_val.iloc[-1]
    }).dropna()
    df_pred_summary[
        "delta_v"] = df_pred_summary["true_val"] - df_pred_summary["pred_val"]
    df_pred_summary["delta_percentage"] = (df_pred_summary["true_val"] - df_pred_summary["pred_val"]) * 100. / \
                                          df_pred_summary["true_val"]

    df_pred_zscore = pd.DataFrame({
        "true_val": df_zscore.iloc[-1],
        "pred_val": df_y_pred.iloc[-1]
    }).dropna()

    print(df_pred_summary)
Example #21
            if self._y_columns_count == 1:
                yield (self._x_df_data.iloc[i:i + self.x_loopback, :].to_numpy(),
                       self._y_df_data[self._y_df_data.columns[0]][i:i + self.y_predict_count].to_numpy())
            else:
                yield (self._x_df_data.iloc[i:i + self.x_loopback, :].to_numpy(),
                       self._y_df_data.iloc[i:i + self.y_predict_count, :].to_numpy())


if __name__ == "__main__":
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData

    tushare = TuShareProData()

    gen = TSXYGenerator(
        x_symbols=["600000.SH", "600028.SH", "600050.SH"],
        y_symbol="000001.SH",
        x_get_data_objs=[
            TSCallableData(tushare.equity_quotation_daily,
                           ["close", "change", "vol", "amount"], "{symbol}_"),
            TSCallableData(tushare.equity_basic_daily,
                           ["turnover_rate", "pe", "pb"], "{symbol}_")
        ],
        y_get_data_objs=[
            TSCallableData(tushare.index_quotation_daily, ["close"],
                           "{symbol}_")
        ],
        start_t=date(2010, 1, 1),
        for symbol, s_category in self._all_samples:
            df = self._loop_get_data_and_join(symbol, self.x_get_data_objs, self.start_t, self.end_t)
            df = self.df_time_align.join(df)
            if self.f_fill_na:
                df = self.f_fill_na(df)
            # yield df[df.columns[1:]].to_numpy(), np.int(self.category_to_num(s_category))
            yield df[df.columns[1:]].to_numpy(), np.array([int(self.category_to_num(s_category))])


if __name__ == "__main__":
    from gs_research_workflow.time_series.data.tushare_wrapper import TuShareProData, get_category_symbols, \
        get_all_market_symbols
    import tensorflow as tf
    from tensorflow import keras

    tushare = TuShareProData()
    category_vocabs = ["密集调研", "高股息", "白马股", "养老金持股", "混改", "次新股", "回购", "兜底式增持", "筹码集中", "高送转",
                   "社保重仓", "业绩预增", "股权争夺", "破发次新", "举牌股"]
    x_symbols_with_category, all_symbols_with_category = get_category_symbols(tushare, category_vocabs)
    all_stock_symbols = get_all_market_symbols(tushare)
    all_symbols_without_category = all_stock_symbols - all_symbols_with_category

    np.random.shuffle(x_symbols_with_category)
    split_ratio = 0.8
    split_pos = int(len(x_symbols_with_category) * split_ratio)
    train_set, val_set = x_symbols_with_category[:split_pos], x_symbols_with_category[split_pos:]

    start_t = date(2018, 10, 1)
    end_t = date(2019, 10, 28)

    gen_kwargs = dict(x_symbols_with_category=train_set,