def QA_ts_update_namechange():
    """
    Save the full history of stock name changes for all stocks
    """
    # Create the collection and its unique index
    coll = DATABASE.namechange
    coll.create_index(
        [
            ("code", ASCENDING),
            ("start_date_stamp", ASCENDING),
            ("end_date_stamp", ASCENDING),
            ("ann_date_stamp", ASCENDING),
        ],
        unique=True,
    )
    # Initialize the Tushare pro API
    pro = get_pro()
    # Fetch the full historical stock list
    symbol_list = sorted(
        list(set(QA_fmt_code_list(QA_fetch_stock_basic().index.tolist(), "ts"))))
    df = pd.DataFrame()
    for i, symbol in enumerate(symbol_list):
        if i % 100 == 0:
            print(f"Saving {i}th stock name, stock is {symbol}")
        try:
            df = df.append(pro.namechange(ts_code=symbol))
        except Exception:
            time.sleep(1)
            try:
                df = df.append(pro.namechange(ts_code=symbol))
            except Exception:
                raise ValueError("[ERROR]\tfailed to fetch data")
    # df.to_csv("test.csv")
    df["code"] = QA_fmt_code_list(df["ts_code"])
    df["start_date_stamp"] = df["start_date"].apply(QA_util_date_stamp)
    df["end_date_stamp"] = df["end_date"].apply(QA_util_date_stamp)
    df["ann_date_stamp"] = df["ann_date"].apply(QA_util_date_stamp)
    df = df.where(pd.notnull(df), None)
    js = QA_util_to_json_from_pandas(df.drop(columns=["ts_code"]))
    for item in js:
        if not item["end_date"]:
            # Name still in use: use a far-future sentinel stamp
            item["end_date_stamp"] = 9999999999
        qry = {
            "code": item["code"],
            "start_date_stamp": item["start_date_stamp"],
            "end_date_stamp": item["end_date_stamp"],
            "ann_date_stamp": item["ann_date_stamp"],
        }
        if coll.count_documents(qry) == 0:
            coll.insert_one(item)
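# Illustrative usage sketch (not part of the original module): assumes a valid
# Tushare token behind get_pro() and a reachable MongoDB behind DATABASE; the
# helper name `_example_refresh_namechange` is hypothetical.
def _example_refresh_namechange():
    # Refresh the name-change history, then look up one stock's historical names.
    QA_ts_update_namechange()
    return list(DATABASE.namechange.find({"code": "000001"}, {"_id": 0}))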
def QA_ts_update_stock_basic():
    """
    Save basic information of all stocks to the local database
    """
    coll = DATABASE.stock_basic
    coll.create_index(
        [("code", ASCENDING), ("status", ASCENDING), ("list_date_stamp", ASCENDING)],
        unique=True,
    )
    # Initialize the Tushare pro API
    pro = get_pro()
    # Fetch listed ("L"), paused ("P") and delisted ("D") stocks
    df_1 = pro.stock_basic(exchange="", list_status="L")
    df_2 = pro.stock_basic(exchange="", list_status="P")
    df_3 = pro.stock_basic(exchange="", list_status="D")
    df_1["status"] = "L"
    df_2["status"] = "P"
    df_3["status"] = "D"
    df = df_1.append(df_2).append(df_3)
    df["code"] = QA_fmt_code_list(df.ts_code)
    df["list_date_stamp"] = df.list_date.apply(QA_util_date_stamp)
    df = df.where(pd.notnull(df), None)
    js = QA_util_to_json_from_pandas(df.drop(columns=["ts_code", "symbol"]))
    for item in js:
        qry = {
            "code": item["code"],
            "status": item["status"],
            "list_date_stamp": item["list_date_stamp"],
        }
        if coll.count_documents(qry) == 0:
            # Incremental update: only insert records not seen before
            coll.insert_one(item)
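# Illustrative usage sketch (not part of the original module): assumes the same
# Tushare/MongoDB setup as above; `_example_refresh_stock_basic` is hypothetical.
def _example_refresh_stock_basic():
    # Incrementally update the stock_basic collection, then count listed stocks.
    QA_ts_update_stock_basic()
    return DATABASE.stock_basic.count_documents({"status": "L"})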
def _get_crosssection_financial(report_date, report_type, sheet_type, fields,
                                wait_seconds, trial_count):
    nonlocal pro, max_trial
    if trial_count >= max_trial:
        raise ValueError("[ERROR]\tEXCEED MAX TRIAL!")
    try:
        if not fields:
            print(
                f"pro.{sheet_type}_vip(period='{report_date}', report_type={report_type})"
            )
            df = eval(
                f"pro.{sheet_type}_vip(period='{report_date}', report_type={report_type})"
            )
        else:
            df = eval(
                f"pro.{sheet_type}_vip(period='{report_date}', report_type={report_type}, fields={fields})"
            )
        if df.empty:
            return df
        df.ts_code = QA_fmt_code_list(df.ts_code)
        return df.rename(columns={
            "ts_code": "code",
            "end_date": "report_date"
        }).sort_values(by=["ann_date", "f_ann_date"])
    except Exception as e:
        print(e)
        time.sleep(wait_seconds)
        # Return the retried result; without `return` the retry's data was silently dropped
        return _get_crosssection_financial(report_date, report_type, sheet_type,
                                           fields, wait_seconds, trial_count + 1)
def QA_fetch_get_factor_start_date(factor: pd.Series) -> pd.DataFrame:
    """
    Get the listing dates of the stocks in the factor pool.
    Note: log in to the local jqdatasdk yourself before calling.

    Parameters
    ---
    :param factor: factor values, indexed by ['datetime', 'code']

    Returns
    ---
    :return: factor data with an extra 'start_date' column
    """
    # Normalize factor format
    factor = QA_fmt_factor(factor.copy())
    merged_data = pd.DataFrame(factor.rename("factor"))
    # Normalize stock codes to JoinQuant style
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates(), style="jq")
    # Fetch listing dates
    df_local = jqdatasdk.get_all_securities(types="stock")
    intersection = df_local.index.intersection(stock_list)
    ss = df_local.loc[intersection]["start_date"]
    ss.index = ss.index.map(lambda x: x[:6])
    # Attach listing dates
    merged_data = merged_data.loc[(slice(None), list(ss.index)), :]
    merged_data["start_date"] = merged_data.index.map(
        lambda x: ss.loc[x[1]]).tolist()
    return merged_data
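# Illustrative usage sketch (not part of the original module): assumes jqdatasdk
# has been imported and authenticated; the two-asset factor below and the index
# level names ['datetime', 'code'] are examples of the expected input shape.
def _example_factor_start_date_jq() -> pd.DataFrame:
    # Build a tiny factor series, then attach each asset's listing date from JoinQuant.
    idx = pd.MultiIndex.from_product(
        [[pd.Timestamp("2021-01-04")], ["000001", "600000"]],
        names=["datetime", "code"])
    my_factor = pd.Series([0.5, -0.3], index=idx)
    return QA_fetch_get_factor_start_date(my_factor)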
def QA_fetch_factor_start_date(factor: pd.Series) -> pd.DataFrame:
    """
    Get the listing dates of the stocks in the factor pool from the local
    database. Save the stock basic information first (QA_ts_update_stock_basic).
    """
    factor = QA_fmt_factor(factor.copy())
    merged_data = pd.DataFrame(factor.rename("factor"))
    # Normalize stock codes
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates())
    # Fetch listing dates from the local stock_basic collection
    df_local = QA_fetch_stock_basic(status=None).set_index("code")
    intersection = df_local.index.intersection(stock_list)
    ss = df_local.loc[intersection]["list_date"]
    # Attach listing dates
    merged_data = merged_data.loc[(slice(None), list(ss.index)), :]
    merged_data["start_date"] = merged_data.index.map(
        lambda x: ss.loc[x[1]]).tolist()
    return merged_data
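# Illustrative usage sketch (not part of the original module): same factor layout
# as the example above, but listing dates come from the local database, so it
# assumes QA_ts_update_stock_basic() has already been run.
def _example_factor_start_date_local() -> pd.DataFrame:
    idx = pd.MultiIndex.from_product(
        [[pd.Timestamp("2021-01-04")], ["000001", "600000"]],
        names=["datetime", "code"])
    my_factor = pd.Series([0.5, -0.3], index=idx)
    return QA_fetch_factor_start_date(my_factor)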
def QA_ts_update_industry(
        level: Union[str, List, Tuple] = ["L1", "L2", "L3"],
        src: Union[str, List, Tuple] = "SW2021",
):
    """
    Save industry classification of individual stocks
    """
    pro = get_pro()
    if isinstance(level, str):
        level = [level]
    if isinstance(src, str):
        src = [src]
    df_industry = pd.DataFrame()
    for s in src:
        for lv in level:
            try:
                df_tmp = pro.index_classify(level=lv, src=s)
                df_tmp["src"] = "sw"
                df_industry = df_industry.append(df_tmp)
            except Exception as e1:
                print(e1)
                time.sleep(1)
                try:
                    df_tmp = pro.index_classify(level=lv, src=s)
                    df_tmp["src"] = "sw"
                    df_industry = df_industry.append(df_tmp)
                except Exception as e2:
                    raise ValueError(e2)
    df_results = pd.DataFrame()
    for idx, item in df_industry.iterrows():
        time.sleep(0.3)
        if idx % 100 == 0:
            print(f"currently saving {idx}th record")
        try:
            df_tmp = pro.index_member(index_code=item["index_code"])
            df_tmp["industry_name"] = item["industry_name"]
            df_tmp["level"] = item["level"].lower()
            df_tmp["src"] = item["src"].lower()
            df_results = df_results.append(df_tmp)
        except Exception as e1:
            print(e1)
            time.sleep(1)
            try:
                df_tmp = pro.index_member(index_code=item["index_code"])
                df_tmp["industry_name"] = item["industry_name"]
                df_tmp["level"] = item["level"].lower()
                df_tmp["src"] = item["src"].lower()
                df_results = df_results.append(df_tmp)
            except Exception as e2:
                raise ValueError(e2)
    df_results.con_code = QA_fmt_code_list(df_results.con_code)
    df_results = df_results.rename(columns={"con_code": "code"})
    df_results = df_results.sort_values(by="code")
    coll = DATABASE.industry
    coll.create_index(
        [
            ("code", ASCENDING),
            ("level", ASCENDING),
            ("src", ASCENDING),
            ("in_date_stamp", DESCENDING),
            ("out_date_stamp", DESCENDING),
        ],
        unique=False,
    )
    for item in QA_util_to_json_from_pandas(df_results):
        item["in_date_stamp"] = QA_util_date_stamp(item["in_date"])
        if not item["out_date"]:
            # Still a member of the industry: use a far-future sentinel stamp
            item["out_date_stamp"] = 9999999999
        else:
            item["out_date_stamp"] = QA_util_date_stamp(item["out_date"])
        coll.update_one(
            {
                "code": item["code"],
                "level": item["level"],
                "src": item["src"],
                "in_date_stamp": item["in_date_stamp"],
            },
            {"$set": item},
            upsert=True,
        )
    print('finished saving industry')
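# Illustrative usage sketch (not part of the original module): assumes the same
# Tushare/MongoDB setup as above; saves Shenwan 2021 level-1 membership and
# reads back one stock's industry records (level is stored lower-cased as "l1").
def _example_refresh_industry():
    QA_ts_update_industry(level="L1", src="SW2021")
    return list(DATABASE.industry.find({"code": "000001", "level": "l1"}, {"_id": 0}))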
def QA_fetch_get_factor_groupby(factor: pd.Series,
                                industry_cls: str = "sw_l1",
                                detailed: bool = False) -> pd.DataFrame:
    """
    Get the industry exposure of a factor. Note that the return value is a
    pd.DataFrame containing the original factor values plus one extra column
    with the industry of each asset (import and log in to the local jqdatasdk
    yourself first).

    Parameters
    ---
    :param factor: factor values, indexed by ['datetime', 'code']
    :param industry_cls: industry classification, defaults to Shenwan level 1
    :param detailed: whether to use detailed mode; defaults to False, i.e. use
        the industry information of the last date in the factor index

    Returns
    ---
    :return: factor data, including factor values and the matching industry
    """
    warnings.warn("import and log in to the local jqdatasdk first", UserWarning)
    # Normalize factor format
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # Normalize stock codes to JoinQuant style
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates(), style="jq")
    # In non-detailed mode the industry data of the last date is used
    if detailed:
        # start_time = str(min(factor.index.get_level_values("datetime")))[:10]
        # end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        # date_range = list(
        #     map(pd.Timestamp, QA_util_get_trade_range(start_time, end_time))
        # )
        date_range = (factor.index.get_level_values(
            "datetime").drop_duplicates().tolist())
        df_local = pd.DataFrame()
        industries = map(partial(jqdatasdk.get_industry, stock_list), date_range)
        industries = {
            d: {
                s: ind.get(s).get(industry_cls, dict()).get("industry_name", "NA")
                for s in stock_list
            }
            for d, ind in zip(date_range, industries)
        }
    else:
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]
        industries = jqdatasdk.get_industry(stock_list, end_time)
        industries = {
            d: {
                s: industries.get(s).get(industry_cls,
                                         dict()).get("industry_name", "NA")
                for s in stock_list
            }
            for d in date_range
        }
    # A stock may have no industry information early in its history;
    # back-fill with later industry information
    df_local = pd.DataFrame(industries).T.sort_index()
    df_local.columns = df_local.columns.map(str).str.slice(0, 6)
    ss_local = df_local.stack(level=-1)
    ss_local.index.names = ["date", "code"]
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index([
        "date", "code"
    ]).assign(group=ss_local).reset_index().set_index(
        ["datetime", "code"]).drop("date", axis=1))
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
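# Illustrative usage sketch (not part of the original module): this variant relies
# on jqdatasdk, so it assumes the SDK has been imported and authenticated; the
# example factor below is hypothetical.
def _example_factor_groupby_jq() -> pd.DataFrame:
    idx = pd.MultiIndex.from_product(
        [[pd.Timestamp("2021-01-04")], ["000001", "600000"]],
        names=["datetime", "code"])
    my_factor = pd.Series([0.5, -0.3], index=idx)
    return QA_fetch_get_factor_groupby(my_factor, industry_cls="sw_l1", detailed=False)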
def QA_fetch_get_individual_financial(code: str,
                                      start: Union[str, datetime.datetime, pd.Timestamp] = None,
                                      end: Union[str, datetime.datetime, pd.Timestamp] = None,
                                      report_date: Union[str, datetime.datetime] = None,
                                      sheet_type: str = "income",
                                      report_type: Union[int, str] = 1,
                                      fields: Union[str, Tuple, List] = None,
                                      wait_seconds: int = 61,
                                      max_trial: int = 3) -> pd.DataFrame:
    """Query financial statements of an individual stock over the network.
    Note that 'start' and 'end' define a range query on 'report_date'.

    Args:
        code (str): stock code
        start (Union[str, datetime.datetime, pd.Timestamp], optional): query start time, defaults to None
        end (Union[str, datetime.datetime, pd.Timestamp], optional): query end time, defaults to None
        report_date (Union[str, datetime.datetime], optional): report date, defaults to None;
            if report_date is given, 'start' and 'end' are ignored
        sheet_type (str, optional): statement type, defaults to "income"
            (income statement "income" | balance sheet "balancesheet" | cash flow statement "cashflow" |
             earnings forecast "forecast" | earnings express report "express")
        report_type (Union[int, str], optional): report type, defaults to 1.
            (1 consolidated statement, the company's latest statement (default) |
             2 single-quarter consolidated statement |
             3 adjusted single-quarter consolidated statement (if any) |
             4 adjusted consolidated statement: prior-year figures published this year, report period is last year |
             5 pre-adjustment consolidated statement: original data kept before a revision |
             6 parent-company statement |
             7 parent-company single-quarter statement |
             8 parent-company adjusted single-quarter statement |
             9 parent-company adjusted statement: prior-year figures published this year |
             10 parent-company pre-adjustment statement: original data before adjustment |
             11 pre-adjustment consolidated statement: original consolidated data before adjustment |
             12 parent-company pre-adjustment statement: original data kept before a revision)
        fields (Union[str, Tuple, List], optional): columns to return; None returns all columns. Defaults to None.
        wait_seconds (int, optional): seconds to wait before retrying. Defaults to 61.
        max_trial (int, optional): maximum number of retries. Defaults to 3.

    Returns:
        pd.DataFrame: statements of the given type for the given stock within the given report-date range
    """

    def _get_individual_financial(code, report_date, report_type, sheet_type,
                                  fields, wait_seconds, trial_count):
        nonlocal pro, max_trial
        if trial_count >= max_trial:
            raise ValueError("[ERROR]\tEXCEED MAX TRIAL!")
        try:
            if not fields:
                df = eval(
                    f"pro.{sheet_type}(ts_code='{code}', period='{report_date}', report_type={report_type})"
                )
            else:
                df = eval(
                    f"pro.{sheet_type}(ts_code='{code}', period='{report_date}', report_type={report_type}, fields={fields})"
                )
            return df.rename(columns={
                "ts_code": "code",
                "end_date": "report_date"
            })
        except Exception as e:
            print(e)
            time.sleep(wait_seconds)
            # Return the retried result; without `return` the retry's data was silently dropped
            return _get_individual_financial(code, report_date, report_type,
                                             sheet_type, fields, wait_seconds,
                                             trial_count + 1)

    pro = get_pro()
    report_type = int(report_type)
    if (not start) and (not end) and (not report_date):
        raise ValueError(
            "[QRY_DATES ERROR]\tparam 'start', 'end' and 'report_date' should not be none at the same time!"
        )
    if isinstance(fields, str):
        fields = sorted(
            list(
                set([
                    fields, "ts_code", "end_date", "ann_date", "f_ann_date",
                    "report_type", "update_flag"
                ])))
    if report_date:
        report_date = pd.Timestamp(report_date)
        year = report_date.year
        report_date_lists = [
            pd.Timestamp(str(year) + report_date_tail)
            for report_date_tail in REPORT_DATE_TAILS
        ]
        if report_date not in report_date_lists:
            raise ValueError("[REPORT_DATE ERROR]")
        if sheet_type not in [
                "income", "balancesheet", "cashflow", "forecast", "express"
        ]:
            raise ValueError("[SHEET_TYPE ERROR]")
        if report_type not in range(1, 13):
            raise ValueError("[REPORT_TYPE ERROR]")
        report_dates = [report_date]
    else:
        start = pd.Timestamp(start)
        start_year = start.year
        end = pd.Timestamp(end)
        end_year = end.year
        origin_year_ranges = pd.date_range(
            str(start_year), str(end_year + 1),
            freq='Y').map(str).str.slice(0, 4).tolist()
        origin_report_ranges = pd.Series([
            pd.Timestamp(year + report_date_tail)
            for year in origin_year_ranges
            for report_date_tail in REPORT_DATE_TAILS
        ])
        report_dates = origin_report_ranges.loc[
            (origin_report_ranges >= start) & (origin_report_ranges <= end)]
    df = pd.DataFrame()
    for report_date in report_dates:
        df = df.append(
            _get_individual_financial(
                code=QA_fmt_code(code, "ts"),
                report_date=report_date.strftime("%Y%m%d"),
                report_type=report_type,
                sheet_type=sheet_type,
                fields=fields,
                wait_seconds=wait_seconds,
                trial_count=0))
    df.code = QA_fmt_code_list(df.code)
    return df.reset_index(drop=True)
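# Illustrative usage sketch (not part of the original module): fetches the 2020
# annual consolidated income statement of one stock; the code and date below are
# examples only, and report_date must fall on a quarter-end in REPORT_DATE_TAILS.
def _example_individual_income() -> pd.DataFrame:
    return QA_fetch_get_individual_financial(
        code="000001",
        report_date="2020-12-31",
        sheet_type="income",
        report_type=1)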
def QA_fetch_get_daily_basic(code: Union[str, List, Tuple] = None,
                             trade_date: Union[str, pd.Timestamp, datetime.datetime] = None,
                             fields: Union[str, List, Tuple] = None,
                             wait_seconds: int = 61,
                             max_trial: int = 3) -> pd.DataFrame:
    """
    Fetch key daily fundamental indicators for a given trade date over the
    network, for stock screening and reporting.

    Args:
        code (Union[str, List, Tuple], optional): stock codes; defaults to None,
            i.e. all stocks of the given trade date
        trade_date (Union[str, pd.Timestamp, datetime.datetime], optional): trade date;
            defaults to None, i.e. the trade date closest to today
        fields (Union[str, List, Tuple], optional): defaults to None; if a single str is
            given, the returned DataFrame still includes the trade date and other auxiliary columns
        wait_seconds (int, optional): seconds to wait before retrying. Defaults to 61.
        max_trial (int, optional): maximum number of retries. Defaults to 3.

    Returns:
        pd.DataFrame: daily fundamental indicators of the given stocks on the given trade date
    """

    def _fetch_get_daily_basic(trade_date, fields, trial_count):
        nonlocal pro, max_trial
        # Check the trial count outside the try block so the ValueError is not
        # swallowed by the retry handler
        if trial_count >= max_trial:
            raise ValueError("[ERROR]\tEXCEED MAX TRIAL!")
        try:
            if not trade_date:
                trade_date = QA_util_get_pre_trade_date(
                    datetime.date.today(), 1).replace("-", "")
            else:
                trade_date = pd.Timestamp(trade_date).strftime("%Y%m%d")
            if not fields:
                qry = f"pro.daily_basic(trade_date='{trade_date}')"
            else:
                if isinstance(fields, str):
                    fields = list(set([fields] + ["ts_code", "trade_date"]))
                fields = ",".join(fields)
                qry = f"pro.daily_basic(trade_date='{trade_date}', fields='{fields}')"
            df = eval(qry)
            if df is None:
                raise ValueError("[ERROR]")
            return df
        except Exception as e:
            print(e)
            time.sleep(wait_seconds)
            return _fetch_get_daily_basic(trade_date, fields, trial_count + 1)

    pro = get_pro()
    df = _fetch_get_daily_basic(trade_date=trade_date,
                                fields=fields,
                                trial_count=0)
    if df.empty:
        return df
    df = df.rename(columns={"ts_code": "code"})
    df.code = QA_fmt_code_list(df.code)
    df = df.set_index("code")
    if not code:
        return df
    if isinstance(code, str):
        code = (code, )
    # Exclude codes that are not present in the returned DataFrame
    filter_idx = df.index.intersection(code)
    return df.loc[filter_idx]
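# Illustrative usage sketch (not part of the original module): pulls a few
# daily-basic columns for two stocks on one trade date; the codes and date are
# examples only, and the field names are standard Tushare daily_basic columns.
def _example_daily_basic() -> pd.DataFrame:
    return QA_fetch_get_daily_basic(
        code=["000001", "600000"],
        trade_date="2021-01-04",
        fields=["ts_code", "trade_date", "pe", "turnover_rate"])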
def QA_fetch_get_factor_groupby(
        factor: pd.Series,
        industry_cls: str = "sw_l1",
        detailed: bool = False
) -> pd.DataFrame:
    """
    Get the industry exposure of a factor. Note that the return value is a
    pd.DataFrame containing the original factor values plus one extra column
    with the industry of each asset (save industry data locally first via
    QA_ts_update_industry).

    Parameters
    ---
    :param factor: factor values, indexed by ['datetime', 'code']
    :param industry_cls: industry classification, defaults to Shenwan level 1
    :param detailed: whether to use detailed mode; defaults to False, i.e. use
        the industry information of the last date in the factor index

    Returns
    ---
    :return: factor data, including factor values and the matching industry
    """
    warnings.warn("save industry data locally first (QA_ts_update_industry)",
                  UserWarning)
    # Normalize factor format
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # Normalize stock codes
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates())
    # In non-detailed mode the industry data of the last date is used
    ss = pd.Series()
    if detailed:
        # start_time = str(min(factor.index.get_level_values("datetime")))[:10]
        # end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        # date_range = list(
        #     map(pd.Timestamp, QA_util_get_trade_range(start_time, end_time))
        # )
        date_range = (
            factor.index.get_level_values("datetime").drop_duplicates().tolist()
        )
        industry = pd.DataFrame()
        for cursor_date in date_range:
            df_tmp = QA_fetch_industry_adv(
                code=stock_list,
                cursor_date=cursor_date)[["code", "industry_name"]]
            df_tmp["date"] = cursor_date
            industry = industry.append(df_tmp)
        ss = industry.set_index(["date", "code"])["industry_name"]
        # industries = map(
        #     partial(jqdatasdk.get_industry, stock_list),
        #     date_range
        # )
        # industries = {
        #     d: {
        #         s: ind.get(s).get(industry_cls, dict()).get("industry_name", "NA")
        #         for s in stock_list
        #     }
        #     for d, ind in zip(date_range, industries)
        # }
    else:
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]
        # industries = jqdatasdk.get_industry(stock_list, end_time)
        ss = QA_fetch_industry_adv(
            stock_list,
            end_time)[["code", "industry_name"]].set_index(
                ["date", "code"])["industry_name"]
        # industries = {
        #     d: {
        #         s: industries.get(s).get(industry_cls, dict()).get("industry_name", "NA")
        #         for s in stock_list
        #     }
        #     for d in date_range
        # }
    # A stock may have no industry information early in its history;
    # back-fill with later industry information
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date()
    )
    merged_data = (
        merged_data.reset_index().set_index(
            ["date", "code"]
        ).assign(group=ss).reset_index().set_index(
            ["datetime", "code"]).drop("date", axis=1)
    )
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
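# Illustrative usage sketch (not part of the original module): assumes industry
# data has been saved locally via QA_ts_update_industry(); attaches a 'group'
# column (industry name) to a tiny hypothetical factor.
def _example_factor_groupby_local() -> pd.DataFrame:
    idx = pd.MultiIndex.from_product(
        [[pd.Timestamp("2021-01-04")], ["000001", "600000"]],
        names=["datetime", "code"])
    my_factor = pd.Series([0.5, -0.3], index=idx)
    return QA_fetch_get_factor_groupby(my_factor, detailed=False)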