def populate_sh_etf_type(self, df: pd.DataFrame, http_session):
    """
    Attach the ETF type reported by the SSE query API to a Shanghai ETF list.

    The type listing is requested once per ETF class (1 and 2). Each fund in
    ``df`` is then matched to its type by fund id (``FUND_ID`` against the
    listing's ``fundid1``), so a fund missing from either side can never shift
    the types of the remaining funds.

    :param df: ETF list data; must contain a ``FUND_ID`` column
    :param http_session: session object handed to ``sync_get``
    :return: a copy of ``df`` sorted by ``FUND_ID`` with a fresh 0..n-1 index
        and an added ``ETF_TYPE`` column; the value is missing (NaN) for funds
        that do not appear in the type listing
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryETFNewAllInfo.do?' \
                'isPagination=false&type={}&pageHelp.pageSize=25'

    # fund id (as str) -> etf type, accumulated over both ETF classes
    type_by_fund = {}
    for etf_class in [1, 2]:
        url = query_url.format(etf_class)
        text = sync_get(http_session, url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
        if text is None:
            continue

        response_dict = demjson.decode(text)
        response_df = pd.DataFrame(response_dict.get('result', []))
        if response_df.empty:
            # Nothing listed for this class; selecting columns would raise.
            continue

        type_by_fund.update(zip(response_df['fundid1'].astype(str), response_df['etftype']))

    result_df = df.sort_values(by='FUND_ID').reset_index(drop=True)
    # Key-based lookup instead of positional alignment of two sorted frames.
    result_df['ETF_TYPE'] = result_df['FUND_ID'].astype(str).map(type_by_fund)

    missing = int(result_df['ETF_TYPE'].isna().sum())
    if missing:
        self.logger.warning(f'{missing} SSE ETF(s) have no matching ETF type')

    return result_df
async def download_sh_etf_component(self, df: pd.DataFrame, http_session, db_session):
    """
    Download and persist the constituent stocks of Shanghai ETFs.

    ETF_CLASS => 1. single-market ETF 2. cross-market ETF 3. cross-border ETF
                 5. bond ETF 6. gold ETF

    Only ETFs of class '1' and '2' are processed. An ETF is skipped (and the
    loop continues with the next one) when its type is unknown, the request
    fails, the response cannot be decoded, or the response holds no
    constituents.

    :param df: ETF list data with ``ETF_CLASS``, ``FUND_ID`` and ``FUND_NAME`` columns
    :param http_session: session object handed to ``sync_get``
    :param db_session: database session handed to ``df_to_db``
    :return: None
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                'isPagination=false&type={}&etfClass={}'

    etf_df = df[df['ETF_CLASS'].isin(['1', '2'])]
    etf_df = self.populate_sh_etf_type(etf_df, http_session)

    for _, etf in etf_df.iterrows():
        etf_code = etf['FUND_ID']

        if pd.isna(etf['ETF_TYPE']):
            # Without a type the query URL would contain "nan"/"None".
            self.logger.warning(f'{etf["FUND_NAME"]} - {etf_code} has no ETF type, skipped')
            continue

        url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
        text = sync_get(http_session, url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
        if text is None:
            continue

        try:
            response_dict = demjson.decode(text)
        except Exception as e:
            self.logger.error(
                f'decode {url} failed with text: {text}, error as: {e}')
            continue

        response_df = pd.DataFrame(response_dict.get('result', []))
        if response_df.empty:
            # An empty frame has no columns; selecting them below would raise
            # KeyError and abort the remaining ETFs.
            self.logger.warning(f'{etf["FUND_NAME"]} - {etf_code} returned no constituents, skipped')
            continue

        etf_id = f'etf_sh_{etf_code}'
        response_df = response_df[['instrumentId', 'instrumentName']].rename(columns={
            'instrumentId': 'stock_code',
            'instrumentName': 'stock_name'
        })

        response_df['entity_id'] = etf_id
        response_df['entity_type'] = EntityType.ETF.value
        response_df['exchange'] = ChnExchange.SSE.value
        response_df['code'] = etf_code
        response_df['name'] = etf['FUND_NAME']
        response_df['timestamp'] = now_pd_timestamp(self.region)
        response_df['stock_id'] = response_df['stock_code'].apply(china_stock_code_to_id)
        # Row id is the ETF entity id joined with the constituent's stock id.
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{etf_id}_{x}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=response_df)
        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
async def run(self):
    """
    Fetch and persist the ETF lists and ETF constituents of both exchanges.

    Shanghai (SSE) is handled first, then Shenzhen (SZSE). If either list
    request yields no data the method returns immediately, so a failed
    Shanghai list request also skips the Shenzhen part.
    """
    http_session = get_sync_http_session()
    db_session = get_db_session(self.region, self.provider, self.data_schema)

    # --- Shanghai: ETF list ---
    sh_list_url = 'http://query.sse.com.cn/commonQuery.do?sqlId=COMMON_SSE_ZQPZ_ETFLB_L_NEW'
    sh_text = sync_get(http_session, sh_list_url,
                       headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
    if sh_text is None:
        return

    sh_payload = demjson.decode(sh_text)
    sh_df = pd.DataFrame(sh_payload.get('result', []))
    await self.persist_etf_list(sh_df, ChnExchange.SSE.value, db_session)
    self.logger.info('沪市 ETF 列表抓取完成...')

    # --- Shanghai: ETF constituents ---
    await self.download_sh_etf_component(sh_df, http_session, db_session)
    self.logger.info('沪市 ETF 成分股抓取完成...')

    # --- Shenzhen: ETF list (served as an xlsx report) ---
    sz_list_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1945'
    sz_content = sync_get(http_session, sz_list_url, return_type='content')
    if sz_content is None:
        return

    sz_df = pd.read_excel(io.BytesIO(sz_content), dtype=str)
    await self.persist_etf_list(sz_df, ChnExchange.SZSE.value, db_session)
    self.logger.info('深市 ETF 列表抓取完成...')

    # --- Shenzhen: ETF constituents ---
    await self.download_sz_etf_component(sz_df, http_session, db_session)
    self.logger.info('深市 ETF 成分股抓取完成...')
async def fetch_cni_index_component(self, df: pd.DataFrame, http_session, db_session):
    """
    Fetch and persist the constituent stocks of CNI (cnindex.com.cn) indices.

    For every index in ``df`` the constituent spreadsheet is downloaded, the
    stock-code column is extracted and the rows are written with ``df_to_db``.
    Indices whose download fails are skipped.

    :param df: index list with ``code`` and ``name`` columns
    :param http_session: session object handed to ``sync_get``
    :param db_session: database session handed to ``df_to_db``
    :raises KeyError: if the spreadsheet has neither of the known code columns
    """
    query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

    for _, index in df.iterrows():
        index_code = index['code']

        url = query_url.format(index_code)
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            continue

        response_df = pd.read_excel(io.BytesIO(content), dtype='str')

        index_id = f'index_cn_{index_code}'

        # The code column appears under one of two headers in these files.
        code_column = '样本股代码' if '样本股代码' in response_df.columns else '证券代码'
        # Rename while the frame still has a single column. Assigning a
        # one-element ``columns`` list after the metadata columns are added
        # raises ValueError (length mismatch).
        response_df = response_df[[code_column]].rename(columns={code_column: 'stock_code'})

        response_df['entity_id'] = index_id
        response_df['entity_type'] = EntityType.Index.value
        response_df['exchange'] = 'cn'
        response_df['code'] = index_code
        response_df['name'] = index['name']
        # Same region as the one the rows are stored under below.
        response_df['timestamp'] = now_pd_timestamp(self.region)

        response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{index_id}_{x}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=response_df)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def fetch_szse_index(self, http_session, db_session) -> None:
    """
    Fetch the SZSE index list, persist it, then fetch its constituents.

    Returns without doing anything when the list download yields no content.

    :param http_session: session object handed to ``sync_get``
    :param db_session: database session forwarded to the constituent fetch
    """
    list_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1812_zs&TABKEY=tab1'
    payload = sync_get(http_session, list_url, return_type='content')
    if payload is None:
        return

    index_df = pd.read_excel(io.BytesIO(payload), dtype='str')
    index_df.columns = ['code', 'name', 'timestamp', 'base_point', 'list_date']
    index_df['category'] = 'szse'

    # Keep only rows whose code is exactly six digits.
    has_six_digit_code = index_df['code'].str.contains(r'^\d{6}$')
    index_df = index_df.loc[has_six_digit_code]

    await self.persist_index(index_df)
    self.logger.info('深证指数列表抓取完成...')

    # Constituents of the indices persisted above.
    await self.fetch_szse_index_component(index_df, http_session, db_session)
    self.logger.info('深证指数成分股抓取完成...')
async def fetch_csi_index(self, http_session, db_session) -> None:
    """
    Fetch the SSE/CSI index list page by page, persist it, then fetch the
    constituents of every index.

    Pages are requested until an empty page is returned. A failed page
    request is retried a limited number of times with a pause in between.
    If it keeps failing, pagination stops and whatever was collected so far
    is processed. Nothing is persisted when no index could be fetched.

    :param http_session: session object handed to ``sync_get``
    :param db_session: database session forwarded to the constituent fetch
    """
    url = 'http://www.csindex.com.cn/zh-CN/indices/index' \
          '?page={}&page_size={}&data_type=json&class_1=1&class_2=2&class_7=7&class_10=10'

    index_list = []
    page = 1
    page_size = 50
    max_attempts = 3  # consecutive failed requests tolerated for one page
    failed_attempts = 0

    while True:
        query_url = url.format(page, page_size)
        text = sync_get(http_session, query_url, return_type='text')
        if text is None:
            # Retrying without a bound would spin forever on a dead endpoint.
            failed_attempts += 1
            if failed_attempts >= max_attempts:
                self.logger.error(
                    f'fetch {query_url} failed {failed_attempts} times, stop paging')
                break
            await self.sleep()
            continue
        failed_attempts = 0

        response_dict = demjson.decode(text)
        response_index_list = response_dict.get('list', [])

        if not response_index_list:
            break

        index_list.extend(response_index_list)

        self.logger.info(f'上证、中证指数第 {page} 页抓取完成...')
        page += 1
        await self.sleep()

    if not index_list:
        # An empty frame has no columns; the selection below would raise.
        self.logger.error('no SSE/CSI index fetched, nothing to persist')
        return

    df = pd.DataFrame(index_list)
    df = df[['base_date', 'base_point', 'index_code', 'indx_sname', 'online_date', 'class_eseries']].copy()
    df.columns = ['timestamp', 'base_point', 'code', 'name', 'list_date', 'class_eseries']
    # Category is the lower-cased first word of the series label.
    df['category'] = df['class_eseries'].apply(lambda x: x.split(' ')[0].lower())
    df = df.drop('class_eseries', axis=1)
    # Keep only rows whose code is exactly six digits.
    df = df.loc[df['code'].str.contains(r'^\d{6}$')]

    await self.persist_index(df)
    self.logger.info('上证、中证指数列表抓取完成...')

    # Constituents of the indices persisted above.
    await self.fetch_csi_index_component(df, http_session, db_session)
    self.logger.info('上证、中证指数成分股抓取完成...')
async def fetch_csi_index_component(self, df: pd.DataFrame, http_session, db_session):
    """
    Fetch and persist the constituent stocks of SSE/CSI indices.

    For every index in ``df`` the constituent spreadsheet is downloaded and
    its code/name columns are written with ``df_to_db``. Indices whose
    download fails are logged and skipped.

    :param df: index list with ``code`` and ``name`` columns
    :param http_session: session object handed to ``sync_get``
    :param db_session: database session handed to ``df_to_db``
    """
    query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

    for _, index in df.iterrows():
        index_code = index['code']

        url = query_url.format(index_code)
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误')
            continue

        # Read every cell as text. Numeric parsing would turn a code such as
        # "000001" into the integer 1 and yield a wrong stock id.
        response_df = pd.read_excel(io.BytesIO(content), dtype='str')

        response_df = response_df[['成分券代码Constituent Code', '成分券名称Constituent Name']].rename(
            columns={'成分券代码Constituent Code': 'stock_code',
                     '成分券名称Constituent Name': 'stock_name'})

        index_id = f'index_cn_{index_code}'
        response_df['entity_id'] = index_id
        response_df['entity_type'] = EntityType.Index.value
        response_df['exchange'] = 'cn'
        response_df['code'] = index_code
        response_df['name'] = index['name']
        response_df['timestamp'] = now_pd_timestamp(self.region)

        response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
        # Row id is the index entity id joined with the constituent's stock id.
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{index_id}_{x}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=response_df)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def fetch_szse_index_component(self, df: pd.DataFrame, http_session, db_session):
    """
    Fetch and persist the constituent stocks of SZSE indices.

    One spreadsheet is downloaded per index in ``df``. Indices whose download
    yields no content are skipped. The method pauses via ``self.sleep()``
    after each stored index.

    :param df: index list with ``code`` and ``name`` columns
    :param http_session: session object handed to ``sync_get``
    :param db_session: database session handed to ``df_to_db``
    """
    url_template = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

    def to_stock_id(raw_code):
        return china_stock_code_to_id(str(raw_code))

    for _, index in df.iterrows():
        index_code = index['code']

        content = sync_get(http_session, url_template.format(index_code), return_type='content')
        if content is None:
            continue

        index_id = f'index_cn_{index_code}'

        # All sheet columns are kept; only the code/name headers are renamed.
        constituents = pd.read_excel(io.BytesIO(content), dtype='str')
        constituents = constituents.rename(
            columns={'证券代码': 'stock_code', '证券简称': 'stock_name'})

        constituents['entity_id'] = index_id
        constituents['entity_type'] = EntityType.Index.value
        constituents['exchange'] = 'cn'
        constituents['code'] = index_code
        constituents['name'] = index['name']
        constituents['timestamp'] = now_pd_timestamp(self.region)

        constituents['stock_id'] = constituents['stock_code'].apply(to_stock_id)
        constituents['id'] = constituents['stock_id'].apply(
            lambda stock_id: f'{index_id}_{stock_id}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=constituents)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

        await self.sleep()
async def fetch_cni_index(self, http_session, db_session=None) -> None:
    """
    Fetch the CNI index list, persist it, then fetch its constituents.

    :param http_session: session object handed to ``sync_get``
    :param db_session: database session forwarded to the constituent fetch;
        when omitted, one is created with ``get_db_session`` for this
        recorder's region, provider and schema
    """
    url = 'http://www.cnindex.com.cn/zstx/jcxl/'
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return

    dfs = pd.read_html(text)

    # Tables after the 9th one are non-stock indices.
    dfs = dfs[1:9]

    frames = []
    for df in dfs:
        # The first row of every table carries the column headers.
        header = df.iloc[0]
        # astype returns a new frame, so its result must be kept. The date
        # clean-up below relies on every cell being a string.
        df = df[1:].astype('str')
        df.columns = header
        frames.append(df)

    if not frames:
        self.logger.error(f'no index table found at {url}')
        return

    result_df = pd.concat(frames)

    result_df = result_df.drop('样本股数量', axis=1)
    result_df.columns = ['name', 'code', 'timestamp', 'base_point', 'list_date']

    # Dates arrive as YYYY-MM-DD; the dashes are removed.
    result_df['timestamp'] = result_df['timestamp'].apply(lambda x: x.replace('-', ''))
    result_df['list_date'] = result_df['list_date'].apply(lambda x: x.replace('-', ''))

    # NOTE(review): category is 'csi' although these are CNI indices.
    # Kept as-is because stored data may depend on it; confirm intent.
    result_df['category'] = 'csi'
    # Keep only rows whose code is exactly six digits.
    result_df = result_df.loc[result_df['code'].str.contains(r'^\d{6}$')]

    await self.persist_index(result_df)
    self.logger.info('国证指数列表抓取完成...')

    # The constituent fetch requires a database session. Calling it without
    # one raises TypeError.
    if db_session is None:
        db_session = get_db_session(self.region, self.provider, self.data_schema)

    # Constituents of the indices persisted above.
    await self.fetch_cni_index_component(result_df, http_session, db_session)
    self.logger.info('国证指数成分股抓取完成...')
async def download_sz_etf_component(self, df: pd.DataFrame, http_session, db_session):
    """
    Download and persist the constituent stocks of Shenzhen ETFs.

    ``self.parse_sz_etf_underlying_index(df)`` is called first. Each row's
    '拟合指数' value is then used to request that index's constituent page
    from Sina. A row is skipped when that value is empty, the request yields
    nothing, the HTML has no parsable table, or fewer than four tables are
    found.

    :param df: SZSE ETF list with '拟合指数', '证券代码' and '证券简称' columns
    :param http_session: session object handed to ``sync_get``
    :param db_session: database session handed to ``df_to_db``
    """
    url_template = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

    self.parse_sz_etf_underlying_index(df)

    for _, etf in df.iterrows():
        underlying_index = etf['拟合指数']
        etf_code = etf['证券代码']

        if len(underlying_index) == 0:
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
            continue

        url = url_template.format(underlying_index)
        text = sync_get(http_session, url, encoding='gbk', return_type='text')
        if text is None:
            continue

        try:
            tables = pd.read_html(text, header=1)
        except ValueError as error:
            self.logger.error(
                f'HTML parse error: {error}, response: {text}')
            continue

        # The constituent table is the fourth one on the page.
        if len(tables) < 4:
            continue

        # Drop every column that contains a missing value.
        constituents = tables[3].copy().dropna(axis=1, how='any')
        # Zero-pad the codes to six digits.
        constituents['品种代码'] = constituents['品种代码'].apply(
            lambda raw: f'{raw:06d}')

        etf_id = f'etf_sz_{etf_code}'
        constituents = constituents[['品种代码', '品种名称']].rename(columns={
            '品种代码': 'stock_code',
            '品种名称': 'stock_name'
        })

        constituents['entity_id'] = etf_id
        constituents['entity_type'] = EntityType.ETF.value
        constituents['exchange'] = ChnExchange.SZSE.value
        constituents['code'] = etf_code
        constituents['name'] = etf['证券简称']
        constituents['timestamp'] = now_pd_timestamp(self.region)
        constituents['stock_id'] = constituents['stock_code'].apply(
            lambda code: china_stock_code_to_id(code))
        constituents['id'] = constituents['stock_id'].apply(
            lambda stock_id: f'{etf_id}_{stock_id}')

        await df_to_db(region=self.region,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       db_session=db_session,
                       df=constituents)
        self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')