def run(self):
    """Fetch the SH and SZ ETF lists, persist them, then fetch each list's constituents."""
    session = get_http_session(self.mode)

    # Shanghai ETF list (JSON endpoint)
    sh_list_url = 'http://query.sse.com.cn/commonQuery.do?sqlId=COMMON_SSE_ZQPZ_ETFLB_L_NEW'
    sh_text = sync_get(session, sh_list_url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
    if sh_text is None:
        return
    sh_df = pd.DataFrame(demjson.decode(sh_text).get('result', []))
    self.persist_etf_list(sh_df, exchange='sh')
    self.logger.info('沪市 ETF 列表抓取完成...')

    # Shanghai ETF constituents
    self.download_sh_etf_component(sh_df, session)
    self.logger.info('沪市 ETF 成分股抓取完成...')

    # Shenzhen ETF list (xlsx download)
    sz_list_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1945'
    sz_content = sync_get(session, sz_list_url, return_type='content')
    if sz_content is None:
        return
    sz_df = pd.read_excel(io.BytesIO(sz_content), dtype=str)
    self.persist_etf_list(sz_df, exchange='sz')
    self.logger.info('深市 ETF 列表抓取完成...')

    # Shenzhen ETF constituents
    self.download_sz_etf_component(sz_df, session)
    self.logger.info('深市 ETF 成分股抓取完成...')
def populate_sh_etf_type(self, df: pd.DataFrame, http_session) -> pd.DataFrame:
    """
    Fill in the TYPE for each Shanghai ETF code.

    :param df: ETF list data (must contain a ``FUND_ID`` column)
    :param http_session: http session used for the requests
    :return: a copy of ``df`` with an added ``ETF_TYPE`` column
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryETFNewAllInfo.do?' \
                'isPagination=false&type={}&pageHelp.pageSize=25'

    type_df = pd.DataFrame()
    for etf_class in [1, 2]:
        url = query_url.format(etf_class)
        text = sync_get(http_session, url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
        if text is None:
            # best effort: skip a failed class query
            continue
        response_dict = demjson.decode(text)
        response_df = pd.DataFrame(response_dict.get('result', []))
        response_df = response_df[['fundid1', 'etftype']]
        type_df = pd.concat([type_df, response_df])

    # BUG FIX: the previous implementation sorted both frames independently and
    # assigned ``etftype`` positionally, which silently misaligns the types
    # whenever the two fund lists differ. Join explicitly on the fund code.
    type_df = type_df.drop_duplicates(subset='fundid1')
    result_df = df.copy().merge(type_df, how='left', left_on='FUND_ID', right_on='fundid1')
    result_df = result_df.sort_values(by='FUND_ID').reset_index(drop=True)
    result_df['ETF_TYPE'] = result_df['etftype']
    result_df = result_df.drop(columns=['fundid1', 'etftype'])
    return result_df
def fetch_szse_index_component(self, df: pd.DataFrame, http_session):
    """
    Fetch and persist the constituent stocks of every SZSE index in ``df``.
    """
    url_template = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

    for _, row in df.iterrows():
        code = row['code']
        raw = sync_get(http_session, url_template.format(code), return_type='content')
        if raw is None:
            continue

        members = pd.read_excel(io.BytesIO(raw), dtype='str')

        entity_id = f'index_cn_{code}'
        members['entity_id'] = entity_id
        members['entity_type'] = EntityType.Index.value
        members['exchange'] = 'cn'
        members['code'] = code
        members['name'] = row['name']
        members['timestamp'] = now_pd_timestamp(self.region)

        members.rename(columns={'证券代码': 'stock_code', '证券简称': 'stock_name'}, inplace=True)
        members['stock_id'] = members['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
        members['id'] = members['stock_id'].apply(lambda sid: f'{entity_id}_{sid}')

        df_to_db(df=members, ref_df=None, region=self.region,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f'{row["name"]} - {code} 成分股抓取完成...')
        self.sleep()
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch the stocks that belong to a category/block entity, walking the
    paginated API (pages 1-4).

    :return: DataFrame of stock records, or None when nothing was fetched
    """
    all_records = []
    for page in range(1, 5):
        text = sync_get(http_session,
                        self.category_stocks_url.format(page, entity.code),
                        return_type='text')
        # the API answers the literal string 'null' past the last page
        if text is None or text == 'null':
            break
        category_jsons = demjson.decode(text)

        for category in category_jsons:
            stock_code = category['code']
            all_records.append({
                'stock_id': china_stock_code_to_id(stock_code),
                'stock_code': stock_code,
                'stock_name': category['name'],
            })

    # BUG FIX: the old code returned from inside the loop after the first
    # non-empty page, so pages 2-4 were never collected; accumulate across
    # pages and return once at the end.
    if all_records:
        return pd.DataFrame.from_records(all_records)
    return None
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch daily kdata for an ETF.

    The endpoint is not paginated, so when the requested window exceeds what
    we would normally ask for, fall back to the endpoint's maximum size.
    """
    # the url does not support paging; cap at the maximum count instead
    if start is None or size > self.default_size:
        size = 8000

    url = ChinaETFDayKdataRecorder.url.format(entity.exchange, entity.code, size)
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return None

    payload = demjson.decode(text)
    if payload is None or len(payload) == 0:
        return None

    df = pd.DataFrame(payload)
    df['level'] = self.level.value
    return df
def record(self, entity, start, end, size, timestamps, http_session):
    """Fetch the member stocks of a sina block entity."""
    text = sync_get(http_session,
                    self.category_stocks_url.format(entity.code, '1'),
                    return_type='text')
    if text is None:
        return None

    rows = []
    for raw in json_callback_param(text):
        fields = raw.split(',')
        code = fields[1]
        rows.append({
            'stock_id': china_stock_code_to_id(code),
            'stock_code': code,
            'stock_name': fields[2],
        })

    if rows:
        return pd.DataFrame.from_records(rows)

    self.sleep()
    return None
def refresh_token(self, http_session):
    """
    Refresh the cached access token from ``GET_TOKEN_URL``.

    On success stores the new token on ``self.token``; otherwise logs the
    failure and leaves the old token in place.
    """
    resp = sync_get(http_session, self.GET_TOKEN_URL)
    # BUG FIX: sync_get returns None on network failure; the old code then
    # crashed with AttributeError on resp.status_code.
    if resp is None:
        self.logger.exception("could not refresh_token")
        return
    self.logger.info("refresh_token resp.status_code:{}, resp.text:{}".format(resp.status_code, resp.text))
    if resp.status_code == 200 and resp.json() and 'access_token' in resp.json():
        self.token = resp.json()['access_token']
    else:
        self.logger.exception("could not refresh_token")
def download_sh_etf_component(self, df: pd.DataFrame, http_session):
    """
    Download and persist the constituent stocks of Shanghai ETFs.

    ETF_CLASS => 1. single-market ETF 2. cross-market ETF 3. cross-border ETF
    5. bond ETF 6. gold ETF

    :param df: ETF list data (SSE listing; assumes FUND_ID/FUND_NAME/ETF_CLASS
               columns — TODO confirm against persist_etf_list's output)
    :param http_session: http session used for the requests
    :return: None
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                'isPagination=false&type={}&etfClass={}'

    # only single-market (1) and cross-market (2) ETFs are queried here
    etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
    etf_df = self.populate_sh_etf_type(etf_df, http_session)

    for _, etf in etf_df.iterrows():
        url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
        text = sync_get(http_session, url, headers=DEFAULT_SH_ETF_LIST_HEADER, return_type='text')
        if text is None:
            # best effort: skip this ETF on a failed request
            continue
        response_dict = demjson.decode(text)
        response_df = pd.DataFrame(response_dict.get('result', []))

        etf_code = etf['FUND_ID']
        etf_id = f'etf_sh_{etf_code}'
        response_df = response_df[['instrumentId', 'instrumentName']].copy()
        response_df.rename(columns={
            'instrumentId': 'stock_code',
            'instrumentName': 'stock_name'
        }, inplace=True)

        response_df['entity_id'] = etf_id
        response_df['entity_type'] = EntityType.ETF.value
        response_df['exchange'] = 'sh'
        response_df['code'] = etf_code
        response_df['name'] = etf['FUND_NAME']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(
            lambda code: china_stock_code_to_id(code))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{etf_id}_{x}')

        df_to_db(df=response_df, ref_df=None, region=Region.CHN,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
        self.sleep()
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch quarterly kdata pages for a security and concatenate them.

    Scrapes one HTML page per (year, quarter) from ``self.url`` and extracts
    the kdata table; returns a DataFrame with a ``level`` column, or None
    when no quarter produced data.
    """
    the_quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
    # if ``start`` is not the entity's first timestamp, the first quarter was
    # already recorded previously — skip it (but keep at least one quarter)
    if not is_same_date(entity.timestamp, start) and len(the_quarters) > 1:
        the_quarters = the_quarters[1:]
    param = {
        'security_item': entity,
        'quarters': the_quarters,
        'level': self.level.value
    }
    security_item = param['security_item']
    quarters = param['quarters']
    level = param['level']
    result_df = pd.DataFrame()
    for year, quarter in quarters:
        query_url = self.url.format(security_item.code, year, quarter)
        text = sync_get(http_session, query_url, encoding='gbk', return_type='text')
        if text is None:
            continue
        try:
            dfs = pd.read_html(text)
        except ValueError as error:
            # pd.read_html raises ValueError when the page has no tables
            self.logger.error(
                f'skip ({year}-{quarter:02d}){security_item.code}{security_item.name}({error})'
            )
            self.sleep()
            continue
        # the kdata table is the 5th table on the page — TODO confirm this
        # index is stable if the site layout changes
        if len(dfs) < 5:
            self.sleep()
            continue
        df = dfs[4].copy()
        # first row is the header row inside the table body; drop it
        df = df.iloc[1:]
        df.columns = [
            'timestamp', 'open', 'high', 'close', 'low', 'volume', 'turnover'
        ]
        result_df = pd.concat([result_df, df])
        self.sleep()
    if pd_is_not_null(result_df):
        result_df['level'] = level
        return result_df
    return None
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch daily SSE market-summary records, one request per timestamp.

    Returns a DataFrame as soon as more than ``self.batch_size`` rows have
    accumulated (remaining timestamps are left for a later run), otherwise
    whatever was collected, or None when nothing was fetched.
    """
    json_results = []
    for timestamp in timestamps:
        timestamp_str = to_time_str(timestamp)
        url = self.url.format(timestamp_str)
        text = sync_get(http_session, url=url, headers=DEFAULT_SH_SUMMARY_HEADER, return_type='text')
        if text is None:
            continue
        # response is JSONP: strip the callback wrapper before decoding
        results = demjson.decode(text[text.index("(") + 1:text.index(")")])['result']
        # productType '1' is the record we want — presumably the stock
        # aggregate; verify against the endpoint's documentation
        result = [
            result for result in results if result['productType'] == '1'
        ]
        if result and len(result) == 1:
            result_json = result[0]
            # some older data is missing; default to 0.0
            json_results.append({
                'timestamp': timestamp,
                'pe': to_float(result_json['profitRate'], 0.0),
                'total_value': to_float(result_json['marketValue1'] + '亿', 0.0),
                'total_tradable_vaule': to_float(result_json['negotiableValue1'] + '亿', 0.0),
                'volume': to_float(result_json['trdVol1'] + '万', 0.0),
                'turnover': to_float(result_json['trdAmt1'] + '亿', 0.0),
                'turnover_rate': to_float(result_json['exchangeRate'], 0.0),
            })
        if len(json_results) > self.batch_size:
            df = pd.DataFrame.from_records(json_results)
            df['entity_id'] = entity.id
            df['provider'] = Provider.Exchange.value
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df['name'] = '上证指数'
            return df
    # NOTE(review): this final return does NOT add the entity_id/provider/
    # name columns that the batch-path return above does — presumably a
    # downstream step fills them in; confirm before unifying the two paths.
    if len(json_results) > 0:
        df = pd.DataFrame.from_records(json_results)
        return df
    return None
def fetch_csi_index(self, http_session) -> None:
    """
    Fetch the SSE/CSI index list (paginated), persist it, then fetch the
    constituents of every index.
    """
    url = 'http://www.csindex.com.cn/zh-CN/indices/index' \
          '?page={}&page_size={}&data_type=json&class_1=1&class_2=2&class_7=7&class_10=10'

    index_list = []
    page = 1
    page_size = 50
    while True:
        query_url = url.format(page, page_size)
        text = sync_get(http_session, query_url, return_type='text')
        # BUG FIX: the old code did ``continue`` here without advancing
        # ``page``, spinning forever on a persistent request failure.
        if text is None:
            break
        response_dict = demjson.decode(text)
        response_index_list = response_dict.get('list', [])
        # an empty page marks the end of the listing
        if len(response_index_list) == 0:
            break
        index_list.extend(response_index_list)
        self.logger.info(f'上证、中证指数第 {page} 页抓取完成...')
        page += 1
        self.sleep()

    # nothing fetched (e.g. first request failed): the column selection
    # below would raise on an empty frame
    if not index_list:
        return

    df = pd.DataFrame(index_list)
    df = df[[
        'base_date', 'base_point', 'index_code', 'indx_sname',
        'online_date', 'class_eseries'
    ]].copy()
    df.columns = [
        'timestamp', 'base_point', 'code', 'name', 'list_date', 'class_eseries'
    ]
    df['category'] = df['class_eseries'].apply(
        lambda x: x.split(' ')[0].lower())
    df = df.drop('class_eseries', axis=1)
    # keep only 6-digit index codes
    df = df.loc[df['code'].str.contains(r'^\d{6}$')]

    self.persist_index(df)
    self.logger.info('上证、中证指数列表抓取完成...')

    self.fetch_csi_index_component(df, http_session)
    self.logger.info('上证、中证指数成分股抓取完成...')
def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
    """
    Fetch and persist the constituent stocks of every CSI index in ``df``.
    """
    url_template = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

    for _, row in df.iterrows():
        index_code = row['code']
        raw = sync_get(http_session, url_template.format(index_code), return_type='content')
        if raw is None:
            self.logger.error(f'{row["name"]} - {index_code} 成分股抓取错误')
            continue

        members = pd.read_excel(io.BytesIO(raw))
        members = members[[
            '成分券代码Constituent Code', '成分券名称Constituent Name'
        ]].rename(columns={
            '成分券代码Constituent Code': 'stock_code',
            '成分券名称Constituent Name': 'stock_name'
        })

        entity_id = f'index_cn_{index_code}'
        members['entity_id'] = entity_id
        members['entity_type'] = EntityType.Index.value
        members['exchange'] = 'cn'
        members['code'] = index_code
        members['name'] = row['name']
        members['timestamp'] = now_pd_timestamp(self.region)
        members['stock_id'] = members['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
        members['id'] = members['stock_id'].apply(lambda sid: f'{entity_id}_{sid}')

        df_to_db(df=members, ref_df=None, region=self.region,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f'{row["name"]} - {index_code} 成分股抓取完成...')
        self.sleep()
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch daily money-flow data for a block/category entity.

    :return: DataFrame of money-flow records, or None when nothing was fetched
    """
    url = self.generate_url(category=entity.category, code=entity.code, number=size)
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return None

    # SECURITY FIX: the response used to be parsed with eval(), which executes
    # arbitrary code from the network; demjson tolerates the same relaxed
    # JSON and is already used throughout this module.
    json_list = demjson.decode(text)
    if json_list is None or len(json_list) == 0:
        return None

    result_list = []
    for item in json_list:
        result_list.append({
            'name': entity.name,
            'timestamp': to_pd_timestamp(item['opendate']),
            'close': to_float(item['avg_price']),
            'change_pct': to_float(item['avg_changeratio']),
            'turnover_rate': to_float(item['turnover']) / 10000,
            'net_inflows': to_float(item['netamount']),
            'net_inflow_rate': to_float(item['ratioamount']),
            'net_main_inflows': to_float(item['r0_net']),
            'net_main_inflow_rate': to_float(item['r0_ratio'])
        })

    if len(result_list) > 0:
        return pd.DataFrame.from_records(result_list)
    return None
def process_loop(self, entity, http_session):
    """
    Download and persist the category listing for ``entity`` (an exchange).
    """
    url = self.category_map_url.get(entity, None)
    if url is None:
        return

    resp = sync_get(http_session, url, encoding='GB2312',
                    headers=self.category_map_header[entity])
    # BUG FIX: sync_get returns None on failure; the old code dereferenced
    # resp.status_code unguarded (the sibling process_loop checks for None).
    if resp is None or resp.status_code != 200:
        return

    df = self.format(resp=resp, exchange=entity)
    if pd_is_not_null(df):
        self.persist(df)
    return None
def process_loop(self, entity, http_session):
    """Fetch the stock screener listing for an exchange from the Nasdaq API."""
    url = 'https://api.nasdaq.com/api/screener/stocks'
    params = {'download': 'true', 'exchange': entity}
    resp = sync_get(http_session, url, headers=YAHOO_STOCK_LIST_HEADER,
                    params=params, enable_proxy=False)
    if resp is None:
        return

    # note: renamed from ``json`` to avoid shadowing the builtin module
    rows = resp.json()['data']['rows']
    if len(rows) > 0:
        df = self.format(content=rows, exchange=entity)
        self.persist(df)
    return None
def fetch_cni_index_component(self, df: pd.DataFrame, http_session):
    """
    Fetch and persist the constituent stocks of every CNI index in ``df``.
    """
    query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            continue

        response_df = pd.read_excel(io.BytesIO(content), dtype='str')

        index_id = f'index_cn_{index_code}'
        # the stock-code column header differs between files
        try:
            response_df = response_df[['样本股代码']]
        except KeyError:
            response_df = response_df[['证券代码']]
        # BUG FIX: rename the single stock-code column *before* adding the
        # other columns; the old code assigned a one-element column list to a
        # seven-column frame after the additions, which raises ValueError.
        response_df.columns = ['stock_code']

        response_df['entity_id'] = index_id
        response_df['entity_type'] = EntityType.Index.value
        response_df['exchange'] = 'cn'
        response_df['code'] = index_code
        response_df['name'] = index['name']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(
            lambda x: china_stock_code_to_id(str(x)))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{index_id}_{x}')

        df_to_db(df=response_df, ref_df=None, region=self.region,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')
        self.sleep()
def fetch_szse_index(self, http_session) -> None:
    """
    Fetch the SZSE index list, persist it, then fetch every index's constituents.
    """
    url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1812_zs&TABKEY=tab1'
    raw = sync_get(http_session, url, return_type='content')
    if raw is None:
        return

    index_df = pd.read_excel(io.BytesIO(raw), dtype='str')
    index_df.columns = ['code', 'name', 'timestamp', 'base_point', 'list_date']
    index_df['category'] = 'szse'
    # keep only 6-digit index codes
    index_df = index_df.loc[index_df['code'].str.contains(r'^\d{6}$')]

    self.persist_index(index_df)
    self.logger.info('深证指数列表抓取完成...')

    self.fetch_szse_index_component(index_df, http_session)
    self.logger.info('深证指数成分股抓取完成...')
def fetch_cumulative_net_value(self, security_item, start, end, http_session) -> pd.DataFrame:
    """
    Fetch the cumulative net value history of a fund between ``start`` and
    ``end``, walking the paginated eastmoney API.

    :return: DataFrame indexed by FSRQ (date); empty when nothing was fetched
    """
    query_url = 'http://api.fund.eastmoney.com/f10/lsjz?' \
                'fundCode={}&pageIndex={}&pageSize=200&startDate={}&endDate={}'
    page = 1
    df = pd.DataFrame()
    while True:
        url = query_url.format(security_item.code, page, to_time_str(start), to_time_str(end))
        text = sync_get(http_session, url, headers=EASTMONEY_ETF_NET_VALUE_HEADER, return_type='text')
        if text is None:
            break

        # BUG FIX: a bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception and keep the best-effort break.
        try:
            response_json = demjson.decode(text)
            response_df = pd.DataFrame(response_json['Data']['LSJZList'])
        except Exception:
            break

        # empty page marks the end of the data
        if not pd_is_not_null(response_df):
            break

        response_df['FSRQ'] = pd.to_datetime(response_df['FSRQ'])
        response_df['JZZZL'] = pd.to_numeric(response_df['JZZZL'], errors='coerce')
        response_df['LJJZ'] = pd.to_numeric(response_df['LJJZ'], errors='coerce')
        response_df = response_df.fillna(0)
        response_df.set_index('FSRQ', inplace=True, drop=True)

        df = pd.concat([df, response_df])
        page += 1
        self.sleep()

    return df
def fetch_cni_index(self, http_session) -> None:
    """
    Fetch the CNI (国证) index list, persist it, then fetch every index's
    constituents.
    """
    url = 'http://www.cnindex.com.cn/zstx/jcxl/'
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return

    dfs = pd.read_html(text)

    # tables from the 9th onward are non-stock indices
    dfs = dfs[1:9]

    result_df = pd.DataFrame()
    for df in dfs:
        # first row of each table holds the column names
        header = df.iloc[0]
        df = df[1:]
        df.columns = header
        # BUG FIX: astype returns a new frame; the old ``df.astype('str')``
        # discarded its result and was a no-op.
        df = df.astype('str')
        result_df = pd.concat([result_df, df])

    result_df = result_df.drop('样本股数量', axis=1)
    result_df.columns = [
        'name', 'code', 'timestamp', 'base_point', 'list_date'
    ]
    result_df['timestamp'] = result_df['timestamp'].apply(
        lambda x: x.replace('-', ''))
    result_df['list_date'] = result_df['list_date'].apply(
        lambda x: x.replace('-', ''))
    result_df['category'] = 'csi'
    # keep only 6-digit index codes
    result_df = result_df.loc[result_df['code'].str.contains(r'^\d{6}$')]

    self.persist_index(result_df)
    self.logger.info('国证指数列表抓取完成...')

    self.fetch_cni_index_component(result_df, http_session)
    self.logger.info('国证指数成分股抓取完成...')
def process_loop(self, entity, http_session):
    """
    Fetch sina block (category) definitions for ``entity`` and persist them.
    """
    text = sync_get(http_session, self.category_map_url[entity], encoding='gbk', return_type='text')
    if text is None:
        return

    # the payload is JSONP-ish; carve out the single JSON object
    json_str = text[text.index('{'):text.index('}') + 1]
    tmp_json = json.loads(json_str)

    # BUG FIX: this helper was decorated with a live @njit(nopython=True),
    # but it uses dicts, f-strings and enum attributes, none of which compile
    # in numba nopython mode — the decorator is disabled here, matching the
    # commented-out ``# @njit`` pattern used elsewhere in this module.
    def build_records(tmp_json):
        the_list = []
        for code in tmp_json:
            name = tmp_json[code].split(',')[1]
            entity_id = f'block_cn_{code}'
            the_list.append({
                'id': entity_id,
                'entity_id': entity_id,
                'entity_type': EntityType.Block.value,
                'exchange': 'cn',
                'code': code,
                'name': name,
                'category': entity.value
            })
        return the_list

    the_list = build_records(tmp_json)
    if the_list:
        df = pd.DataFrame.from_records(the_list)
        df_to_db(df=df, ref_df=None, region=Region.CHN,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f"finish record sina blocks:{entity.value}")
def process_loop(self, entity, http_session):
    """
    Fetch the block listing for a (category, url) pair and persist it.
    """
    category, url = entity
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return

    results = json_callback_param(text)

    # BUG FIX: this helper was decorated with a live @njit(nopython=True),
    # but it uses dicts, f-strings and enum attributes, none of which compile
    # in numba nopython mode — the decorator is disabled here, matching the
    # commented-out ``# @njit`` pattern used elsewhere in this module.
    def build_records(results):
        the_list = []
        for result in results:
            items = result.split(',')
            code = items[1]
            name = items[2]
            entity_id = f'block_cn_{code}'
            the_list.append({
                'id': entity_id,
                'entity_id': entity_id,
                'entity_type': EntityType.Block.value,
                'exchange': 'cn',
                'code': code,
                'name': name,
                'category': category.value
            })
        return the_list

    the_list = build_records(results)
    if the_list:
        df = pd.DataFrame.from_records(the_list)
        df_to_db(df=df, ref_df=None, region=Region.CHN,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f"finish record sina blocks:{category.value}")
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch kline data for ``entity`` and return it as a DataFrame, or None.
    """
    url = self.url.format(
        "{}".format(entity.code), level_flag(self.level), size,
        now_time_str(region=Region.CHN, fmt=TIME_FORMAT_DAY1))
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return None

    results = json_callback_param(text)
    if results:
        klines = results['data']['klines']

        # BUG FIX: this helper was decorated with a live @njit(nopython=True),
        # but dict()/str.split code cannot compile in numba nopython mode —
        # disabled to match the commented-out ``# @njit`` usage elsewhere.
        def parse_klines(klines):
            kdatas = []
            # TODO: ignore the last unfinished kdata now,could control it better if need
            for result in klines[:-1]:
                # "2000-01-28,1005.26,1012.56,1173.12,982.13,3023326,3075552000.00"
                # time,open,close,high,low,volume,turnover
                fields = result.split(',')
                kdatas.append(
                    dict(timestamp=fields[0],
                         open=to_float(fields[1]),
                         close=to_float(fields[2]),
                         high=to_float(fields[3]),
                         low=to_float(fields[4]),
                         volume=to_float(fields[5]),
                         turnover=to_float(fields[6])))
            return kdatas

        kdatas = parse_klines(klines)
        if len(kdatas) > 0:
            return pd.DataFrame.from_records(kdatas)
    return None
def record(self, entity, start, end, size, timestamps, http_session):
    """
    Fetch daily money-flow data for a stock, broken down by order size.

    :return: DataFrame of money-flow records, or None when nothing was fetched
    """
    url = self.generate_url(code='{}{}'.format(entity.exchange, entity.code), number=size)
    text = sync_get(http_session, url, return_type='text')
    if text is None:
        return None

    # SECURITY FIX: the response used to be parsed with eval(), which executes
    # arbitrary code from the network; demjson tolerates the same relaxed
    # JSON and is already used throughout this module.
    json_list = demjson.decode(text)
    if json_list is None or len(json_list) == 0:
        return None

    # sample item:
    # {opendate:"2019-04-29",trade:"10.8700",changeratio:"-0.0431338",turnover:"74.924",netamount:"-2903349.8500",
    #  ratioamount:"-0.155177",r0:"0.0000",r1:"2064153.0000",r2:"6485031.0000",r3:"10622169.2100",r0_net:"0.0000",
    #  r1_net:"2064153.0000",r2_net:"-1463770.0000",r3_net:"-3503732.8500"}
    result_list = []
    for item in json_list:
        result = {
            'timestamp': to_pd_timestamp(item['opendate']),
            'close': to_float(item['trade']),
            'change_pct': to_float(item['changeratio']),
            'turnover_rate': to_float(item['turnover']) / 10000,
            'net_inflows': to_float(item['netamount']),
            'net_inflow_rate': to_float(item['ratioamount']),
            # main = huge (r0) + big (r1) orders
            'net_main_inflows': to_float(item['r0_net']) + to_float(item['r1_net']),
            'net_huge_inflows': to_float(item['r0_net']),
            'net_big_inflows': to_float(item['r1_net']),
            'net_medium_inflows': to_float(item['r2_net']),
            'net_small_inflows': to_float(item['r3_net']),
        }
        # the rate columns are only defined when total traded amount is non-zero
        amount = to_float(item['r0']) + to_float(item['r1']) + to_float(item['r2']) + to_float(item['r3'])
        if amount != 0:
            result['net_main_inflow_rate'] = (to_float(item['r0_net']) + to_float(item['r1_net'])) / amount
            result['net_huge_inflow_rate'] = to_float(item['r0_net']) / amount
            result['net_big_inflow_rate'] = to_float(item['r1_net']) / amount
            result['net_medium_inflow_rate'] = to_float(item['r2_net']) / amount
            result['net_small_inflow_rate'] = to_float(item['r3_net']) / amount
        result_list.append(result)

    return pd.DataFrame.from_records(result_list)
def download_sz_etf_component(self, df: pd.DataFrame, http_session):
    """
    Download and persist the constituent stocks of Shenzhen ETFs.

    Looks up each ETF's underlying index (filled in by
    ``parse_sz_etf_underlying_index``) and scrapes the index's newest
    constituents from sina.

    :param df: SZ ETF list data; assumes the Chinese column headers
               证券代码/证券简称/拟合指数 from the SZSE xlsx — TODO confirm
    :param http_session: http session used for the requests
    :return: None
    """
    query_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

    self.parse_sz_etf_underlying_index(df)
    for _, etf in df.iterrows():
        underlying_index = etf['拟合指数']
        etf_code = etf['证券代码']
        # ETFs whose underlying index is not an A-share index are skipped
        if len(underlying_index) == 0:
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
            continue

        url = query_url.format(underlying_index)
        text = sync_get(http_session, url, encoding='gbk', return_type='text')
        if text is None:
            continue

        try:
            dfs = pd.read_html(text, header=1)
        except ValueError as error:
            # pd.read_html raises ValueError when the page has no tables
            self.logger.error(f'HTML parse error: {error}, response: {text}')
            continue
        # the constituents table is the 4th table on the page — TODO confirm
        # this index is stable if the site layout changes
        if len(dfs) < 4:
            continue

        response_df = dfs[3].copy()
        response_df = response_df.dropna(axis=1, how='any')
        # re-pad codes to 6 digits (read_html parses them as integers)
        response_df['品种代码'] = response_df['品种代码'].apply(
            lambda x: f'{x:06d}')

        etf_id = f'etf_sz_{etf_code}'
        response_df = response_df[['品种代码', '品种名称']].copy()
        response_df.rename(columns={
            '品种代码': 'stock_code',
            '品种名称': 'stock_name'
        }, inplace=True)

        response_df['entity_id'] = etf_id
        response_df['entity_type'] = EntityType.ETF.value
        response_df['exchange'] = 'sz'
        response_df['code'] = etf_code
        response_df['name'] = etf['证券简称']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(
            lambda code: china_stock_code_to_id(code))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{etf_id}_{x}')

        df_to_db(df=response_df, ref_df=None, region=Region.CHN,
                 data_schema=self.data_schema, provider=self.provider)
        self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')
        self.sleep()