コード例 #1
0
    def populate_sh_etf_type(self, df: pd.DataFrame, http_session):
        """
        填充沪市 ETF 代码对应的 TYPE 到列表数据中
        :param df: ETF 列表数据
        :return: 包含 ETF 对应 TYPE 的列表数据
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryETFNewAllInfo.do?' \
                    'isPagination=false&type={}&pageHelp.pageSize=25'

        type_df = pd.DataFrame()
        for etf_class in [1, 2]:
            url = query_url.format(etf_class)
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            response_dict = demjson.decode(text)
            response_df = pd.DataFrame(response_dict.get('result', []))
            response_df = response_df[['fundid1', 'etftype']]

            type_df = pd.concat([type_df, response_df])

        result_df = df.copy()
        result_df = result_df.sort_values(by='FUND_ID').reset_index(drop=True)
        type_df = type_df.sort_values(by='fundid1').reset_index(drop=True)

        result_df['ETF_TYPE'] = type_df['etftype']

        return result_df
コード例 #2
0
    async def download_sh_etf_component(self, df: pd.DataFrame, http_session,
                                        db_session):
        """
        ETF_CLASS => 1. 单市场 ETF 2.跨市场 ETF 3. 跨境 ETF
                        5. 债券 ETF 6. 黄金 ETF
        :param df: ETF 列表数据
        :return: None
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                    'isPagination=false&type={}&etfClass={}'

        etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
        etf_df = self.populate_sh_etf_type(etf_df, http_session)

        for _, etf in etf_df.iterrows():
            url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            try:
                response_dict = demjson.decode(text)
            except Exception as e:
                self.logger.error(
                    f'decode {url} failed with text: {text}, error as: {e}')
                continue

            response_df = pd.DataFrame(response_dict.get('result', []))
            etf_code = etf['FUND_ID']
            etf_id = f'etf_sh_{etf_code}'
            response_df = response_df[['instrumentId',
                                       'instrumentName']].copy()
            response_df.rename(columns={
                'instrumentId': 'stock_code',
                'instrumentName': 'stock_name'
            },
                               inplace=True)

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = ChnExchange.SSE.value
            response_df['code'] = etf_code
            response_df['name'] = etf['FUND_NAME']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda code: china_stock_code_to_id(code))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{etf_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
コード例 #3
0
    async def run(self):
        http_session = get_sync_http_session()
        db_session = get_db_session(self.region, self.provider,
                                    self.data_schema)

        # 抓取沪市 ETF 列表
        url = 'http://query.sse.com.cn/commonQuery.do?sqlId=COMMON_SSE_ZQPZ_ETFLB_L_NEW'
        text = sync_get(http_session,
                        url,
                        headers=DEFAULT_SH_ETF_LIST_HEADER,
                        return_type='text')
        if text is None:
            return

        response_dict = demjson.decode(text)

        df = pd.DataFrame(response_dict.get('result', []))
        await self.persist_etf_list(df, ChnExchange.SSE.value, db_session)
        self.logger.info('沪市 ETF 列表抓取完成...')

        # 抓取沪市 ETF 成分股
        await self.download_sh_etf_component(df, http_session, db_session)
        self.logger.info('沪市 ETF 成分股抓取完成...')

        # 抓取深市 ETF 列表
        url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1945'
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            return

        df = pd.read_excel(io.BytesIO(content), dtype=str)
        await self.persist_etf_list(df, ChnExchange.SZSE.value, db_session)
        self.logger.info('深市 ETF 列表抓取完成...')

        # 抓取深市 ETF 成分股
        await self.download_sz_etf_component(df, http_session, db_session)
        self.logger.info('深市 ETF 成分股抓取完成...')
コード例 #4
0
    async def fetch_cni_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        抓取国证指数成分股
        """
        query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                continue

            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'

            try:
                response_df = response_df[['样本股代码']]
            except KeyError:
                response_df = response_df[['证券代码']]

            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df.columns = ['stock_code']
            response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
コード例 #5
0
    async def fetch_szse_index(self, http_session, db_session) -> None:
        """
        抓取深证指数列表
        """
        url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1812_zs&TABKEY=tab1'
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            return

        df = pd.read_excel(io.BytesIO(content), dtype='str')
        df.columns = ['code', 'name', 'timestamp', 'base_point', 'list_date']
        df['category'] = 'szse'
        df = df.loc[df['code'].str.contains(r'^\d{6}$')]
        await self.persist_index(df)
        self.logger.info('深证指数列表抓取完成...')

        # 抓取深证指数成分股
        await self.fetch_szse_index_component(df, http_session, db_session)
        self.logger.info('深证指数成分股抓取完成...')
コード例 #6
0
    async def fetch_csi_index(self, http_session, db_session) -> None:
        """
        抓取上证、中证指数列表
        """
        url = 'http://www.csindex.com.cn/zh-CN/indices/index' \
              '?page={}&page_size={}&data_type=json&class_1=1&class_2=2&class_7=7&class_10=10'

        index_list = []
        page = 1
        page_size = 50

        while True:
            query_url = url.format(page, page_size)
            text = sync_get(http_session, query_url, return_type='text')
            if text is None:
                continue

            response_dict = demjson.decode(text)
            response_index_list = response_dict.get('list', [])

            if len(response_index_list) == 0:
                break

            index_list.extend(response_index_list)

            self.logger.info(f'上证、中证指数第 {page} 页抓取完成...')
            page += 1
            await self.sleep()

        df = pd.DataFrame(index_list)
        df = df[['base_date', 'base_point', 'index_code', 'indx_sname', 'online_date', 'class_eseries']].copy()
        df.columns = ['timestamp', 'base_point', 'code', 'name', 'list_date', 'class_eseries']
        df['category'] = df['class_eseries'].apply(lambda x: x.split(' ')[0].lower())
        df = df.drop('class_eseries', axis=1)
        df = df.loc[df['code'].str.contains(r'^\d{6}$')]

        await self.persist_index(df)
        self.logger.info('上证、中证指数列表抓取完成...')

        # 抓取上证、中证指数成分股
        await self.fetch_csi_index_component(df, http_session, db_session)
        self.logger.info('上证、中证指数成分股抓取完成...')
コード例 #7
0
    async def fetch_csi_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        抓取上证、中证指数成分股
        """
        query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

        for _, index in df.iterrows():
            index_code = index['code']
            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误')
                continue

            response_df = pd.read_excel(io.BytesIO(content))

            response_df = response_df[['成分券代码Constituent Code', '成分券名称Constituent Name']].rename(
                columns={'成分券代码Constituent Code': 'stock_code',
                         '成分券名称Constituent Name': 'stock_name'})

            index_id = f'index_cn_{index_code}'
            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
コード例 #8
0
    async def fetch_szse_index_component(self, df: pd.DataFrame, http_session, db_session):
        """
        抓取深证指数成分股
        """
        query_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                continue

            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'
            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df.rename(columns={'证券代码': 'stock_code', '证券简称': 'stock_name'}, inplace=True)
            response_df['stock_id'] = response_df['stock_code'].apply(lambda x: china_stock_code_to_id(str(x)))

            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            await self.sleep()
コード例 #9
0
    async def fetch_cni_index(self, http_session) -> None:
        """
        抓取国证指数列表
        """
        url = 'http://www.cnindex.com.cn/zstx/jcxl/'
        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return

        dfs = pd.read_html(text)

        # 第 9 个 table 之后为非股票指数
        dfs = dfs[1:9]

        result_df = pd.DataFrame()
        for df in dfs:
            header = df.iloc[0]
            df = df[1:]
            df.columns = header
            df.astype('str')

            result_df = pd.concat([result_df, df])

        result_df = result_df.drop('样本股数量', axis=1)
        result_df.columns = ['name', 'code', 'timestamp', 'base_point', 'list_date']
        result_df['timestamp'] = result_df['timestamp'].apply(lambda x: x.replace('-', ''))
        result_df['list_date'] = result_df['list_date'].apply(lambda x: x.replace('-', ''))
        result_df['category'] = 'csi'
        result_df = result_df.loc[result_df['code'].str.contains(r'^\d{6}$')]

        await self.persist_index(result_df)
        self.logger.info('国证指数列表抓取完成...')

        # 抓取国证指数成分股
        await self.fetch_cni_index_component(result_df, http_session)
        self.logger.info('国证指数成分股抓取完成...')
コード例 #10
0
    async def download_sz_etf_component(self, df: pd.DataFrame, http_session,
                                        db_session):
        query_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

        self.parse_sz_etf_underlying_index(df)
        for _, etf in df.iterrows():
            underlying_index = etf['拟合指数']
            etf_code = etf['证券代码']

            if len(underlying_index) == 0:
                self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
                continue

            url = query_url.format(underlying_index)
            text = sync_get(http_session,
                            url,
                            encoding='gbk',
                            return_type='text')
            if text is None:
                continue

            try:
                dfs = pd.read_html(text, header=1)
            except ValueError as error:
                self.logger.error(
                    f'HTML parse error: {error}, response: {text}')
                continue

            if len(dfs) < 4:
                continue

            response_df = dfs[3].copy()
            response_df = response_df.dropna(axis=1, how='any')
            response_df['品种代码'] = response_df['品种代码'].apply(
                lambda x: f'{x:06d}')

            etf_id = f'etf_sz_{etf_code}'
            response_df = response_df[['品种代码', '品种名称']].copy()
            response_df.rename(columns={
                '品种代码': 'stock_code',
                '品种名称': 'stock_name'
            },
                               inplace=True)

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = ChnExchange.SZSE.value
            response_df['code'] = etf_code
            response_df['name'] = etf['证券简称']
            response_df['timestamp'] = now_pd_timestamp(self.region)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda code: china_stock_code_to_id(code))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{etf_id}_{x}')

            await df_to_db(region=self.region,
                           provider=self.provider,
                           data_schema=self.data_schema,
                           db_session=db_session,
                           df=response_df)
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')