Example #1
0
    def run(self):
        """
        Download the ETF lists and ETF constituents of both exchanges.

        Shanghai is processed first, then Shenzhen. The two exchanges are
        independent of each other: when the list download of one exchange
        fails, an error is logged and the other exchange is still processed.

        :return: None
        """
        http_session = get_http_session(self.mode)

        # Fetch the Shanghai ETF list
        url = 'http://query.sse.com.cn/commonQuery.do?sqlId=COMMON_SSE_ZQPZ_ETFLB_L_NEW'
        text = sync_get(http_session,
                        url,
                        headers=DEFAULT_SH_ETF_LIST_HEADER,
                        return_type='text')
        if text is None:
            # Do not return here: a Shanghai failure says nothing about the
            # Shenzhen endpoint, which is a different site.
            self.logger.error('Shanghai ETF list download failed, skipping Shanghai')
        else:
            response_dict = demjson.decode(text)

            df = pd.DataFrame(response_dict.get('result', []))
            self.persist_etf_list(df, exchange='sh')
            self.logger.info('沪市 ETF 列表抓取完成...')

            # Fetch the Shanghai ETF constituents
            self.download_sh_etf_component(df, http_session)
            self.logger.info('沪市 ETF 成分股抓取完成...')

        # Fetch the Shenzhen ETF list (served as an xlsx report)
        url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1945'
        content = sync_get(http_session, url, return_type='content')
        if content is None:
            self.logger.error('Shenzhen ETF list download failed, skipping Shenzhen')
            return

        df = pd.read_excel(io.BytesIO(content), dtype=str)
        self.persist_etf_list(df, exchange='sz')
        self.logger.info('深市 ETF 列表抓取完成...')

        # Fetch the Shenzhen ETF constituents
        self.download_sz_etf_component(df, http_session)
        self.logger.info('深市 ETF 成分股抓取完成...')
Example #2
0
    def populate_sh_etf_type(self, df: pd.DataFrame, http_session):
        """
        Add an ``ETF_TYPE`` column to the Shanghai ETF list.

        The SSE ETF info endpoint is queried once per ETF class (1 and 2) and
        the ``etftype`` of every reported fund is attached to the row whose
        ``FUND_ID`` equals the fund's ``fundid1``. Matching is done by fund id
        rather than by row position, so a fund missing from either side can
        no longer shift the types of all following funds.

        :param df: ETF list data; must contain a ``FUND_ID`` column
        :param http_session: session object handed to ``sync_get``
        :return: copy of ``df`` sorted by ``FUND_ID`` (index reset) with an
                 ``ETF_TYPE`` column; funds the endpoint did not report get NaN
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryETFNewAllInfo.do?' \
                    'isPagination=false&type={}&pageHelp.pageSize=25'

        type_frames = []
        for etf_class in [1, 2]:
            url = query_url.format(etf_class)
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            response_dict = demjson.decode(text)
            response_df = pd.DataFrame(response_dict.get('result', []))
            # A frame built from an empty result has no columns at all, so
            # selecting them below would raise KeyError.
            if response_df.empty:
                continue
            type_frames.append(response_df[['fundid1', 'etftype']])

        result_df = df.copy()
        result_df = result_df.sort_values(by='FUND_ID').reset_index(drop=True)

        if not type_frames:
            self.logger.error('no ETF type data downloaded, ETF_TYPE left empty')
            result_df['ETF_TYPE'] = float('nan')
            return result_df

        type_df = pd.concat(type_frames)
        # Lookup table fund id -> etf type. Ids are compared as strings;
        # NOTE(review): assumes fundid1 and FUND_ID carry the same fund code -
        # confirm against the two endpoints.
        type_by_fund = pd.Series(type_df['etftype'].values,
                                 index=type_df['fundid1'].astype(str).values)
        # Series.map needs a unique index
        type_by_fund = type_by_fund[~type_by_fund.index.duplicated(keep='first')]

        result_df['ETF_TYPE'] = result_df['FUND_ID'].astype(str).map(type_by_fund)
        if len(result_df) > 0 and result_df['ETF_TYPE'].isna().all():
            self.logger.error('no FUND_ID matched any fundid1, ETF_TYPE is empty')

        return result_df
Example #3
0
    def fetch_szse_index_component(self, df: pd.DataFrame, http_session):
        """
        Download and store the constituent stocks of every SZSE index.

        For each row of ``df`` the xlsx constituent report is downloaded,
        enriched with the index identity columns and written with
        ``df_to_db``. Rows whose download fails are skipped.

        :param df: index list; every row supplies ``code`` and ``name``
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url_template = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'
        renamed_columns = {'证券代码': 'stock_code', '证券简称': 'stock_name'}

        for _, row in df.iterrows():
            index_code = row['code']

            content = sync_get(http_session,
                               url_template.format(index_code),
                               return_type='content')
            if content is None:
                continue

            components = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'

            def to_stock_id(stock_code):
                return china_stock_code_to_id(str(stock_code))

            def to_row_id(stock_id):
                return f'{index_id}_{stock_id}'

            # Identity of the index this constituent list belongs to
            components['entity_id'] = index_id
            components['entity_type'] = EntityType.Index.value
            components['exchange'] = 'cn'
            components['code'] = index_code
            components['name'] = row['name']
            components['timestamp'] = now_pd_timestamp(self.region)

            components.rename(columns=renamed_columns, inplace=True)
            components['stock_id'] = components['stock_code'].apply(to_stock_id)
            components['id'] = components['stock_id'].apply(to_row_id)

            df_to_db(df=components,
                     ref_df=None,
                     region=self.region,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Fetch the stocks that belong to a category entity.

        Pages 1 to 4 of ``self.category_stocks_url`` are requested and the
        rows of all pages are merged into one frame. Previously the method
        returned as soon as the first page produced rows, so later pages were
        never downloaded. Paging stops early when a request fails or the
        endpoint answers with the literal text ``null``.

        ``start``, ``end``, ``size`` and ``timestamps`` are not used here.

        :param entity: category entity; its ``code`` is placed in the URL
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame with ``stock_id``, ``stock_code`` and
                 ``stock_name`` columns, or None when nothing was fetched
        """
        stocks = []
        for page in range(1, 5):
            text = sync_get(http_session,
                            self.category_stocks_url.format(page, entity.code),
                            return_type='text')
            if text is None or text == 'null':
                break
            category_jsons = demjson.decode(text)

            # "or []" tolerates a payload that decodes to None
            for category in category_jsons or []:
                stock_code = category['code']
                stocks.append({
                    'stock_id': china_stock_code_to_id(stock_code),
                    'stock_code': stock_code,
                    'stock_name': category['name'],
                })

        if stocks:
            return pd.DataFrame.from_records(stocks)
        return None
Example #5
0
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download the k-data of one ETF and tag it with the recorder level.

        ``end`` and ``timestamps`` are not used here.

        :param entity: security whose ``exchange`` and ``code`` go in the URL
        :param start: when None, the maximum row count is requested
        :param size: wanted row count; replaced by 8000 when it exceeds
                     ``self.default_size``
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame of the decoded response plus a ``level`` column,
                 or None when the request fails or the response is empty
        """
        # This URL does not support paging, so when more rows are wanted than
        # the default size, the only option is to ask for the maximum count.
        if start is None or size > self.default_size:
            size = 8000

        level = self.level.value

        query_url = ChinaETFDayKdataRecorder.url.format(entity.exchange,
                                                        entity.code, size)
        text = sync_get(http_session, query_url, return_type='text')
        if text is None:
            return None

        response_json = demjson.decode(text)
        if response_json is not None and len(response_json) > 0:
            kdata = pd.DataFrame(response_json)
            kdata['level'] = level
            return kdata
        return None
Example #6
0
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Fetch the stocks that belong to a category entity.

        Each item returned by ``json_callback_param`` is a comma-separated
        string; field 1 is used as the stock code and field 2 as the stock
        name. ``start``, ``end``, ``size`` and ``timestamps`` are not used.

        :param entity: category entity; its ``code`` is placed in the URL
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame with ``stock_id``, ``stock_code`` and
                 ``stock_name`` columns, or None when the request fails or
                 no rows were parsed
        """
        query_url = self.category_stocks_url.format(entity.code, '1')
        text = sync_get(http_session, query_url, return_type='text')
        if text is None:
            return None

        rows = []
        for raw_line in json_callback_param(text):
            fields = raw_line.split(',')
            code = fields[1]
            rows.append({
                'stock_id': china_stock_code_to_id(code),
                'stock_code': code,
                'stock_name': fields[2],
            })

        if not rows:
            # The pause only happens on the empty path, as before
            self.sleep()
            return None

        return pd.DataFrame.from_records(rows)
Example #7
0
    def refresh_token(self, http_session):
        """
        Request a new access token and store it in ``self.token``.

        ``self.token`` is only updated when the response has status 200 and
        its JSON body is an object containing ``access_token``; every other
        outcome is logged as an error and leaves ``self.token`` unchanged.

        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        resp = sync_get(http_session, self.GET_TOKEN_URL)
        # Other callers in this file check sync_get's result for None, so a
        # failed request must not be dereferenced here.
        if resp is None:
            self.logger.error("could not refresh_token")
            return

        self.logger.info("refresh_token resp.status_code:{}, resp.text:{}".format(resp.status_code, resp.text))

        payload = None
        if resp.status_code == 200:
            try:
                # Parse the body once instead of on every access
                payload = resp.json()
            except ValueError:
                payload = None

        if isinstance(payload, dict) and 'access_token' in payload:
            self.token = payload['access_token']
        else:
            # logger.exception is only meaningful inside an except block;
            # here there is no active exception to attach.
            self.logger.error("could not refresh_token")
Example #8
0
    def download_sh_etf_component(self, df: pd.DataFrame, http_session):
        """
        Download and store the constituent stocks of Shanghai ETFs.

        ETF_CLASS => 1. single-market ETF 2. cross-market ETF
                     3. cross-border ETF 5. bond ETF 6. gold ETF

        Only ETFs of class '1' and '2' are processed. An ETF whose request
        fails, or whose response lacks the constituent columns, is skipped so
        that one bad response does not abort the remaining ETFs.

        :param df: ETF list data with ``ETF_CLASS``, ``FUND_ID`` and
                   ``FUND_NAME`` columns
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                    'isPagination=false&type={}&etfClass={}'
        wanted_columns = ['instrumentId', 'instrumentName']

        etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
        etf_df = self.populate_sh_etf_type(etf_df, http_session)

        for _, etf in etf_df.iterrows():
            url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
            text = sync_get(http_session,
                            url,
                            headers=DEFAULT_SH_ETF_LIST_HEADER,
                            return_type='text')
            if text is None:
                continue
            response_dict = demjson.decode(text)
            response_df = pd.DataFrame(response_dict.get('result', []))

            etf_code = etf['FUND_ID']
            etf_id = f'etf_sh_{etf_code}'

            # An empty result yields a frame without columns; selecting the
            # constituent columns would raise KeyError and end the whole loop.
            if not set(wanted_columns).issubset(response_df.columns):
                self.logger.error(
                    f'{etf["FUND_NAME"]} - {etf_code} no constituent data in response, skipped')
                self.sleep()
                continue

            response_df = response_df[wanted_columns].copy()
            response_df.rename(columns={
                'instrumentId': 'stock_code',
                'instrumentName': 'stock_name'
            },
                               inplace=True)

            response_df['entity_id'] = etf_id
            response_df['entity_type'] = EntityType.ETF.value
            response_df['exchange'] = 'sh'
            response_df['code'] = etf_code
            response_df['name'] = etf['FUND_NAME']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda code: china_stock_code_to_id(code))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{etf_id}_{x}')

            df_to_db(df=response_df,
                     ref_df=None,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')

            self.sleep()
Example #9
0
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download quarterly k-data pages for one security and merge them.

        The quarters between ``start`` and now are requested one by one. For
        each page the fifth HTML table is taken, its first row is dropped and
        its columns are renamed. ``end``, ``size`` and ``timestamps`` are not
        used here.

        :param entity: security whose ``code`` goes in the URL
        :param start: first date of interest
        :param http_session: session object handed to ``sync_get``
        :return: merged DataFrame with a ``level`` column, or None when no
                 data was collected
        """
        quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
        # Skip the first quarter unless start is the entity's own timestamp
        if not is_same_date(entity.timestamp, start) and len(quarters) > 1:
            quarters = quarters[1:]

        level = self.level.value
        kdata_columns = [
            'timestamp', 'open', 'high', 'close', 'low', 'volume',
            'turnover'
        ]

        merged = pd.DataFrame()
        for year, quarter in quarters:
            text = sync_get(http_session,
                            self.url.format(entity.code, year, quarter),
                            encoding='gbk',
                            return_type='text')
            if text is None:
                continue

            try:
                tables = pd.read_html(text)
            except ValueError as error:
                self.logger.error(
                    f'skip ({year}-{quarter:02d}){entity.code}{entity.name}({error})'
                )
                self.sleep()
                continue

            # Pages with fewer than five tables carry no k-data table
            if len(tables) >= 5:
                quarter_df = tables[4].copy().iloc[1:]
                quarter_df.columns = kdata_columns
                merged = pd.concat([merged, quarter_df])

            self.sleep()

        if not pd_is_not_null(merged):
            return None

        merged['level'] = level
        return merged
Example #10
0
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download the daily market summary for the given timestamps.

        One request is made per timestamp. From each response the single
        entry with ``productType == '1'`` is converted into a record. As soon
        as more than ``self.batch_size`` records are collected they are
        returned; otherwise the records collected so far are returned at the
        end. Both paths now build the frame the same way, so the final
        partial batch also carries ``entity_id``, ``provider``, ``name`` and
        a datetime ``timestamp``.

        ``start``, ``end`` and ``size`` are not used here.

        :param entity: entity whose ``id`` is written to ``entity_id``
        :param timestamps: iterable of dates to request
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame of summary records, or None when nothing was
                 collected
        """

        def build_frame(records):
            # Shared by both return paths so every batch has the same columns
            frame = pd.DataFrame.from_records(records)
            frame['entity_id'] = entity.id
            frame['provider'] = Provider.Exchange.value
            frame['timestamp'] = pd.to_datetime(frame['timestamp'])
            frame['name'] = '上证指数'
            return frame

        json_results = []
        for timestamp in timestamps:
            timestamp_str = to_time_str(timestamp)
            url = self.url.format(timestamp_str)
            text = sync_get(http_session,
                            url=url,
                            headers=DEFAULT_SH_SUMMARY_HEADER,
                            return_type='text')
            if text is None:
                continue

            # The payload is wrapped as callback(...); decode what is between
            # the first "(" and the first ")".
            results = demjson.decode(text[text.index("(") +
                                          1:text.index(")")])['result']
            result = [
                result for result in results if result['productType'] == '1'
            ]
            if result and len(result) == 1:
                result_json = result[0]
                # Some older dates have no data; those fields default to 0.0
                json_results.append({
                    'timestamp':
                    timestamp,
                    'pe':
                    to_float(result_json['profitRate'], 0.0),
                    'total_value':
                    to_float(result_json['marketValue1'] + '亿', 0.0),
                    'total_tradable_vaule':
                    to_float(result_json['negotiableValue1'] + '亿', 0.0),
                    'volume':
                    to_float(result_json['trdVol1'] + '万', 0.0),
                    'turnover':
                    to_float(result_json['trdAmt1'] + '亿', 0.0),
                    'turnover_rate':
                    to_float(result_json['exchangeRate'], 0.0),
                })

                if len(json_results) > self.batch_size:
                    return build_frame(json_results)

        if len(json_results) > 0:
            return build_frame(json_results)
        return None
Example #11
0
    def fetch_csi_index(self, http_session) -> None:
        """
        Download the SSE / CSI index list, store it, then fetch constituents.

        The list is paged; paging ends on the first empty page. A failed page
        request is retried a limited number of times (with a pause) instead
        of being retried forever, after which the pages collected so far are
        used.

        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url = 'http://www.csindex.com.cn/zh-CN/indices/index' \
              '?page={}&page_size={}&data_type=json&class_1=1&class_2=2&class_7=7&class_10=10'

        index_list = []
        page = 1
        page_size = 50
        max_failures = 3
        failures = 0

        while True:
            query_url = url.format(page, page_size)
            text = sync_get(http_session, query_url, return_type='text')
            if text is None:
                # A bare "continue" here re-requested the same page in a
                # tight endless loop whenever the site was unreachable.
                failures += 1
                if failures >= max_failures:
                    self.logger.error(
                        f'CSI index list page {page} failed {failures} times, stop paging')
                    break
                self.sleep()
                continue
            failures = 0

            response_dict = demjson.decode(text)
            response_index_list = response_dict.get('list', [])

            if len(response_index_list) == 0:
                break

            index_list.extend(response_index_list)

            self.logger.info(f'上证、中证指数第 {page} 页抓取完成...')
            page += 1
            self.sleep()

        if not index_list:
            # An empty frame has no columns; the selection below would raise
            self.logger.error('no CSI index data downloaded')
            return

        df = pd.DataFrame(index_list)
        df = df[[
            'base_date', 'base_point', 'index_code', 'indx_sname',
            'online_date', 'class_eseries'
        ]].copy()
        df.columns = [
            'timestamp', 'base_point', 'code', 'name', 'list_date',
            'class_eseries'
        ]
        # The category is the first word of class_eseries, lower-cased
        df['category'] = df['class_eseries'].apply(
            lambda x: x.split(' ')[0].lower())
        df = df.drop('class_eseries', axis=1)
        # Keep only rows whose code is exactly six digits
        df = df.loc[df['code'].str.contains(r'^\d{6}$')]

        self.persist_index(df)
        self.logger.info('上证、中证指数列表抓取完成...')

        # Fetch the SSE / CSI index constituents
        self.fetch_csi_index_component(df, http_session)
        self.logger.info('上证、中证指数成分股抓取完成...')
Example #12
0
    def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
        """
        Download and store the constituent stocks of every SSE / CSI index.

        For each row of ``df`` the constituent xls file is downloaded, reduced
        to the code and name columns, enriched with the index identity
        columns and written with ``df_to_db``. A failed download is logged
        and skipped.

        :param df: index list; every row supplies ``code`` and ``name``
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url_template = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
        source_to_target = {
            '成分券代码Constituent Code': 'stock_code',
            '成分券名称Constituent Name': 'stock_name'
        }

        for _, row in df.iterrows():
            index_code = row['code']
            content = sync_get(http_session,
                               url_template.format(index_code),
                               return_type='content')
            if content is None:
                self.logger.error(f'{row["name"]} - {index_code} 成分股抓取错误')
                continue

            sheet = pd.read_excel(io.BytesIO(content))
            components = sheet[[
                '成分券代码Constituent Code', '成分券名称Constituent Name'
            ]].rename(columns=source_to_target)

            index_id = f'index_cn_{index_code}'

            def to_stock_id(stock_code):
                return china_stock_code_to_id(str(stock_code))

            def to_row_id(stock_id):
                return f'{index_id}_{stock_id}'

            # Identity of the index this constituent list belongs to
            components['entity_id'] = index_id
            components['entity_type'] = EntityType.Index.value
            components['exchange'] = 'cn'
            components['code'] = index_code
            components['name'] = row['name']
            components['timestamp'] = now_pd_timestamp(self.region)

            components['stock_id'] = components['stock_code'].apply(to_stock_id)
            components['id'] = components['stock_id'].apply(to_row_id)

            df_to_db(df=components,
                     ref_df=None,
                     region=self.region,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{row["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download the money-flow history of a block entity.

        Every item of the response becomes one record carrying the entity
        name, the date, price, change, turnover rate and the net inflow
        figures. ``start``, ``end`` and ``timestamps`` are not used here.

        :param entity: block entity; ``category``, ``code`` and ``name`` are read
        :param size: number of rows requested from the endpoint
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame of money-flow records, or None when the request
                 fails or the response is empty
        """
        url = self.generate_url(category=entity.category,
                                code=entity.code,
                                number=size)
        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return None

        # NOTE(review): SECURITY - eval() executes whatever text the remote
        # endpoint returns. A compromised or spoofed response can run
        # arbitrary code in this process. Replace with a real parser once the
        # exact payload format is confirmed.
        json_list = eval(text)
        if len(json_list) == 0:
            return None

        result_list = [
            {
                'name': entity.name,
                'timestamp': to_pd_timestamp(item['opendate']),
                'close': to_float(item['avg_price']),
                'change_pct': to_float(item['avg_changeratio']),
                'turnover_rate': to_float(item['turnover']) / 10000,
                'net_inflows': to_float(item['netamount']),
                'net_inflow_rate': to_float(item['ratioamount']),
                'net_main_inflows': to_float(item['r0_net']),
                'net_main_inflow_rate': to_float(item['r0_ratio'])
            }
            for item in json_list
        ]

        if len(result_list) > 0:
            return pd.DataFrame.from_records(result_list)
        return None
Example #14
0
    def process_loop(self, entity, http_session):
        """
        Download, format and persist the list for one exchange.

        Nothing is done when no URL is configured for ``entity``, when the
        request fails, or when the response status is not 200.

        :param entity: key into ``self.category_map_url`` and
                       ``self.category_map_header``
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url = self.category_map_url.get(entity, None)
        if url is None:
            return

        resp = sync_get(http_session,
                        url,
                        encoding='GB2312',
                        headers=self.category_map_header[entity])
        # Other callers in this file check sync_get's result for None; guard
        # before reading the status code.
        if resp is None or resp.status_code != 200:
            return

        df = self.format(resp=resp, exchange=entity)

        if pd_is_not_null(df):
            self.persist(df)

        return None
Example #15
0
    def process_loop(self, entity, http_session):
        """
        Download, format and persist the stock list of one exchange.

        The rows are read from ``data.rows`` of the JSON response. Nothing is
        persisted when the request fails or when the response carries no
        rows (including a missing or null ``data`` / ``rows`` member).

        :param entity: exchange name sent as the ``exchange`` query parameter
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url = 'https://api.nasdaq.com/api/screener/stocks'
        params = {'download': 'true', 'exchange': entity}
        resp = sync_get(http_session,
                        url,
                        headers=YAHOO_STOCK_LIST_HEADER,
                        params=params,
                        enable_proxy=False)
        if resp is None:
            return

        # Walk the payload defensively: indexing ['data']['rows'] directly
        # raises when either member is absent or null.
        payload = resp.json()
        data = payload.get('data') if isinstance(payload, dict) else None
        rows = data.get('rows') if isinstance(data, dict) else None

        if rows:
            df = self.format(content=rows, exchange=entity)
            self.persist(df)

        return None
Example #16
0
    def fetch_cni_index_component(self, df: pd.DataFrame, http_session):
        """
        Download and store the constituent stocks of every CNI index.

        For each row of ``df`` the constituent xls file is downloaded and its
        stock-code column (named either '样本股代码' or '证券代码') is kept as
        ``stock_code``. The frame is then enriched with the index identity
        columns and written with ``df_to_db``.

        :param df: index list; every row supplies ``code`` and ``name``
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)
            content = sync_get(http_session, url, return_type='content')
            if content is None:
                continue

            response_df = pd.read_excel(io.BytesIO(content), dtype='str')

            index_id = f'index_cn_{index_code}'

            try:
                response_df = response_df[['样本股代码']]
            except KeyError:
                response_df = response_df[['证券代码']]

            # Rename while the frame still has exactly one column. Doing this
            # after the identity columns were added raised a length-mismatch
            # ValueError. The copy detaches the frame from the sheet it was
            # sliced from before columns are added.
            response_df = response_df.copy()
            response_df.columns = ['stock_code']

            response_df['entity_id'] = index_id
            response_df['entity_type'] = EntityType.Index.value
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp(Region.CHN)

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda x: china_stock_code_to_id(str(x)))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            df_to_db(df=response_df,
                     ref_df=None,
                     region=self.region,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Example #17
0
    def fetch_szse_index(self, http_session) -> None:
        """
        Download the SZSE index list, store it, then fetch constituents.

        The xlsx report is read with all cells as strings, its five columns
        are renamed, and only rows with a six-digit code are kept. Nothing is
        done when the download fails.

        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        list_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1812_zs&TABKEY=tab1'
        raw = sync_get(http_session, list_url, return_type='content')
        if raw is None:
            return

        index_df = pd.read_excel(io.BytesIO(raw), dtype='str')
        index_df.columns = ['code', 'name', 'timestamp', 'base_point', 'list_date']
        index_df['category'] = 'szse'

        has_six_digit_code = index_df['code'].str.contains(r'^\d{6}$')
        index_df = index_df.loc[has_six_digit_code]

        self.persist_index(index_df)
        self.logger.info('深证指数列表抓取完成...')

        # Fetch the SZSE index constituents
        self.fetch_szse_index_component(index_df, http_session)
        self.logger.info('深证指数成分股抓取完成...')
Example #18
0
    def fetch_cumulative_net_value(self, security_item, start, end,
                                   http_session) -> pd.DataFrame:
        """
        Download the net-value history of a fund, page by page.

        Pages of 200 rows are requested until a request fails, a response
        cannot be decoded into the expected ``Data.LSJZList`` shape, or a page
        is empty. ``FSRQ`` is parsed as a date and becomes the index;
        ``JZZZL`` and ``LJJZ`` are converted to numbers, and missing values
        are filled with 0.

        :param security_item: fund whose ``code`` goes in the URL
        :param start: first date of the requested range
        :param end: last date of the requested range
        :param http_session: session object handed to ``sync_get``
        :return: all pages concatenated, indexed by ``FSRQ``; an empty
                 DataFrame when nothing was downloaded
        """
        query_url = 'http://api.fund.eastmoney.com/f10/lsjz?' \
                    'fundCode={}&pageIndex={}&pageSize=200&startDate={}&endDate={}'

        page = 1
        pages = []
        while True:
            url = query_url.format(security_item.code, page,
                                   to_time_str(start), to_time_str(end))
            text = sync_get(http_session,
                            url,
                            headers=EASTMONEY_ETF_NET_VALUE_HEADER,
                            return_type='text')
            if text is None:
                break

            try:
                response_json = demjson.decode(text)
                response_df = pd.DataFrame(response_json['Data']['LSJZList'])
            except Exception as error:
                # Was a bare "except:", which also swallowed KeyboardInterrupt
                # and SystemExit. An undecodable page still ends the paging.
                self.logger.info(
                    f'{security_item.code} net value paging stopped at page {page}: {error}')
                break

            # Last page
            if not pd_is_not_null(response_df):
                break

            response_df['FSRQ'] = pd.to_datetime(response_df['FSRQ'])
            response_df['JZZZL'] = pd.to_numeric(response_df['JZZZL'],
                                                 errors='coerce')
            response_df['LJJZ'] = pd.to_numeric(response_df['LJJZ'],
                                                errors='coerce')
            response_df = response_df.fillna(0)
            response_df.set_index('FSRQ', inplace=True, drop=True)

            pages.append(response_df)
            page += 1

            self.sleep()

        # Concatenate once instead of re-copying the growing frame per page
        if not pages:
            return pd.DataFrame()
        return pd.concat(pages)
Example #19
0
    def fetch_cni_index(self, http_session) -> None:
        """
        Download the CNI index list, store it, then fetch constituents.

        Tables 1 to 8 of the page are used; each table's first row is its
        header. The tables are merged, the '样本股数量' column is dropped, the
        dashes are removed from the two date columns and only rows with a
        six-digit code are kept.

        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url = 'http://www.cnindex.com.cn/zstx/jcxl/'
        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return

        dfs = pd.read_html(text)

        # Tables from the 9th onward are not stock indices
        dfs = dfs[1:9]

        frames = []
        for df in dfs:
            header = df.iloc[0]
            df = df[1:]
            df.columns = header
            # astype returns a new frame; the result used to be discarded, so
            # the cast never took effect.
            frames.append(df.astype('str'))

        if not frames:
            self.logger.error('no CNI index table found in the page')
            return

        result_df = pd.concat(frames)

        result_df = result_df.drop('样本股数量', axis=1)
        result_df.columns = [
            'name', 'code', 'timestamp', 'base_point', 'list_date'
        ]
        result_df['timestamp'] = result_df['timestamp'].apply(
            lambda x: x.replace('-', ''))
        result_df['list_date'] = result_df['list_date'].apply(
            lambda x: x.replace('-', ''))
        result_df['category'] = 'csi'
        result_df = result_df.loc[result_df['code'].str.contains(r'^\d{6}$')]

        self.persist_index(result_df)
        self.logger.info('国证指数列表抓取完成...')

        # Fetch the CNI index constituents
        self.fetch_cni_index_component(result_df, http_session)
        self.logger.info('国证指数成分股抓取完成...')
    def process_loop(self, entity, http_session):
        """
        Download and store the blocks of one category.

        The response text contains one JSON object mapping block code to a
        comma-separated string whose second field is the block name. Every
        entry becomes one block record written with ``df_to_db``.

        :param entity: category; key into ``self.category_map_url``, and its
                       ``value`` is stored in the ``category`` column
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        text = sync_get(http_session,
                        self.category_map_url[entity],
                        encoding='gbk',
                        return_type='text')
        if text is None:
            return

        # Cut the JSON object out of the surrounding text
        json_str = text[text.index('{'):text.index('}') + 1]
        tmp_json = json.loads(json_str)

        # Plain Python on purpose: this code takes a dict, formats strings and
        # reads Python objects, none of which numba's nopython mode can
        # compile, so the former @njit decorator failed at call time.
        the_list = []
        for code, raw_value in tmp_json.items():
            name = raw_value.split(',')[1]
            entity_id = f'block_cn_{code}'
            the_list.append({
                'id': entity_id,
                'entity_id': entity_id,
                'entity_type': EntityType.Block.value,
                'exchange': 'cn',
                'code': code,
                'name': name,
                'category': entity.value
            })

        if the_list:
            df = pd.DataFrame.from_records(the_list)
            df_to_db(df=df,
                     ref_df=None,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)

        self.logger.info(f"finish record sina blocks:{entity.value}")
Example #21
0
    def process_loop(self, entity, http_session):
        """
        Download and store the blocks of one category.

        Each item returned by ``json_callback_param`` is a comma-separated
        string; field 1 is used as the block code and field 2 as the block
        name. Every item becomes one block record written with ``df_to_db``.

        :param entity: ``(category, url)`` pair; ``category.value`` is stored
                       in the ``category`` column
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        category, url = entity

        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return

        results = json_callback_param(text)

        # Plain Python on purpose: building dicts of strings from Python
        # objects cannot be compiled by numba's nopython mode, so the former
        # @njit decorator failed at call time.
        the_list = []
        for result in results:
            items = result.split(',')
            code = items[1]
            name = items[2]
            entity_id = f'block_cn_{code}'
            the_list.append({
                'id': entity_id,
                'entity_id': entity_id,
                'entity_type': EntityType.Block.value,
                'exchange': 'cn',
                'code': code,
                'name': name,
                'category': category.value
            })

        if the_list:
            df = pd.DataFrame.from_records(the_list)
            df_to_db(df=df,
                     ref_df=None,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
        self.logger.info(f"finish record sina blocks:{category.value}")
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download k-lines for one entity and convert them to records.

        Each k-line is a comma-separated string in the order
        time, open, close, high, low, volume, turnover. The last k-line is
        dropped. ``start``, ``end`` and ``timestamps`` are not used here.

        :param entity: entity whose ``code`` goes in the URL
        :param size: number of k-lines requested
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame of k-data records, or None when the request fails
                 or no record was produced
        """
        url = self.url.format(
            "{}".format(entity.code), level_flag(self.level), size,
            now_time_str(region=Region.CHN, fmt=TIME_FORMAT_DAY1))
        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return None

        results = json_callback_param(text)

        if results:
            klines = results['data']['klines']

            # Plain Python on purpose: the loop calls the non-jitted to_float
            # and builds dicts mixing str and float values, which numba's
            # nopython mode cannot compile, so the former @njit decorator
            # failed at call time.
            kdatas = []
            # TODO: ignore the last unfinished kdata now,could control it better if need
            for result in klines[:-1]:
                # "2000-01-28,1005.26,1012.56,1173.12,982.13,3023326,3075552000.00"
                # time,open,close,high,low,volume,turnover
                fields = result.split(',')
                kdatas.append(
                    dict(timestamp=fields[0],
                         open=to_float(fields[1]),
                         close=to_float(fields[2]),
                         high=to_float(fields[3]),
                         low=to_float(fields[4]),
                         volume=to_float(fields[5]),
                         turnover=to_float(fields[6])))

            if len(kdatas) > 0:
                df = pd.DataFrame.from_records(kdatas)
                return df
        return None
    def record(self, entity, start, end, size, timestamps, http_session):
        """
        Download the money-flow history of one stock.

        Every item of the response becomes one record with the date, price,
        change, turnover rate, the net inflows per order-size bucket and,
        when the total amount is not zero, the matching inflow rates.
        ``start``, ``end`` and ``timestamps`` are not used here.

        Order-size buckets as used below: r0 = huge, r1 = big, r2 = medium,
        r3 = small; main = huge + big.

        :param entity: stock whose ``exchange`` and ``code`` form the URL code
        :param size: number of rows requested from the endpoint
        :param http_session: session object handed to ``sync_get``
        :return: DataFrame of money-flow records, or None when the request
                 fails or the response is empty
        """
        url = self.generate_url(code='{}{}'.format(entity.exchange, entity.code),
                                number=size)
        text = sync_get(http_session, url, return_type='text')
        if text is None:
            return None

        # NOTE(review): SECURITY - eval() executes whatever text the remote
        # endpoint returns. A compromised or spoofed response can run
        # arbitrary code in this process. Replace with a real parser once the
        # exact payload format is confirmed.
        json_list = eval(text)
        if len(json_list) == 0:
            return None

        # Sample item:
        # {opendate:"2019-04-29",trade:"10.8700",changeratio:"-0.0431338",turnover:"74.924",netamount:"-2903349.8500",
        # ratioamount:"-0.155177",r0:"0.0000",r1:"2064153.0000",r2:"6485031.0000",r3:"10622169.2100",r0_net:"0.0000",
        # r1_net:"2064153.0000",r2_net:"-1463770.0000",r3_net:"-3503732.8500"}
        result_list = []
        for item in json_list:
            record = {
                'timestamp': to_pd_timestamp(item['opendate']),
                'close': to_float(item['trade']),
                'change_pct': to_float(item['changeratio']),
                'turnover_rate': to_float(item['turnover']) / 10000,
                'net_inflows': to_float(item['netamount']),
                'net_inflow_rate': to_float(item['ratioamount']),
            }

            huge_net = to_float(item['r0_net'])
            big_net = to_float(item['r1_net'])
            medium_net = to_float(item['r2_net'])
            small_net = to_float(item['r3_net'])

            record['net_main_inflows'] = huge_net + big_net
            record['net_huge_inflows'] = huge_net
            record['net_big_inflows'] = big_net
            record['net_medium_inflows'] = medium_net
            record['net_small_inflows'] = small_net

            amount = to_float(item['r0']) + to_float(
                item['r1']) + to_float(item['r2']) + to_float(item['r3'])
            # The rates are only defined when some amount was traded
            if amount != 0:
                record['net_main_inflow_rate'] = (huge_net + big_net) / amount
                record['net_huge_inflow_rate'] = huge_net / amount
                record['net_big_inflow_rate'] = big_net / amount
                record['net_medium_inflow_rate'] = medium_net / amount
                record['net_small_inflow_rate'] = small_net / amount

            result_list.append(record)

        return pd.DataFrame.from_records(result_list)
Example #24
0
    def download_sz_etf_component(self, df: pd.DataFrame, http_session):
        """
        Download and store the constituent stocks of Shenzhen ETFs.

        ``self.parse_sz_etf_underlying_index`` is run on ``df`` first. Then,
        for every ETF with a non-empty '拟合指数' value, the constituent page
        of that index is downloaded, its fourth HTML table is reduced to
        stock code and name, enriched with the ETF identity columns and
        written with ``df_to_db``. ETFs whose request or HTML parsing fails,
        or whose page has fewer than four tables, are skipped.

        :param df: ETF list with '拟合指数', '证券代码' and '证券简称' columns
        :param http_session: session object handed to ``sync_get``
        :return: None
        """
        url_template = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'
        renamed_columns = {'品种代码': 'stock_code', '品种名称': 'stock_name'}

        self.parse_sz_etf_underlying_index(df)
        for _, etf in df.iterrows():
            underlying_index = etf['拟合指数']
            etf_code = etf['证券代码']

            if len(underlying_index) == 0:
                self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
                continue

            text = sync_get(http_session,
                            url_template.format(underlying_index),
                            encoding='gbk',
                            return_type='text')
            if text is None:
                continue

            try:
                tables = pd.read_html(text, header=1)
            except ValueError as error:
                self.logger.error(
                    f'HTML parse error: {error}, response: {text}')
                continue

            if len(tables) < 4:
                continue

            components = tables[3].copy()
            # Drop every column that contains a missing value
            components = components.dropna(axis=1, how='any')
            # Left-pad the numeric code to six digits
            components['品种代码'] = components['品种代码'].apply(
                lambda code: format(code, '06d'))

            etf_id = f'etf_sz_{etf_code}'
            components = components[['品种代码', '品种名称']].rename(
                columns=renamed_columns)

            # Identity of the ETF this constituent list belongs to
            components['entity_id'] = etf_id
            components['entity_type'] = EntityType.ETF.value
            components['exchange'] = 'sz'
            components['code'] = etf_code
            components['name'] = etf['证券简称']
            components['timestamp'] = now_pd_timestamp(Region.CHN)

            components['stock_id'] = components['stock_code'].apply(
                lambda stock_code: china_stock_code_to_id(stock_code))
            components['id'] = components['stock_id'].apply(
                lambda stock_id: f'{etf_id}_{stock_id}')

            df_to_db(df=components,
                     ref_df=None,
                     region=Region.CHN,
                     data_schema=self.data_schema,
                     provider=self.provider)
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')

            self.sleep()