Esempio n. 1
0
    def fetch_szse_index_component(self, df: pd.DataFrame):
        """
        Fetch and persist the constituent stocks of Shenzhen Stock Exchange indices.

        For every row of ``df`` the constituent workbook is downloaded, the
        '证券代码' (security code) column is converted to stock ids, and the
        resulting rows (``id``, ``entity_id``, ``stock_id``, ``index_id``) are
        written through ``df_to_db``. ``entity_id`` mirrors ``id``.

        An index whose download answers with an HTTP error status is logged and
        skipped, matching the other index-component fetchers.

        :param df: index list; each row must provide ``code`` and ``name``
        :return: None
        """
        query_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)

            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.HTTPError as error:
                self.logger.error(
                    f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
                continue

            # Read every cell as text so security codes are not run through
            # numeric type inference.
            response_df = pd.read_excel(io.BytesIO(response.content),
                                        dtype='str')

            index_id = f'index_cn_{index_code}'

            # Convert each code once and derive the row id from the result.
            stock_ids = response_df['证券代码'].apply(
                lambda x: china_stock_code_to_id(str(x)))
            row_ids = stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}')

            # Build a fresh frame instead of assigning onto a column slice of
            # the workbook frame (which triggers SettingWithCopyWarning).
            component_df = pd.DataFrame({
                'id': row_ids,
                'entity_id': row_ids,
                'stock_id': stock_ids,
            })
            component_df['index_id'] = index_id

            df_to_db(data_schema=self.data_schema,
                     df=component_df,
                     provider=self.provider)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
    def record(self, entity, start, end, size, timestamps):
        """
        Record the stocks that belong to a block entity.

        Pages 1 to 4 of ``self.category_stocks_url`` are requested for
        ``entity.code``. Each page is decoded with ``demjson`` and every item
        becomes one row linking the block to a stock, persisted with
        ``df_to_db(..., force_update=True)``. Paging stops as soon as a page
        body is the literal ``'null'``.

        Any exception raised while handling a page is logged together with the
        raw response text, and processing moves on to the next page.

        :param entity: block entity; ``id``, ``code``, ``name``, ``exchange``
            and ``category`` are read
        :param start: not used by this method
        :param end: not used by this method
        :param size: not used by this method
        :param timestamps: not used by this method
        :return: None
        """
        for page in range(1, 5):
            resp = requests.get(self.category_stocks_url.format(page, entity.code))
            try:
                if resp.text is None or resp.text == 'null':
                    break
                category_jsons = demjson.decode(resp.text)
                block_id = entity.id
                the_list = []
                for category in category_jsons:
                    stock_code = category['code']
                    stock_id = china_stock_code_to_id(stock_code)
                    the_list.append({
                        'id': '{}_{}'.format(block_id, stock_id),
                        'entity_id': block_id,
                        'entity_type': 'block',
                        'exchange': entity.exchange,
                        'code': entity.code,
                        'name': entity.name,
                        'timestamp': now_pd_timestamp(),
                        'stock_id': stock_id,
                        'stock_code': stock_code,
                        'stock_name': category['name'],
                    })
                if the_list:
                    df = pd.DataFrame.from_records(the_list)
                    df_to_db(data_schema=self.data_schema, df=df, provider=self.provider,
                             force_update=True)

                self.logger.info('finish recording BlockStock:{},{}'.format(entity.category, entity.name))

            except Exception as e:
                # The message needs one placeholder per argument; without them
                # logging fails to format the record and the details are lost.
                self.logger.error("error:%s,resp.text:%s", e, resp.text)
            self.sleep()
    def run(self):
        """
        Sync index entities per category, then record constituents of
        concept and industry indices.

        Phase 1: for every ``(category, url)`` pair in
        ``self.category_map_url`` the response is parsed with
        ``json_callback_param``. Each result is a comma-separated string whose
        second and third fields are used as index code and name. Indices whose
        id is not already in ``self.index_ids`` are added to the session, which
        is committed once per category.

        Phase 2: all concept/industry indices are loaded with ``get_entities``.
        For each one the stock list is requested from
        ``self.category_stocks_url`` and persisted through ``df_to_db`` as
        ``id`` / ``index_id`` / ``stock_id`` rows. An exception while handling
        one index is logged with the raw response text and does not stop the
        remaining indices.

        :return: None
        """
        for category, url in self.category_map_url.items():
            resp = requests.get(url)
            results = json_callback_param(resp.text)
            for result in results:
                items = result.split(',')
                code = items[1]
                name = items[2]
                # Named entity_id rather than `id` to avoid shadowing the builtin.
                entity_id = 'index_cn_{}'.format(code)
                if entity_id in self.index_ids:
                    continue
                self.session.add(
                    Index(id=entity_id,
                          entity_id=entity_id,
                          entity_type='index',
                          exchange='cn',
                          code=code,
                          name=name,
                          category=category.value))
            self.session.commit()

        indices = get_entities(session=self.session,
                               entity_type='index',
                               return_type='domain',
                               filters=[
                                   Index.category.in_([
                                       StockCategory.concept.value,
                                       StockCategory.industry.value
                                   ])
                               ],
                               provider=self.provider)

        for index_item in indices:
            resp = requests.get(
                self.category_stocks_url.format(index_item.code, '1'))
            try:
                results = json_callback_param(resp.text)
                index_id = index_item.id
                the_list = []
                for result in results:
                    items = result.split(',')
                    stock_code = items[1]
                    stock_id = china_stock_code_to_id(stock_code)
                    the_list.append({
                        'id': '{}_{}'.format(index_id, stock_id),
                        'index_id': index_id,
                        'stock_id': stock_id
                    })
                if the_list:
                    df = pd.DataFrame.from_records(the_list)
                    df_to_db(data_schema=self.data_schema,
                             df=df,
                             provider=self.provider)

                self.logger.info('finish recording index:{},{}'.format(
                    index_item.category, index_item.name))

            except Exception as e:
                # The message needs one placeholder per argument; without them
                # logging fails to format the record and the details are lost.
                self.logger.error("error:%s,resp.text:%s", e, resp.text)
            self.sleep()
Esempio n. 4
0
    def fetch_csi_index_component(self, df: pd.DataFrame):
        """
        Fetch and persist the constituent stocks of SSE / CSI indices.

        For every row of ``df`` the constituent workbook is downloaded, the
        '成分券代码Constituent Code' column is converted to stock ids, and the
        resulting rows (``id``, ``stock_id``, ``index_id``) are written through
        ``df_to_db``. An index whose download answers with an HTTP error status
        is logged and skipped.

        :param df: index list; each row must provide ``code`` and ``name``
        :return: None
        """
        query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)

            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.HTTPError as error:
                self.logger.error(f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
                continue

            # Read every cell as text, like the other index-component fetchers,
            # so constituent codes are not run through numeric type inference.
            response_df = pd.read_excel(io.BytesIO(response.content), dtype='str')

            index_id = f'index_cn_{index_code}'

            # Convert each code once and derive the row id from the result.
            stock_ids = response_df['成分券代码Constituent Code'].apply(
                lambda x: china_stock_code_to_id(str(x)))
            row_ids = stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}')

            component_df = pd.DataFrame({'id': row_ids, 'stock_id': stock_ids})
            component_df['index_id'] = index_id

            df_to_db(data_schema=self.data_schema, df=component_df, provider=self.provider)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Esempio n. 5
0
    def record(self, entity, start, end, size, timestamps):
        """
        Persist the stock holdings published for one fund/ETF entity.

        Queries ``finance.FUND_PORTFOLIO_STOCK`` for rows of ``entity.code``
        whose ``pub_date`` is not earlier than ``start``, reshapes the result
        and writes it through ``df_to_db``. Nothing is written when the query
        result is null/empty according to ``pd_is_not_null``.

        The query result is expected to carry the columns id, code,
        period_start, period_end, pub_date, report_type_id, report_type, rank,
        symbol, name, shares, market_cap and proportion.

        :param entity: fund entity; ``code`` is read here and the entity is
            handed to ``portfolio_relate_stock``
        :param start: lower bound (inclusive) for ``pub_date``
        :param end: not used by this method
        :param size: not used by this method
        :param timestamps: not used by this method
        :return: always None
        """
        portfolio = finance.FUND_PORTFOLIO_STOCK
        q = query(portfolio).filter(portfolio.pub_date >= start).filter(portfolio.code == entity.code)
        df = finance.run_query(q)

        if not pd_is_not_null(df):
            return None

        df['timestamp'] = pd.to_datetime(df['pub_date'])

        df = df.rename(columns={'symbol': 'stock_code', 'name': 'stock_name'})
        # Scale by 0.01 (presumably percentage -> fraction; confirm with the schema).
        df['proportion'] = df['proportion'] * 0.01

        df = portfolio_relate_stock(df, entity)

        df['stock_id'] = df['stock_code'].apply(china_stock_code_to_id)

        # The final id joins entity id, stock id, publication date and the
        # source row id, each converted to text row by row.
        id_parts = df[['entity_id', 'stock_id', 'pub_date', 'id']]
        df['id'] = id_parts.apply(lambda row: '_'.join(row.astype(str)), axis=1)

        df['report_date'] = pd.to_datetime(df['period_end'])
        df['report_period'] = df['report_type'].apply(jq_to_report_period)

        df_to_db(df=df, data_schema=self.data_schema, provider=self.provider, force_update=self.force_update)

        self.logger.info(f"persist etf {entity.code} portfolio success")

        return None
Esempio n. 6
0
    def download_sh_etf_component(self, df: pd.DataFrame):
        """
        Download and persist the constituent stocks of Shanghai-listed ETFs.

        ETF_CLASS => 1. single-market ETF 2. cross-market ETF 3. cross-border ETF
                        5. bond ETF 6. gold ETF

        Only classes '1' and '2' are processed. For each such ETF the
        constituent list is requested, the ``instrumentId`` values are
        converted to stock ids, and rows of ``id`` / ``stock_id`` /
        ``index_id`` are written through ``df_to_db``. An ETF whose response
        carries no constituent rows is logged and skipped instead of aborting
        the whole run.

        :param df: ETF list data; ``ETF_CLASS``, ``FUND_ID`` and ``FUND_NAME``
            are read, ``ETF_TYPE`` is read after ``populate_sh_etf_type``
        :return: None
        """
        query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                    'isPagination=false&type={}&etfClass={}'

        etf_df = df[df['ETF_CLASS'].isin(['1', '2'])]
        etf_df = self.populate_sh_etf_type(etf_df)

        for _, etf in etf_df.iterrows():
            url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
            response = requests.get(url, headers=DEFAULT_SH_ETF_LIST_HEADER)
            response_dict = demjson.decode(response.text)
            response_df = pd.DataFrame(response_dict.get('result', []))

            etf_code = etf['FUND_ID']

            # A missing or empty 'result' yields a frame without the
            # 'instrumentId' column; selecting it would raise KeyError.
            if 'instrumentId' not in response_df.columns:
                self.logger.warning(
                    f'{etf["FUND_NAME"]} - {etf_code} has no constituent data, skipping...')
                self.sleep()
                continue

            index_id = f'index_sh_{etf_code}'

            # Convert each code once and derive the row id from the result.
            stock_ids = response_df['instrumentId'].apply(china_stock_code_to_id)
            row_ids = stock_ids.apply(lambda stock_id: f'{index_id}_{stock_id}')

            # Build a fresh frame instead of assigning onto a column slice.
            component_df = pd.DataFrame({'id': row_ids, 'stock_id': stock_ids})
            component_df['index_id'] = index_id

            df_to_db(data_schema=self.data_schema,
                     df=component_df,
                     provider=self.provider)
            self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')

            self.sleep()
Esempio n. 7
0
    def download_sz_etf_component(self, df: pd.DataFrame):
        """
        Download and persist the constituent stocks of Shenzhen-listed ETFs.

        ``parse_sz_etf_underlying_index`` is called on ``df`` first
        (presumably it fills the '拟合指数' column; confirm against that
        helper). For each ETF with a non-empty underlying index, the index
        page is fetched and decoded as GBK, and the fourth HTML table is taken
        as the constituent list. Codes are zero-padded to six digits and one
        row per constituent is written through ``df_to_db``.

        ETFs with an empty underlying index, an unparsable page, or fewer than
        four tables are skipped.

        :param df: ETF list; '拟合指数', '证券代码' and '证券简称' are read per row
        :return: None
        """
        page_template = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'

        self.parse_sz_etf_underlying_index(df)

        for _, etf in df.iterrows():
            underlying_index = etf['拟合指数']
            etf_code = etf['证券代码']

            if len(underlying_index) == 0:
                self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
                continue

            response = requests.get(page_template.format(underlying_index))
            response.encoding = 'gbk'

            try:
                tables = pd.read_html(response.text, header=1)
            except ValueError as error:
                self.logger.error(
                    f'HTML parse error: {error}, response: {response.text}')
                continue

            # The constituent table is the fourth one on the page.
            if len(tables) <= 3:
                continue

            # Drop every column that contains a missing value.
            constituents = tables[3].copy().dropna(axis=1, how='any')
            # Render the integer codes as zero-padded six-digit strings.
            constituents['品种代码'] = constituents['品种代码'].apply(
                lambda raw_code: format(raw_code, '06d'))

            etf_id = f'etf_sz_{etf_code}'
            component_df = constituents[['品种代码', '品种名称']].copy()
            component_df = component_df.rename(
                columns={'品种代码': 'stock_code', '品种名称': 'stock_name'})

            component_df['entity_id'] = etf_id
            component_df['entity_type'] = 'etf'
            component_df['exchange'] = 'sz'
            component_df['code'] = etf_code
            component_df['name'] = etf['证券简称']
            component_df['timestamp'] = now_pd_timestamp()

            component_df['stock_id'] = component_df['stock_code'].apply(
                china_stock_code_to_id)
            component_df['id'] = component_df['stock_id'].apply(
                lambda stock_id: '{}_{}'.format(etf_id, stock_id))

            df_to_db(data_schema=self.data_schema,
                     df=component_df,
                     provider=self.provider)
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')

            self.sleep()
Esempio n. 8
0
    def run(self):
        """
        Sync index entities per category, then record constituents of
        industry and concept indices.

        Phase 1: for every ``(category, url)`` pair in
        ``self.category_map_url`` the response is decoded as GBK. The text
        between the first '{' and the first '}' is parsed as JSON; its keys are
        index codes and each value is a comma-separated string whose second
        field is the index name. Indices whose id is not already in
        ``self.index_ids`` are added to the session, which is committed once
        per category.

        Phase 2: all industry/concept indices are loaded with
        ``get_entities``. For each one, pages 1 to 4 of
        ``self.category_stocks_url`` are requested, decoded with ``demjson``
        and persisted through ``df_to_db`` as ``id`` / ``index_id`` /
        ``stock_id`` rows. Paging for an index stops as soon as a page body is
        the literal ``'null'``. An exception while handling a page is logged
        with the raw response text and processing continues.

        :return: None
        """
        # get stock category from sina
        for category, url in self.category_map_url.items():
            resp = requests.get(url)
            resp.encoding = 'GBK'

            tmp_str = resp.text
            json_str = tmp_str[tmp_str.index('{'):tmp_str.index('}') + 1]
            tmp_json = json.loads(json_str)
            for code in tmp_json:
                name = tmp_json[code].split(',')[1]
                # Named entity_id rather than `id` to avoid shadowing the builtin.
                entity_id = 'index_cn_{}'.format(code)
                if entity_id in self.index_ids:
                    continue
                self.session.add(Index(id=entity_id, entity_type='index', exchange='cn', code=code, name=name,
                                       category=category.value))
            self.session.commit()

        indices = get_entities(session=self.session, entity_type='index',
                               return_type='domain', filters=[
                Index.category.in_([StockCategory.industry.value, StockCategory.concept.value])],
                               provider=self.provider)

        for index_item in indices:
            for page in range(1, 5):
                resp = requests.get(self.category_stocks_url.format(page, index_item.code))
                try:
                    if resp.text is None or resp.text == 'null':
                        break
                    category_jsons = demjson.decode(resp.text)
                    index_id = index_item.id
                    the_list = []
                    for category in category_jsons:
                        stock_code = category['code']
                        stock_id = china_stock_code_to_id(stock_code)
                        the_list.append({
                            'id': '{}_{}'.format(index_id, stock_id),
                            'index_id': index_id,
                            'stock_id': stock_id
                        })
                    if the_list:
                        df = pd.DataFrame.from_records(the_list)
                        df_to_db(data_schema=self.data_schema, df=df, provider=self.provider)

                    self.logger.info('finish recording index:{},{}'.format(index_item.category, index_item.name))

                except Exception as e:
                    # The message needs one placeholder per argument; without
                    # them logging fails to format the record and the details
                    # are lost.
                    self.logger.error("error:%s,resp.text:%s", e, resp.text)
                self.sleep()
Esempio n. 9
0
    def fetch_cni_index_component(self, df: pd.DataFrame):
        """
        Fetch and persist the constituent stocks of CNI indices.

        For every row of ``df`` the constituent workbook is downloaded and the
        security-code column is extracted: '样本股代码', or '证券代码' when the
        former is absent. One row per constituent is written through
        ``df_to_db``, carrying ``stock_code``, the index's entity fields, a
        timestamp, ``stock_id`` and ``id``. An index whose download answers
        with an HTTP error status is logged and skipped.

        :param df: index list; each row must provide ``code`` and ``name``
        :return: None
        """
        query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

        for _, index in df.iterrows():
            index_code = index['code']

            url = query_url.format(index_code)

            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.HTTPError as error:
                self.logger.error(
                    f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
                continue

            response_df = pd.read_excel(io.BytesIO(response.content),
                                        dtype='str')

            index_id = f'index_cn_{index_code}'

            # The workbooks use one of two headers for the code column.
            try:
                code_column = response_df[['样本股代码']]
            except KeyError:
                code_column = response_df[['证券代码']]

            # Copy the slice and rename its single column right away. Renaming
            # after the extra columns are added fails with a length mismatch.
            response_df = code_column.copy()
            response_df.columns = ['stock_code']

            response_df['entity_id'] = index_id
            response_df['entity_type'] = 'index'
            response_df['exchange'] = 'cn'
            response_df['code'] = index_code
            response_df['name'] = index['name']
            response_df['timestamp'] = now_pd_timestamp()

            response_df['stock_id'] = response_df['stock_code'].apply(
                lambda x: china_stock_code_to_id(str(x)))
            response_df['id'] = response_df['stock_id'].apply(
                lambda x: f'{index_id}_{x}')

            df_to_db(data_schema=self.data_schema,
                     df=response_df,
                     provider=self.provider)
            self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')

            self.sleep()
Esempio n. 10
0
    def record(self, entity, start, end, size, timestamps):
        """
        Record the stocks that belong to a block entity.

        A single request is made to ``self.category_stocks_url`` for
        ``entity.code`` and the response is parsed with
        ``json_callback_param``. Each result is a comma-separated string whose
        second and third fields are used as stock code and stock name. One row
        per stock is persisted with ``df_to_db(..., force_update=True)``.

        Any exception raised while handling the response is logged together
        with the raw response text instead of propagating.

        :param entity: block entity; ``id``, ``code``, ``name``, ``exchange``
            and ``category`` are read
        :param start: not used by this method
        :param end: not used by this method
        :param size: not used by this method
        :param timestamps: not used by this method
        :return: None
        """
        resp = requests.get(self.category_stocks_url.format(entity.code, '1'))
        try:
            results = json_callback_param(resp.text)
            block_id = entity.id
            the_list = []
            for result in results:
                items = result.split(',')
                stock_code = items[1]
                stock_id = china_stock_code_to_id(stock_code)

                the_list.append({
                    'id': '{}_{}'.format(block_id, stock_id),
                    'entity_id': block_id,
                    'entity_type': 'block',
                    'exchange': entity.exchange,
                    'code': entity.code,
                    'name': entity.name,
                    'timestamp': now_pd_timestamp(),
                    'stock_id': stock_id,
                    'stock_code': stock_code,
                    'stock_name': items[2],
                })
            if the_list:
                df = pd.DataFrame.from_records(the_list)
                df_to_db(data_schema=self.data_schema,
                         df=df,
                         provider=self.provider,
                         force_update=True)

            self.logger.info('finish recording block:{},{}'.format(
                entity.category, entity.name))

        except Exception as e:
            # The message needs one placeholder per argument; without them
            # logging fails to format the record and the details are lost.
            self.logger.error("error:%s,resp.text:%s", e, resp.text)
        self.sleep()