def record(self, entity, start, end, size, timestamps):
    """Persist the stock holdings of one ETF published on or after ``start``.

    Rows of ``finance.FUND_PORTFOLIO_STOCK`` whose ``code`` equals
    ``entity.code`` and whose ``pub_date`` is not earlier than ``start`` are
    fetched, normalised and written with ``df_to_db``.  ``end``, ``size``
    and ``timestamps`` are accepted but not used here.

    Always returns ``None``.
    """
    portfolio = finance.FUND_PORTFOLIO_STOCK
    holdings_query = (
        query(portfolio)
        .filter(portfolio.pub_date >= start)
        .filter(portfolio.code == entity.code)
    )
    holdings = finance.run_query(holdings_query)
    if not pd_is_not_null(holdings):
        return None

    # Sample of the frame returned by the query:
    #          id    code period_start  period_end    pub_date  report_type_id report_type  rank  symbol  name      shares    market_cap  proportion
    # 0   8640569  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     1  601318  中国平安  19869239.0  1.361043e+09        7.09
    # 1   8640570  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     2  600519  贵州茅台    921670.0  6.728191e+08        3.50
    # 2   8640571  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     3  600036  招商银行  18918815.0  5.806184e+08        3.02
    # 3   8640572  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     4  601166  兴业银行  22862332.0  3.646542e+08        1.90
    holdings['timestamp'] = pd.to_datetime(holdings['pub_date'])
    holdings = holdings.rename(
        columns={'symbol': 'stock_code', 'name': 'stock_name'})
    # Scale the percentage shown in the sample (e.g. 7.09) to a fraction.
    holdings['proportion'] = holdings['proportion'].mul(0.01)

    # NOTE(review): 'entity_id' is read below, so it is presumably added by
    # portfolio_relate_stock -- confirm against that helper.
    holdings = portfolio_relate_stock(holdings, entity)

    holdings['stock_id'] = holdings['stock_code'].apply(china_stock_code_to_id)

    def join_id_parts(row):
        # Row-wise so every value is stringified exactly as stored in the row.
        return '_'.join(row.astype(str))

    id_columns = ['entity_id', 'stock_id', 'pub_date', 'id']
    holdings['id'] = holdings[id_columns].apply(join_id_parts, axis=1)
    holdings['report_date'] = pd.to_datetime(holdings['period_end'])
    holdings['report_period'] = holdings['report_type'].apply(jq_to_report_period)

    df_to_db(df=holdings,
             data_schema=self.data_schema,
             provider=self.provider,
             force_update=self.force_update)
    self.logger.info(f"persist etf {entity.code} portfolio success")
    return None
def record(self, entity, start, end, size, timestamps):
    """Record the constituent stocks of one block, page by page.

    Pages 1 to 4 of ``self.category_stocks_url`` are requested for
    ``entity.code``.  Each page is decoded with ``demjson`` and every entry
    is stored as a block/stock relation through ``df_to_db`` (always with
    ``force_update=True``).  Paging stops as soon as a page body is the
    literal ``'null'``.  ``start``, ``end``, ``size`` and ``timestamps`` are
    accepted but not used.

    Recording is best-effort: a failure while decoding or persisting a page
    is logged and the next page is still attempted.
    """
    block_id = entity.id
    for page in range(1, 5):
        resp = requests.get(self.category_stocks_url.format(page, entity.code))
        try:
            if resp.text is None or resp.text == 'null':
                break

            the_list = []
            for category in demjson.decode(resp.text):
                stock_code = category['code']
                stock_id = china_stock_code_to_id(stock_code)
                the_list.append({
                    'id': '{}_{}'.format(block_id, stock_id),
                    'entity_id': block_id,
                    'entity_type': 'block',
                    'exchange': entity.exchange,
                    'code': entity.code,
                    'name': entity.name,
                    'timestamp': now_pd_timestamp(),
                    'stock_id': stock_id,
                    'stock_code': stock_code,
                    'stock_name': category['name'],
                })

            if the_list:
                df = pd.DataFrame.from_records(the_list)
                df_to_db(data_schema=self.data_schema, df=df,
                         provider=self.provider, force_update=True)

            self.logger.info('finish recording BlockStock:{},{}'.format(
                entity.category, entity.name))
        except Exception as e:
            # The message needs one placeholder per argument; without them
            # the logging module cannot format the record and the error
            # details are lost.
            self.logger.error("error:%s,resp.text:%s", e, resp.text)

        # Pause between pages; the break above skips it on the last page.
        self.sleep()
def record(self, entity, start, end, size, timestamps):
    """Record the constituent stocks of one block from a single request.

    ``self.category_stocks_url`` is formatted with ``entity.code`` and
    ``'1'``, the response text is unpacked with ``json_callback_param`` and
    every comma-separated entry is stored as a block/stock relation through
    ``df_to_db`` (always with ``force_update=True``).  Field 1 of an entry
    is used as the stock code and field 2 as the stock name.  ``start``,
    ``end``, ``size`` and ``timestamps`` are accepted but not used.

    Recording is best-effort: a failure while parsing or persisting is
    logged instead of raised.
    """
    resp = requests.get(self.category_stocks_url.format(entity.code, '1'))
    block_id = entity.id
    try:
        the_list = []
        for result in json_callback_param(resp.text):
            items = result.split(',')
            stock_code = items[1]
            stock_id = china_stock_code_to_id(stock_code)
            the_list.append({
                'id': '{}_{}'.format(block_id, stock_id),
                'entity_id': block_id,
                'entity_type': 'block',
                'exchange': entity.exchange,
                'code': entity.code,
                'name': entity.name,
                'timestamp': now_pd_timestamp(),
                'stock_id': stock_id,
                'stock_code': stock_code,
                'stock_name': items[2],
            })

        if the_list:
            df = pd.DataFrame.from_records(the_list)
            df_to_db(data_schema=self.data_schema, df=df,
                     provider=self.provider, force_update=True)

        self.logger.info('finish recording block:{},{}'.format(
            entity.category, entity.name))
    except Exception as e:
        # The message needs one placeholder per argument; without them the
        # logging module cannot format the record and the error details are
        # lost.
        self.logger.error("error:%s,resp.text:%s", e, resp.text)

    self.sleep()
def fetch_csi_index_component(self):
    """Fetch and store the constituents of the SSE / CSI indices.

    For every row of ``self.all_index`` the constituents are obtained with
    ``get_index_stocks`` and written through ``df_to_db`` with
    ``force_update=True``; ``self.sleep()`` is called after each index.
    """
    for _, row in self.all_index.iterrows():
        # ``row.name`` is the row label of the frame, while ``row["name"]``
        # (used in the log line) is the value of the ``name`` column.
        code = row.name.split(".")[0]
        components = pd.DataFrame()
        components['stock_code'] = get_index_stocks(row.name)
        components['stock_code'] = components['stock_code'].apply(
            lambda raw: raw.split(".")[0])

        index_id = f'index_cn_{code}'
        fixed_columns = {
            'entity_id': index_id,
            'entity_type': 'index',
            'exchange': 'cn',
            'code': code,
            'name': row.display_name,
            'timestamp': now_pd_timestamp(),
        }
        for column, value in fixed_columns.items():
            components[column] = value

        components['stock_id'] = components['stock_code'].map(
            lambda raw: china_stock_code_to_id(str(raw)))
        components['id'] = components['stock_id'].map(
            lambda stock_id: f'{index_id}_{stock_id}')

        df_to_db(data_schema=self.data_schema,
                 df=components,
                 provider=self.provider,
                 force_update=True)
        self.logger.info(f'{row["name"]} - {code} 成分股抓取完成...')
        self.sleep()
def format(self, entity, df):
    """Normalise a raw fund-portfolio frame in place and return it.

    The frame gets a datetime ``timestamp`` column, ``symbol`` / ``name``
    are renamed to ``stock_code`` / ``stock_name``, ``proportion`` is
    multiplied by 0.01, and the entity, provider and id columns are filled
    in.

    :param entity: entity whose ``id``, ``entity_type``, ``exchange``,
        ``code`` and ``name`` are copied onto every row
    :param df: raw frame; it is modified in place
    :return: the same ``df`` object
    """
    # Sample of the incoming frame:
    #          id    code period_start  period_end    pub_date  report_type_id report_type  rank  symbol  name      shares    market_cap  proportion
    # 0   8640569  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     1  601318  中国平安  19869239.0  1.361043e+09        7.09
    # 1   8640570  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     2  600519  贵州茅台    921670.0  6.728191e+08        3.50
    # 2   8640571  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     3  600036  招商银行  18918815.0  5.806184e+08        3.02
    # 3   8640572  159919   2018-07-01  2018-09-30  2018-10-26          403003        第三季度     4  601166  兴业银行  22862332.0  3.646542e+08        1.90
    if 'timestamp' not in df.columns:
        df['timestamp'] = pd.to_datetime(df[self.get_original_time_field()])
    elif not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        # Convert only when the column is not already datetime-typed.  A
        # pandas dtype is never an instance of ``datetime``, so the earlier
        # ``isinstance`` test could not detect that case.
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    df.rename(columns={'symbol': 'stock_code', 'name': 'stock_name'},
              inplace=True)
    # Scale the percentage shown in the sample (e.g. 7.09) to a fraction.
    df['proportion'] *= 0.01

    df['stock_id'] = df['stock_code'].apply(china_stock_code_to_id)
    df['report_date'] = pd.to_datetime(df['period_end'])
    df['report_period'] = df['report_type'].apply(jq_to_report_period)

    df['entity_id'] = entity.id
    df['provider'] = self.provider.value
    df['entity_type'] = entity.entity_type
    df['exchange'] = entity.exchange
    df['code'] = entity.code
    df['name'] = entity.name

    # Generated last so that every column set above is already present.
    df['id'] = self.generate_domain_id(entity, df)
    return df
def fetch_szse_index_component(self, df: pd.DataFrame, http_session):
    """Fetch and store the constituents of the SZSE indices.

    For every row of ``df`` the constituent workbook is downloaded with
    ``sync_get`` and written through ``df_to_db``.  An index whose download
    returns ``None`` is skipped without sleeping; otherwise
    ``self.sleep()`` is called after the index is stored.

    :param df: frame with at least the ``code`` and ``name`` columns
    :param http_session: session object handed to ``sync_get``
    """
    query_url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1747_zs&TABKEY=tab1&ZSDM={}'
    renamed_columns = {'证券代码': 'stock_code', '证券简称': 'stock_name'}

    for _, row in df.iterrows():
        code = row['code']
        content = sync_get(http_session, query_url.format(code),
                           return_type='content')
        if content is None:
            continue

        # Read every cell as text (dtype='str').
        components = pd.read_excel(io.BytesIO(content), dtype='str')

        index_id = f'index_cn_{code}'
        fixed_columns = {
            'entity_id': index_id,
            'entity_type': EntityType.Index.value,
            'exchange': 'cn',
            'code': code,
            'name': row['name'],
            'timestamp': now_pd_timestamp(self.region),
        }
        for column, value in fixed_columns.items():
            components[column] = value

        components = components.rename(columns=renamed_columns)
        components['stock_id'] = components['stock_code'].map(
            lambda raw: china_stock_code_to_id(str(raw)))
        components['id'] = components['stock_id'].map(
            lambda stock_id: f'{index_id}_{stock_id}')

        df_to_db(df=components,
                 ref_df=None,
                 region=self.region,
                 data_schema=self.data_schema,
                 provider=self.provider)
        self.logger.info(f'{row["name"]} - {code} 成分股抓取完成...')
        self.sleep()
def download_sz_etf_component(self, df: pd.DataFrame, http_session):
    """Download and store the constituents of the ETFs listed in ``df``.

    ``self.parse_sz_etf_underlying_index(df)`` is called first.  For each
    ETF the component page of its underlying index is fetched, the fourth
    HTML table on that page is taken as the constituent list, and the
    result is written through ``df_to_db``.  ETFs with an empty underlying
    index, an unparsable page or fewer than four tables are skipped.

    :param df: ETF list; the ``拟合指数``, ``证券代码`` and ``证券简称``
        columns are read
    :param http_session: session object handed to ``request_get``
    """
    query_url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vII_NewestComponent/indexid/{}.phtml'
    renamed_columns = {'品种代码': 'stock_code', '品种名称': 'stock_name'}

    self.parse_sz_etf_underlying_index(df)

    for _, etf in df.iterrows():
        underlying_index = etf['拟合指数']
        etf_code = etf['证券代码']

        if len(underlying_index) == 0:
            self.logger.info(f'{etf["证券简称"]} - {etf_code} 非 A 股市场指数,跳过...')
            continue

        page = request_get(http_session, query_url.format(underlying_index))
        page.encoding = 'gbk'

        try:
            tables = pd.read_html(page.text, header=1)
        except ValueError as error:
            self.logger.error(
                f'HTML parse error: {error}, response: {page.text}')
            continue

        if len(tables) < 4:
            continue

        # Drop every column that contains a missing value.
        components = tables[3].copy().dropna(axis=1, how='any')
        # Left-pad the numeric code with zeros to six digits.
        components['品种代码'] = components['品种代码'].apply(
            lambda raw: f'{raw:06d}')

        etf_id = f'etf_sz_{etf_code}'
        components = components[['品种代码', '品种名称']].copy().rename(
            columns=renamed_columns)

        fixed_columns = {
            'entity_id': etf_id,
            'entity_type': EntityType.ETF.value,
            'exchange': 'sz',
            'code': etf_code,
            'name': etf['证券简称'],
            'timestamp': now_pd_timestamp(Region.CHN),
        }
        for column, value in fixed_columns.items():
            components[column] = value

        components['stock_id'] = components['stock_code'].apply(
            china_stock_code_to_id)
        components['id'] = components['stock_id'].map(
            lambda stock_id: f'{etf_id}_{stock_id}')

        df_to_db(df=components,
                 region=Region.CHN,
                 data_schema=self.data_schema,
                 provider=self.provider)
        self.logger.info(f'{etf["证券简称"]} - {etf_code} 成分股抓取完成...')
        self.sleep()
def download_sh_etf_component(self, df: pd.DataFrame, http_session):
    """Download and store the constituents of the ETFs listed in ``df``.

    ETF_CLASS values: 1 = single-market ETF, 2 = cross-market ETF,
    3 = cross-border ETF, 5 = bond ETF, 6 = gold ETF.  Only classes 1 and 2
    are processed.

    For each selected ETF the constituent list is requested, decoded with
    ``demjson`` and written through ``df_to_db``.  An ETF is skipped when
    the request returns ``None`` or when the response carries no
    constituents.

    :param df: ETF list data
    :param http_session: session object handed to ``sync_get``
    :return: None
    """
    query_url = 'http://query.sse.com.cn/infodisplay/queryConstituentStockInfo.do?' \
                'isPagination=false&type={}&etfClass={}'

    etf_df = df[(df['ETF_CLASS'] == '1') | (df['ETF_CLASS'] == '2')]
    etf_df = self.populate_sh_etf_type(etf_df, http_session)

    for _, etf in etf_df.iterrows():
        url = query_url.format(etf['ETF_TYPE'], etf['ETF_CLASS'])
        text = sync_get(http_session, url,
                        headers=DEFAULT_SH_ETF_LIST_HEADER,
                        return_type='text')
        if text is None:
            continue

        etf_code = etf['FUND_ID']
        constituents = demjson.decode(text).get('result', [])
        if not constituents:
            # A frame built from an empty result has no columns, so the
            # column selection below would raise KeyError and abort the
            # whole loop; skip this ETF instead.
            self.logger.warning(
                f'{etf["FUND_NAME"]} - {etf_code} returned no constituents, skipping...')
            continue

        response_df = pd.DataFrame(constituents)
        etf_id = f'etf_sh_{etf_code}'

        response_df = response_df[['instrumentId', 'instrumentName']].copy()
        response_df.rename(columns={
            'instrumentId': 'stock_code',
            'instrumentName': 'stock_name'
        }, inplace=True)

        response_df['entity_id'] = etf_id
        response_df['entity_type'] = EntityType.ETF.value
        response_df['exchange'] = 'sh'
        response_df['code'] = etf_code
        response_df['name'] = etf['FUND_NAME']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(
            china_stock_code_to_id)
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{etf_id}_{x}')

        df_to_db(df=response_df,
                 ref_df=None,
                 region=Region.CHN,
                 data_schema=self.data_schema,
                 provider=self.provider)
        self.logger.info(f'{etf["FUND_NAME"]} - {etf_code} 成分股抓取完成...')
        self.sleep()
def numba_boost_up(category_jsons):
    """Turn decoded category entries into a list of stock records.

    Each entry must provide the ``'code'`` and ``'name'`` keys.  The result
    holds one dict per entry with the keys ``stock_id`` (from
    ``china_stock_code_to_id``), ``stock_code`` and ``stock_name``, in
    input order.
    """
    return [
        {
            'stock_id': china_stock_code_to_id(entry['code']),
            'stock_code': entry['code'],
            'stock_name': entry['name'],
        }
        for entry in category_jsons
    ]
def numba_boost_up(results):
    """Turn comma-separated result strings into a list of stock records.

    Every string is split on ``','``; field 1 is used as the stock code and
    field 2 as the stock name.  The result holds one dict per input string
    with the keys ``stock_id`` (from ``china_stock_code_to_id``),
    ``stock_code`` and ``stock_name``, in input order.
    """
    return [
        {
            'stock_id': china_stock_code_to_id(fields[1]),
            'stock_code': fields[1],
            'stock_name': fields[2],
        }
        for fields in (line.split(',') for line in results)
    ]
def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
    """Fetch and store the constituents of the SSE / CSI indices.

    For every row of ``df`` the constituent workbook is downloaded with
    ``request_get`` and written through ``df_to_db`` with
    ``force_update=True``.  An index whose download raises
    ``requests.HTTPError`` is logged and skipped without sleeping.

    :param df: frame with at least the ``code`` and ``name`` columns
    :param http_session: session object handed to ``request_get``
    """
    query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
    renamed_columns = {
        '成分券代码Constituent Code': 'stock_code',
        '成分券名称Constituent Name': 'stock_name',
    }

    for _, row in df.iterrows():
        code = row['code']

        try:
            response = request_get(http_session, query_url.format(code))
            response.raise_for_status()
        except requests.HTTPError as error:
            self.logger.error(
                f'{row["name"]} - {code} 成分股抓取错误 ({error})')
            continue

        workbook = pd.read_excel(io.BytesIO(response.content))
        components = workbook[list(renamed_columns)].rename(
            columns=renamed_columns)

        index_id = f'index_cn_{code}'
        fixed_columns = {
            'entity_id': index_id,
            'entity_type': EntityType.Index.value,
            'exchange': 'cn',
            'code': code,
            'name': row['name'],
            'timestamp': now_pd_timestamp(Region.CHN),
        }
        for column, value in fixed_columns.items():
            components[column] = value

        components['stock_id'] = components['stock_code'].map(
            lambda raw: china_stock_code_to_id(str(raw)))
        components['id'] = components['stock_id'].map(
            lambda stock_id: f'{index_id}_{stock_id}')

        df_to_db(df=components,
                 region=Region.CHN,
                 data_schema=self.data_schema,
                 provider=self.provider,
                 force_update=True)
        self.logger.info(f'{row["name"]} - {code} 成分股抓取完成...')
        self.sleep()
def fetch_cni_index_component(self, df: pd.DataFrame, http_session):
    """Fetch and store the constituents of the CNI indices.

    For every row of ``df`` the constituent workbook is downloaded with
    ``request_get`` and written through ``df_to_db``.  An index whose
    download raises ``requests.HTTPError`` is logged and skipped without
    sleeping.

    :param df: frame with at least the ``code`` and ``name`` columns
    :param http_session: session object handed to ``request_get``
    """
    query_url = 'http://www.cnindex.com.cn/docs/yb_{}.xls'

    for _, index in df.iterrows():
        index_code = index['code']
        url = query_url.format(index_code)

        try:
            response = request_get(http_session, url)
            response.raise_for_status()
        except requests.HTTPError as error:
            self.logger.error(
                f'{index["name"]} - {index_code} 成分股抓取错误 ({error})')
            continue

        response_df = pd.read_excel(io.BytesIO(response.content), dtype='str')

        index_id = f'index_cn_{index_code}'

        # The stock-code column appears under one of two header names.
        try:
            response_df = response_df[['样本股代码']]
        except KeyError:
            response_df = response_df[['证券代码']]

        # Copy the slice and rename its only column *before* adding more
        # columns: assigning a one-element column list to a seven-column
        # frame raises ValueError (length mismatch).
        response_df = response_df.copy()
        response_df.columns = ['stock_code']

        response_df['entity_id'] = index_id
        response_df['entity_type'] = EntityType.Index.value
        response_df['exchange'] = 'cn'
        response_df['code'] = index_code
        response_df['name'] = index['name']
        response_df['timestamp'] = now_pd_timestamp(Region.CHN)

        response_df['stock_id'] = response_df['stock_code'].apply(
            lambda x: china_stock_code_to_id(str(x)))
        response_df['id'] = response_df['stock_id'].apply(
            lambda x: f'{index_id}_{x}')

        df_to_db(df=response_df,
                 region=Region.CHN,
                 data_schema=self.data_schema,
                 provider=self.provider)
        self.logger.info(f'{index["name"]} - {index_code} 成分股抓取完成...')
        self.sleep()
def fetch_csi_index_component(self, df: pd.DataFrame, http_session):
    """Fetch and store the constituents of the SSE / CSI indices.

    For every row of ``df`` the constituent workbook is downloaded with
    ``sync_get`` and written through ``df_to_db``.  An index whose download
    returns ``None`` is logged and skipped without sleeping.

    :param df: frame with at least the ``code`` and ``name`` columns
    :param http_session: session object handed to ``sync_get``
    """
    query_url = 'http://www.csindex.com.cn/uploads/file/autofile/cons/{}cons.xls'
    renamed_columns = {
        '成分券代码Constituent Code': 'stock_code',
        '成分券名称Constituent Name': 'stock_name',
    }

    for _, row in df.iterrows():
        code = row['code']
        content = sync_get(http_session, query_url.format(code),
                           return_type='content')
        if content is None:
            self.logger.error(f'{row["name"]} - {code} 成分股抓取错误')
            continue

        workbook = pd.read_excel(io.BytesIO(content))
        components = workbook[list(renamed_columns)].rename(
            columns=renamed_columns)

        index_id = f'index_cn_{code}'
        fixed_columns = {
            'entity_id': index_id,
            'entity_type': EntityType.Index.value,
            'exchange': 'cn',
            'code': code,
            'name': row['name'],
            'timestamp': now_pd_timestamp(self.region),
        }
        for column, value in fixed_columns.items():
            components[column] = value

        components['stock_id'] = components['stock_code'].map(
            lambda raw: china_stock_code_to_id(str(raw)))
        components['id'] = components['stock_id'].map(
            lambda stock_id: f'{index_id}_{stock_id}')

        df_to_db(df=components,
                 ref_df=None,
                 region=self.region,
                 data_schema=self.data_schema,
                 provider=self.provider)
        self.logger.info(f'{row["name"]} - {code} 成分股抓取完成...')
        self.sleep()
def record(self, entity, start, end, size, timestamps):
    """Refresh the stored constituent list of ``entity`` as of ``start``.

    Steps, in order:

    1. ``start`` is raised to 2008-01-01 when it is earlier.
    2. The rows already stored for ``entity.id`` are loaded.
    3. When stored rows exist and their latest ``timestamp`` is not later
       than ``start``, ``start`` is moved forward to the last business day
       of the following 24 weeks, or to the current time when that day is
       still in the future.  In that case ``start`` becomes the string
       produced by ``to_time_str``.
    4. The constituents as of ``start`` are fetched with
       ``get_index_stocks`` and compared with the stored rows by
       ``stock_id``: stocks only in the fetched list are additions, stocks
       only in the stored rows are removals.
    5. Depending on which of the two sets is empty, the stored rows and/or
       the new rows are written through ``df_to_db``.

    ``end``, ``size`` and ``timestamps`` are accepted but not used.
    Always returns ``None``.

    NOTE(review): the source this was reconstructed from had lost its
    indentation; the nesting below is the most plausible reading -- confirm
    against the original file.
    """
    if start < to_pd_timestamp("2008-01-01"):
        start = to_pd_timestamp("2008-01-01")
    # Load the rows already stored in the database for this entity.
    data_schema_df = self.data_schema.query_data(entity_id=entity.id)
    if not data_schema_df.empty and data_schema_df.timestamp.max() <= start:
        from datetime import timedelta
        # bdate_range_date = pd.bdate_range(start, start + timedelta(weeks=1))
        # Business days from ``start`` up to 24 weeks (4 * 6) later.
        bdate_range_date = pd.bdate_range(start, start + timedelta(weeks=4 * 6))
        # Take the last business day after ``start``, capped at the current time.
        if bdate_range_date[bdate_range_date > start][-1] > now_pd_timestamp():
            start = to_time_str(now_pd_timestamp())
        else:
            start = to_time_str(
                bdate_range_date[bdate_range_date > start][-1])
    data = get_index_stocks(to_jq_entity_id(entity), date=start)
    df = pd.DataFrame(data, columns=['code'])
    if pd_is_not_null(df):
        # Fetched codes are split on '.', e.g. '<code>.XSHG' -> code + exchange suffix.
        df['stock_code'] = df['code'].apply(lambda x: str(x).split('.')[0])
        df['stock_exchange'] = df['code'].apply(
            lambda x: str(x).split('.')[1])
        df['stock_exchange'] = df['stock_exchange'].replace(
            'XSHG', 'sh').replace('XSHE', 'sz')
        df['stock_id'] = df['stock_code'].apply(
            lambda x: china_stock_code_to_id(x))
        # One StockDetail lookup per constituent to obtain its name.
        df['stock_name'] = df.stock_id.apply(
            lambda x: StockDetail.query_data(entity_id=x).name[0])
        # NOTE(review): df_old is never read after this assignment.
        df_old = df[df.stock_id.isin(data_schema_df.stock_id)].copy()
        # Fetched stocks that are not stored yet (additions).
        df_new = df[~df.stock_id.isin(data_schema_df.stock_id)].copy()
        # Stored stocks that are no longer in the fetched list (removals).
        # This is a copy taken before data_schema_df is modified below.
        out_data = data_schema_df[~data_schema_df.stock_id.isin(df.stock_id)].copy()
        if df_new.empty and out_data.empty:
            # No stocks added and none removed: only refresh the dates.
            data_schema_df['timestamp'] = pd.to_datetime(start)
            data_schema_df['pub_date'] = data_schema_df['timestamp']
            df_to_db(df=data_schema_df,
                     data_schema=self.data_schema,
                     provider=self.provider,
                     force_update=self.force_update)
            return None
        elif df_new.empty and not out_data.empty:
            # Nothing added, some stocks removed.
            data_schema_df['timestamp'] = pd.to_datetime(start)
            data_schema_df['pub_date'] = data_schema_df['timestamp']
            # NOTE(review): DataFrame.append, used below, was removed in
            # pandas 2.0; this code requires an older pandas.
            df2 = pd.DataFrame()
            for index, data_old in data_schema_df.iterrows():
                schema_details = out_data.query(
                    "id == @data_old.id").copy()
                if not schema_details.empty:
                    # NOTE(review): the two data_schema_df assignments in
                    # this branch repeat the values set before the loop;
                    # schema_details itself only receives out_date here.
                    data_schema_df['timestamp'] = pd.to_datetime(start)
                    # Update the removal (out) date.
                    schema_details['out_date'] = pd.to_datetime(start)
                    data_schema_df['pub_date'] = pd.to_datetime(start)
                    df2 = df2.append(schema_details)
                else:
                    df2 = df2.append(data_old)
            df_to_db(df=df2,
                     data_schema=self.data_schema,
                     provider=self.provider,
                     force_update=self.force_update)
            return None
        elif not df_new.empty and not out_data.empty:
            # Some stocks added and some removed.
            data_schema_df['timestamp'] = pd.to_datetime(start)
            data_schema_df['pub_date'] = data_schema_df['timestamp']
            df2 = pd.DataFrame()
            # Handle removals.
            for index, data_old in data_schema_df.iterrows():
                schema_details = out_data.query(
                    "id == @data_old.id").copy()
                if not schema_details.empty:
                    # Update the removal (out) date.
                    schema_details['out_date'] = pd.to_datetime(start)
                    schema_details['pub_date'] = pd.to_datetime(start)
                    df2 = df2.append(schema_details)
                else:
                    # Rows that were not removed stay unchanged.
                    df2 = df2.append(data_old)
            # Handle additions (new rows).
            df_new['timestamp'] = pd.to_datetime(start)
            df_new['pub_date'] = df_new['timestamp']
            df_new['into_date'] = start
            # 2200-01-01 is written as out_date for rows that have not been removed.
            df_new['out_date'] = pd.to_datetime("2200-01-01")
            df_new['pub_date2'] = df_new['timestamp'].apply(
                lambda x: to_time_str(x))
            df_new['stock_code'] = df_new['code'].apply(
                lambda x: str(x).split('.')[0])
            df_new['entity_id'] = entity.id
            df_new['id'] = df_new[['entity_id', 'stock_id']].apply(
                lambda x: '_'.join(x.astype(str)), axis=1)
            # 'code' held the fetched stock code until here; from now on it
            # holds the entity's code.
            df_new['code'] = entity.code
            df_new['entity_type'] = entity.entity_type
            df_new['exchange'] = entity.exchange
            df_new['name'] = entity.name
            df2 = df2.append(df_new)
            df_to_db(df=df2,
                     data_schema=self.data_schema,
                     provider=self.provider,
                     force_update=self.force_update)
            return None
        elif not df_new.empty and out_data.empty:
            # Handle additions (new rows); the stored rows are not rewritten.
            df_new['timestamp'] = pd.to_datetime(start)
            df_new['pub_date'] = df_new['timestamp']
            df_new['into_date'] = start
            # 2200-01-01 is written as out_date for rows that have not been removed.
            df_new['out_date'] = pd.to_datetime("2200-01-01")
            df_new['pub_date2'] = df_new['timestamp'].apply(
                lambda x: to_time_str(x))
            df_new['stock_code'] = df_new['code'].apply(
                lambda x: str(x).split('.')[0])
            df_new['entity_id'] = entity.id
            df_new['id'] = df_new[['entity_id', 'stock_id']].apply(
                lambda x: '_'.join(x.astype(str)), axis=1)
            # 'code' held the fetched stock code until here; from now on it
            # holds the entity's code.
            df_new['code'] = entity.code
            df_new['entity_type'] = entity.entity_type
            df_new['exchange'] = entity.exchange
            df_new['name'] = entity.name
            df_to_db(df=df_new,
                     data_schema=self.data_schema,
                     provider=self.provider,
                     force_update=self.force_update)
            return None
        else:
            # NOTE(review): the four branches above cover every combination
            # of df_new.empty / out_data.empty, so this looks unreachable.
            print('1')
        # self.logger.info(df.tail())
        # NOTE(review): every reachable branch above returns first, so with
        # this nesting the success message is never logged -- confirm.
        self.logger.info(f"persist etf {entity.code} portfolio success")
    return None