def fetch_A_secucode_innercode_map(juyuan_conn: PyMysqlPoolBase):
    '''Resolve any A-share SecuCode, listed or delisted, before or after a code change, to its unique InnerCode.'''
    sql = """
    (
        select A.SecuCode, A.InnerCode, A.SecuAbbr, A.ListedDate, B.ChangeDate '退市日期'
        from gildata.SecuMain A
        left join gildata.LC_ListStatus B
            on A.InnerCode = B.InnerCode and B.ChangeType = 4
        where A.SecuMarket in (83, 90)
            and A.SecuCategory in (1, 41)
            and A.ListedSector in (1, 2, 6, 7)
            and A.ListedDate <= CURDATE()
    )
    UNION
    (
        SELECT A.SecuCode, A.InnerCode, B.SecuAbbr, A.BeginDate '启用日期', A.StopDate '停用日期'
        FROM gildata.LC_CodeChange A
        JOIN gildata.SecuMain B
            ON A.InnerCode = B.InnerCode
            AND B.SecuMarket IN (83, 90)
            AND B.SecuCategory in (1, 41)
        WHERE A.CodeDefine = 1
            AND A.SecuCode <> B.SecuCode
    );
    """
    exec_res = juyuan_conn.select_all(sql)
    map1 = {}
    for one in exec_res:
        map1[one['SecuCode']] = one['InnerCode']

    sql = f'''select InnerCode, SecuMarket from secumain where InnerCode in {tuple(map1.values())};'''
    res = juyuan_conn.select_all(sql)
    map2 = {}
    for r in res:
        map2[r['InnerCode']] = r['SecuMarket']

    # Prefix the bare code with its exchange: SecuMarket 83 -> Shanghai, 90 -> Shenzhen.
    info = {}
    for k, v in map1.items():
        if map2[v] == 83:
            info["SH" + k] = v
        elif map2[v] == 90:
            info["SZ" + k] = v
        else:
            raise ValueError(f"unexpected SecuMarket {map2[v]} for InnerCode {v}")
    return info
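# Usage sketch (assumptions: `juyuan_conn` is an already constructed PyMysqlPoolBase
# pointing at the JuYuan database, and the two codes below are hypothetical placeholders
# for a rename pair, not verified data). A security's pre-rename and post-rename codes
# should resolve to the same InnerCode.
code_map = fetch_A_secucode_innercode_map(juyuan_conn)
print(code_map.get('SZ000022'))  # hypothetical old code
print(code_map.get('SZ001872'))  # hypothetical new code of the same security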
def send_crawl_overview():
    conn = PyMysqlPoolBase(host=SPIDER_MYSQL_HOST, port=SPIDER_MYSQL_PORT, user=SPIDER_MYSQL_USER,
                           password=SPIDER_MYSQL_PASSWORD, db=SPIDER_MYSQL_DB)
    # table name -> datetime column used to count yesterday's new rows
    spiders_info = {
        'juchao_ant2': 'AntTime',
        'juchao_ant_finance': 'AntTime',
        'juchao_kuaixun': 'pub_date',
        'announcement_base2': 'PubDatetime1',
        # TODO: tables from the v1 pipeline
        'juchao_ant': 'AntTime',
        'announcement_base': 'PubDatetime1',
        'spy_announcement_data': 'pub_datetime',
        'an_announcement_secu_ref': 'update_time',
    }
    msg = ''
    for table_name, field in spiders_info.items():
        count = get_inc_num(conn, table_name, field)
        msg += f'{table_name} 相比昨日新增的个数是 {count}\n'
    print(msg)
    ding_msg(msg)
    return msg
def __init__(self):
    self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
    }
    self._spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
def get_inc_num(conn: PyMysqlPoolBase, table_name: str, field: str):
    query_sql = '''
    select count(id) as inc_count from {} where {} >= date_sub(CURDATE(), interval 1 day);
    '''
    query_sql = query_sql.format(table_name, field)
    inc_count = conn.select_one(query_sql).get("inc_count")
    return inc_count
def __init__(self):
    self.merge_table_name = 'announcement_base'
    self.his_table = 'juchao_ant'
    self.live_table = 'juchao_kuaixun'
    self.batch_number = 10000
    self._r_spider_conn = PyMysqlPoolBase(
        host=R_SPIDER_MYSQL_HOST,
        port=R_SPIDER_MYSQL_PORT,
        user=R_SPIDER_MYSQL_USER,
        password=R_SPIDER_MYSQL_PASSWORD,
        db=R_SPIDER_MYSQL_DB,
    )
    self._spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
def __init__(self):
    self.read_spider_conn = PyMysqlPoolBase(
        host=R_SPIDER_MYSQL_HOST,
        port=R_SPIDER_MYSQL_PORT,
        user=R_SPIDER_MYSQL_USER,
        password=R_SPIDER_MYSQL_PASSWORD,
        db=R_SPIDER_MYSQL_DB,
    )
    self.spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
    self.juyuan_conn = PyMysqlPoolBase(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        db=JUY_DB,
    )
    self.batch_count = 10000
    self.codes_notfound = set()
def _init_pool(self, cfg: dict):
    """
    eg. conf = {
        "host": LOCAL_MYSQL_HOST,
        "port": LOCAL_MYSQL_PORT,
        "user": LOCAL_MYSQL_USER,
        "password": LOCAL_MYSQL_PASSWORD,
        "db": LOCAL_MYSQL_DB,
    }
    :param cfg: connection config dict (see example above)
    :return: a PyMysqlPoolBase connection pool built from cfg
    """
    pool = PyMysqlPoolBase(**cfg)
    return pool
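# A minimal usage sketch (assumption: the SPIDER_MYSQL_* constants come from this
# project's config module and are not defined in this snippet). _init_pool simply
# forwards the dict to PyMysqlPoolBase, so the call below builds the same pool and
# exposes the helpers used throughout this module (select_one / select_all / _batch_save).
conf = {
    "host": SPIDER_MYSQL_HOST,
    "port": SPIDER_MYSQL_PORT,
    "user": SPIDER_MYSQL_USER,
    "password": SPIDER_MYSQL_PASSWORD,
    "db": SPIDER_MYSQL_DB,
}
pool = PyMysqlPoolBase(**conf)            # equivalent to what _init_pool(conf) returns
row = pool.select_one('select 1 as ok;')  # quick connectivity check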
def __init__(self):
    self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
    self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
    self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
    self.table_name = 'juchao_kuaixun'
    self.name = '巨潮快讯'
    self._spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
    self._juyuan_conn = PyMysqlPoolBase(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        db=JUY_DB,
    )
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }
class SourceAnnouncementBaseV1(object):
    """Merge the two crawler tables into the announcement base table."""

    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.his_table = 'juchao_ant'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000
        self._r_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

    def daily_update(self, deadline: datetime.datetime = None):
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        # Pull rows from the historical announcement table that changed after the deadline.
        load_sql = '''
        select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, AntTitle as Title1,
               AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1
        from {} where UPDATETIMEJZ > '{}';
        '''.format(self.his_table, deadline)
        logger.info(load_sql)
        items = []
        datas = self._r_spider_conn.select_all(load_sql)
        logger.info(len(datas))
        for data in datas:
            data = utils.process_secucode(data)
            if data:
                items.append(data)
        self._spider_conn._batch_save(items, self.merge_table_name, [
            'SecuCode', 'SecuAbbr', 'PubDatetime1', 'InsertDatetime1', 'Title1'
        ])

        # Backfill the quick-news fields for announcements that also appear in juchao_kuaixun.
        update_sql = '''
        select A.* from juchao_kuaixun A, juchao_ant B
        where B.UPDATETIMEJZ > '{}' and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告';
        '''.format(deadline)
        datas = self._r_spider_conn.select_all(update_sql)
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item, 'PDFLink', data.get("link"))
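# Usage sketch for SourceAnnouncementBaseV1 (assumptions: it is invoked once a day by a
# scheduler; the commented call only illustrates back-filling from a wider, hypothetical
# cut-off date rather than the default one-day window).
merger = SourceAnnouncementBaseV1()
merger.daily_update()                                          # merge rows updated since yesterday
# merger.daily_update(deadline=datetime.datetime(2021, 1, 1))  # hypothetical back-fill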
class JuchaoLiveNewsSpider(object):
    """Spider for the cninfo quick-news (巨潮快讯) feed."""

    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        Look up the Chinese short name of a security in the JuYuan database.
        :param code: security code without exchange prefix
        :return: the security abbreviation
        """
        sql = '''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.select_one(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """Follow redirects and return the final URL."""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[len(redit_list) - 1].headers["location"]
        except IndexError:
            # No redirect happened; keep the original link.
            redit_link = link
        except:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # Retried automatically on timeout or ban.
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            text = resp.text
            datas = json.loads(text)
            if not datas:
                print("{} 无公告数据".format(url))
            else:
                for data in datas:
                    item = {}
                    # Fields to persist: publish time, type, title, link, security code and name.
                    announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                    announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                    item['pub_date'] = announcementTime
                    item['type'] = data.get("type")
                    item['title'] = data.get("title")
                    page_path = data.get("pagePath")
                    if page_path is None:
                        link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                            data.get("textId"), day_string)
                    else:
                        try:
                            link = self.get_redit_link(page_path)
                        except:
                            link = None
                    if not link:
                        continue
                    item['link'] = link
                    code = data.get("code")
                    if code:
                        item['code'] = code  # security code without exchange prefix
                        item['name'] = self.get_secu_abbr(code)
                    items.append(item)
        return items

    def start(self, start_dt: datetime.datetime = None, end_dt: datetime.datetime = None):
        """Entry point."""
        # Use the given end date, otherwise today.
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)

        # Use the given start date, otherwise three days before the end date.
        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)

        # Walk day by day from the end date back to the start date, both ends inclusive.
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn._batch_save(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)
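# Usage sketch for JuchaoLiveNewsSpider (assumption: datetime is already imported at
# module level, as the class itself requires). Called with no arguments, start() crawls
# from today back three days; the commented call shows an explicit, hypothetical range.
spider = JuchaoLiveNewsSpider()
spider.start()
# spider.start(start_dt=datetime.datetime(2020, 8, 1), end_dt=datetime.datetime(2020, 8, 13))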
if __name__ == '__main__':
    # ding_msg('just test')
    # send_crawl_overview()
    juyuanconn = PyMysqlPoolBase(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        db=JUY_DB,
    )
    ret = fetch_A_secucode_innercode_map(juyuanconn)
    print(ret)
class LoadOriginAnnData:
    spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
    if LOCAL:
        product_read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
    else:
        product_read_spider_conn = spider_conn

    # cninfo category key -> (Chinese name, numeric category code)
    category_name_code_map = {
        'category_bcgz_szsh': ('补充更正', 19),
        'category_bndbg_szsh': ('半年报', 2),
        'category_cqdq_szsh': ('澄清致歉', 12),
        'category_dshgg_szsh': ('董事会', 18),
        'category_fxts_szsh': ('风险提示', 21),
        'category_gddh_szsh': ('股东大会', 15),
        'category_gqbd_szsh': ('股权变动', 16),
        'category_gqjl_szsh': ('股权激励', 17),
        'category_gszl_szsh': ('公司治理', 24),
        'category_gszq_szsh': ('公司债', 25),
        'category_jj_szsh': ('解禁', 9),
        'category_jshgg_szsh': ('监事会', 14),
        'category_kzzq_szsh': ('可转债', 22),
        'category_ndbg_szsh': ('年报', 4),
        'category_pg_szsh': ('配股', 7),
        'category_qtrz_szsh': ('其他融资', 23),
        'category_qyfpxzcs_szsh': ('权益分派', 11),
        'category_rcjy_szsh': ('日常经营', 10),
        'category_sf_szsh': ('首发', 8),
        'category_sjdbg_szsh': ('三季报', 3),
        'category_tbclts_szsh': ('特别处理和退市', 13),
        'category_tszlq_szsh': ('退市整理期', 20),
        'category_yjdbg_szsh': ('一季报', 1),
        'category_yjygjxz_szsh': ('业绩预告', 5),
        'category_zf_szsh': ('增发', 6),
        'category_zj_szsh': ('中介报告', 26),
        'category_others': ('其他', 27),
    }
    batch_num = 10000
    fields = [
        'secu_codes', 'category_codes', 'ann_classify', 'title', 'pdf_link', 'pub_datetime'
    ]

    def start(self):
        sql_create_table = '''
        CREATE TABLE IF NOT EXISTS `spy_announcement_data` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '主键',
          `cninfo_announcement_id` bigint(20) NOT NULL COMMENT '巨潮公告ID',
          `secu_codes` varchar(100) COLLATE utf8_bin NOT NULL COMMENT '关联证券代码, 多个代码使用逗号(,)分隔',
          `category_codes` varchar(200) NOT NULL COMMENT '巨潮分类,多个分类代码使用(,)分隔',
          `ann_classify` tinyint(4) NOT NULL COMMENT '公告分类(1:沪深 2:港股 3:三板 4:基金 5:债券 6:监管 7:预披露)',
          `title` varchar(1000) COLLATE utf8_bin NOT NULL COMMENT '公告标题',
          `pdf_link` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '公告pdf地址',
          `pub_datetime` datetime NOT NULL COMMENT '公告发布时间(巨潮公告速递栏目中的时间)',
          `create_by` int(11) NOT NULL COMMENT '创建人',
          `update_by` int(11) NOT NULL COMMENT '更新人',
          `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `update_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`),
          UNIQUE KEY `cn_ann_id` (`cninfo_announcement_id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='公告采集原始表';
        '''
        sql_create_table2 = '''
        CREATE TABLE IF NOT EXISTS `an_announcement_secu_ref` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '主键',
          `ann_id` int(20) NOT NULL COMMENT 'spy_announcement_data 表 id',
          `secu_id` int(20) NOT NULL COMMENT '公告关联证券',
          `create_by` int(11) NOT NULL COMMENT '创建人',
          `update_by` int(11) NOT NULL COMMENT '更新人',
          `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `update_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`),
          UNIQUE KEY `annid_secuid` (`ann_id`, `secu_id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='证券公告关联表';
        '''
        self.spider_conn._exec_sql(sql_create_table)
        self.spider_conn._exec_sql(sql_create_table2)

        sql_fetch_maxid = '''select max(id) from juchao_ant2 where AntTime <= '2021-02-18'; '''
        max_id = self.product_read_spider_conn.select_one(sql_fetch_maxid).get("max(id)")
        print(max_id)

        sql_load = '''
        select id as id, AntId as cninfo_announcement_id, SecuCode as secu_codes, CategoryCode,
               AntTitle as title, AntDoc as pdf_link, AntTime as pub_datetime
        from juchao_ant2
        where AntTime <= '2021-02-18' and id >= {} and id < {};
        '''
        items = []
        for i in range(int(max_id / self.batch_num) + 1):
            sl = sql_load.format(i * self.batch_num, i * self.batch_num + self.batch_num)
            print(sl)
            juchao_ants_datas = self.product_read_spider_conn.select_all(sl)
            print(len(juchao_ants_datas))
            for data in juchao_ants_datas:
                if data.get('CategoryCode') == '':
                    # No category on the source row: drop the raw key and fall back to "其他" (27).
                    data.pop('CategoryCode')
                    cate_code = 27
                else:
                    try:
                        cate_code = self.category_name_code_map[data.pop('CategoryCode')][1]
                    except KeyError:
                        raise ValueError(f'abnormal data: {data}')
                data['category_codes'] = cate_code
                data['create_by'] = 0
                data['update_by'] = 0
                data['ann_classify'] = 1
                items.append(data)
                if len(items) > 100:
                    self.spider_conn._batch_save(items, 'spy_announcement_data', self.fields)
                    items = []
        self.spider_conn._batch_save(items, 'spy_announcement_data', self.fields)
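# Usage sketch for LoadOriginAnnData (assumption: this is a one-off back-fill script run
# manually). start() creates the target tables if they do not exist, then copies
# juchao_ant2 rows up to the hard-coded cut-off date in id-range batches.
loader = LoadOriginAnnData()
loader.start()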
def __init__(self):
    self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
    }
    self._spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
    self.cat_map = {
        'category_bcgz_szsh': ('补充更正', 19),
        'category_bndbg_szsh': ('半年报', 2),
        'category_cqdq_szsh': ('澄清致歉', 12),
        'category_dshgg_szsh': ('董事会', 18),
        'category_fxts_szsh': ('风险提示', 21),
        'category_gddh_szsh': ('股东大会', 15),
        'category_gqbd_szsh': ('股权变动', 16),
        'category_gqjl_szsh': ('股权激励', 17),
        'category_gszl_szsh': ('公司治理', 24),
        'category_gszq_szsh': ('公司债', 25),
        'category_jj_szsh': ('解禁', 9),
        'category_jshgg_szsh': ('监事会', 14),
        'category_kzzq_szsh': ('可转债', 22),
        'category_ndbg_szsh': ('年报', 4),
        'category_pg_szsh': ('配股', 7),
        'category_qtrz_szsh': ('其他融资', 23),
        'category_qyfpxzcs_szsh': ('权益分派', 11),
        'category_rcjy_szsh': ('日常经营', 10),
        'category_sf_szsh': ('首发', 8),
        'category_sjdbg_szsh': ('三季报', 3),
        'category_tbclts_szsh': ('特别处理和退市', 13),
        'category_tszlq_szsh': ('退市整理期', 20),
        'category_yjdbg_szsh': ('一季报', 1),
        'category_yjygjxz_szsh': ('业绩预告', 5),
        'category_zf_szsh': ('增发', 6),
        'category_zj_szsh': ('中介报告', 26),
        'category_others': ('其他', 27),
    }
class AnnSecuRef(object):
    def __init__(self):
        self.read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
        self.spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )
        self.juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )
        self.batch_count = 10000
        self.codes_notfound = set()

    def fetch_bas_secumain(self) -> dict:
        # Load all of bas_secumain into memory as a secu_code -> id map.
        bas_sql = '''select id, secu_code from bas_secumain where secu_category in (1, 41); '''
        bas_datas = self.read_spider_conn.select_all(bas_sql)
        bas_map = {}
        for data in bas_datas:
            bas_map[data['secu_code']] = data['id']
        return bas_map

    def update_rename_codes(self, bas_map: dict) -> dict:
        # Extend the map with pre-rename codes so old codes resolve to the same secu_id.
        sql_fetch_raname_secucodes = '''
        SELECT A.SecuCode 'old_code', B.SecuCode 'new_code', A.*
        FROM gildata.LC_CodeChange A
        JOIN gildata.SecuMain B
            ON A.InnerCode = B.InnerCode
            AND B.SecuMarket IN (83,90)
            AND B.SecuCategory = 1
        WHERE A.CodeDefine = 1
            AND A.SecuCode <> B.SecuCode
        ORDER BY A.StopDate;
        '''
        rename_datas = self.juyuan_conn.select_all(sql_fetch_raname_secucodes)
        for rename_data in rename_datas:
            new_code = rename_data['new_code']
            secu_id = bas_map[new_code]
            old_code = rename_data['old_code']
            bas_map.update({old_code: secu_id})
        return bas_map

    def get_max_spyid(self):
        # Current max id of the crawler source table.
        sql_get_maxid = '''select max(id) from spy_announcement_data; '''
        max_id = self.read_spider_conn.select_one(sql_get_maxid).get("max(id)")
        return max_id

    def process_spy_datas(self, origin_ann_datas: list, bas_map: dict):
        # Turn crawler rows into rows for the announcement/security reference table.
        items = []
        for origin_data in origin_ann_datas:
            item = dict()
            item['ann_id'] = origin_data['id']
            secu_codes = origin_data['secu_codes']
            # For SH/SZ announcements, secu_codes currently holds a single code.
            secu_id = bas_map.get(secu_codes)
            if secu_id is None:
                logger.warning(secu_codes)
                self.codes_notfound.add(secu_codes)
                continue
            item['secu_id'] = secu_id
            item['create_by'] = 0
            item['update_by'] = 0
            # print(item)
            items.append(item)
            if len(items) > 10000:
                count = self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
                logger.debug(count)
                items = []
        self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
        logger.warning(f'未匹配证券代码: {self.codes_notfound}')

    def diff_ids(self):
        '''Compare the ids of the two tables and process only the missing ones.

        Run this repair pass when the tables are suspected to be out of sync.
        '''
        sql = '''select id from spy_announcement_data; '''
        ret = self.read_spider_conn.select_all(sql)
        spy_ids = set([r.get("id") for r in ret])
        print(len(spy_ids))

        sql = '''select ann_id from an_announcement_secu_ref; '''
        ret = self.spider_conn.select_all(sql)
        ref_ids = set([r.get("ann_id") for r in ret])
        print(len(ref_ids))

        diff_ids = tuple(spy_ids - ref_ids)
        logger.info(len(diff_ids))
        if not diff_ids:
            # Nothing missing; an empty tuple would also produce invalid SQL below.
            return

        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        sql = f'''select * from spy_announcement_data where id in {diff_ids}'''
        spy_datas = self.read_spider_conn.select_all(sql)
        logger.info(len(spy_datas))
        self.process_spy_datas(spy_datas, bas_map)

    def init_load(self):
        # Initial full load, run when the reference table is empty.
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        max_id = self.get_max_spyid()
        logger.info(f'Now max(id) of spy_announcement_data is {max_id}')
        for i in range(int(max_id / self.batch_count) + 1):
            _start = i * self.batch_count
            _end = i * self.batch_count + self.batch_count
            sql = f'''select id, secu_codes from spy_announcement_data where id >= {_start} and id < {_end}; '''
            origin_ann_datas = self.read_spider_conn.select_all(sql)
            self.process_spy_datas(origin_ann_datas, bas_map)

    def daily_sync(self, start_dt: datetime.datetime = None):
        # Daily sync of the reference table once new rows exist in the crawler source table.
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        if start_dt is None:
            # Default the window to the start of the current day.
            start_dt = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
        sql = f'''select id, secu_codes from spy_announcement_data where update_time >= '{start_dt}'; '''
        origin_ann_datas = self.read_spider_conn.select_all(sql)
        self.process_spy_datas(origin_ann_datas, bas_map)
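# Usage sketch for AnnSecuRef (assumptions: init_load() is run once while the reference
# table is still empty, daily_sync() runs from a scheduler afterwards, and diff_ids() is
# a manual repair pass when the two tables appear to have drifted apart).
ref = AnnSecuRef()
ref.init_load()    # first full load
ref.daily_sync()   # incremental sync for today
# ref.diff_ids()   # insert only the ids missing from the reference table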