Example #1
def fetch_A_secucode_innercode_map(juyuan_conn: PyMysqlPoolBase):
    '''
    Given any SecuCode, whether the security is listed or delisted, and whether the code is pre- or post-change, this map yields its unique InnerCode.
    '''

    sql = """
    (
    select A.SecuCode, A.InnerCode, A.SecuAbbr, A.ListedDate, B.ChangeDate '退市日期'
    from gildata.SecuMain A
    left join 
    gildata.LC_ListStatus B 
    on A.InnerCode = B.InnerCode 
    and B.ChangeType = 4 
    where 
    A.SecuMarket in (83, 90) 
    and A.SecuCategory in (1, 41) 
    and A.ListedSector in (1, 2, 6, 7) 
    and A.ListedDate <= CURDATE()
    )
    UNION
    (
    SELECT
    A.SecuCode, A.InnerCode, B.SecuAbbr, A.BeginDate '启用日期', A.StopDate '停用日期'
    FROM gildata.LC_CodeChange A
    JOIN gildata.SecuMain B
    ON A.InnerCode = B.InnerCode
    AND B.SecuMarket IN (83,90)
    AND B.SecuCategory in (1, 41) 
    WHERE
    A.CodeDefine = 1
    AND A.SecuCode <> B.SecuCode
    ); 
    """

    exec_res = juyuan_conn.select_all(sql)
    map1 = {}
    for one in exec_res:
        map1[one['SecuCode']] = one['InnerCode']

    sql = f'''select InnerCode, SecuMarket from secumain where InnerCode in {tuple(map1.values())};'''
    res = juyuan_conn.select_all(sql)
    map2 = {}
    for r in res:
        map2[r['InnerCode']] = r['SecuMarket']
    info = {}

    for k, v in map1.items():
        if map2[v] == 83:
            info["SH" + k] = v
        elif map2[v] == 90:
            info["SZ" + k] = v
        else:
            raise ValueError(f"unexpected SecuMarket {map2[v]} for InnerCode {v}")

    return info
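
A minimal usage sketch (not from the original source): with the Juyuan connection constants used in the later examples, the returned dict maps exchange-prefixed codes to InnerCode values.

# Hypothetical usage, assuming JUY_* constants and PyMysqlPoolBase are importable
juyuan_conn = PyMysqlPoolBase(host=JUY_HOST, port=JUY_PORT, user=JUY_USER,
                              password=JUY_PASSWD, db=JUY_DB)
secucode_map = fetch_A_secucode_innercode_map(juyuan_conn)
print(secucode_map.get("SH600000"))  # InnerCode for an example SH-listed code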
Example #2
def send_crawl_overview():
    conn = PyMysqlPoolBase(host=SPIDER_MYSQL_HOST,
                           port=SPIDER_MYSQL_PORT,
                           user=SPIDER_MYSQL_USER,
                           password=SPIDER_MYSQL_PASSWORD,
                           db=SPIDER_MYSQL_DB)
    spiders_info = {
        'juchao_ant2': 'AntTime',
        'juchao_ant_finance': 'AntTime',
        'juchao_kuaixun': 'pub_date',
        'announcement_base2': 'PubDatetime1',

        # TODO: tables from the v1 pipeline
        'juchao_ant': 'AntTime',
        'announcement_base': 'PubDatetime1',
        'spy_announcement_data': 'pub_datetime',
        'an_announcement_secu_ref': 'update_time',
    }

    msg = ''
    for table_name, field in spiders_info.items():
        count = get_inc_num(conn, table_name, field)
        msg += f'{table_name} 相比昨日新增的个数是 {count}\n'

    print(msg)
    ding_msg(msg)
    return msg
Example #3
 def __init__(self):
     self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
     self.headers = {
         'Accept':
         '*/*',
         'Accept-Encoding':
         'gzip, deflate',
         'Accept-Language':
         'zh-CN,zh;q=0.9,en;q=0.8',
         'Cache-Control':
         'no-cache',
         'Connection':
         'keep-alive',
         'Content-Type':
         'application/x-www-form-urlencoded; charset=UTF-8',
         'Host':
         'www.cninfo.com.cn',
         'Origin':
         'http://www.cninfo.com.cn',
         'Pragma':
         'no-cache',
         'Referer':
         'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
     }
     self._spider_conn = PyMysqlPoolBase(
         host=SPIDER_MYSQL_HOST,
         port=SPIDER_MYSQL_PORT,
         user=SPIDER_MYSQL_USER,
         password=SPIDER_MYSQL_PASSWORD,
         db=SPIDER_MYSQL_DB,
     )
Example #4
def get_inc_num(conn: PyMysqlPoolBase, table_name: str, field: str):
    query_sql = '''
    select count(id) as inc_count from {} where {} >= date_sub(CURDATE(), interval 1 day);
    '''
    query_sql = query_sql.format(table_name, field)
    inc_count = conn.select_one(query_sql).get("inc_count")
    return inc_count
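
A short usage sketch (an assumption, not part of the source): table_name and field are interpolated with str.format, so they should only come from a trusted mapping such as spiders_info above.

# Hypothetical usage with the spider database constants from the other examples
conn = PyMysqlPoolBase(host=SPIDER_MYSQL_HOST, port=SPIDER_MYSQL_PORT,
                       user=SPIDER_MYSQL_USER, password=SPIDER_MYSQL_PASSWORD,
                       db=SPIDER_MYSQL_DB)
# Count rows in juchao_ant2 whose AntTime falls within the last day
print(get_inc_num(conn, 'juchao_ant2', 'AntTime'))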
Example #5
    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.his_table = 'juchao_ant'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000

        self._r_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )
Example #6
    def __init__(self):
        self.read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )

        self.spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

        self.juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )

        self.batch_count = 10000
        self.codes_notfound = set()
Example #7
 def _init_pool(self, cfg: dict):
     """
     eg.
     conf = {
             "host": LOCAL_MYSQL_HOST,
             "port": LOCAL_MYSQL_PORT,
             "user": LOCAL_MYSQL_USER,
             "password": LOCAL_MYSQL_PASSWORD,
             "db": LOCAL_MYSQL_DB,
     }
     :param cfg:
     :return:
     """
     pool = PyMysqlPoolBase(**cfg)
     return pool
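
A sketch of how _init_pool might be used (the LOCAL_MYSQL_* names are the placeholder constants from the docstring above, not definitions in this file):

conf = {
    "host": LOCAL_MYSQL_HOST,
    "port": LOCAL_MYSQL_PORT,
    "user": LOCAL_MYSQL_USER,
    "password": LOCAL_MYSQL_PASSWORD,
    "db": LOCAL_MYSQL_DB,
}
# self._init_pool(conf) simply unpacks the dict into the pool constructor:
pool = PyMysqlPoolBase(**conf)
rows = pool.select_all('select 1;')  # hypothetical smoke test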
Example #8
 def __init__(self):
     self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
     self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
     self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
     self.table_name = 'juchao_kuaixun'
     self.name = '巨潮快讯'
     self._spider_conn = PyMysqlPoolBase(
         host=SPIDER_MYSQL_HOST,
         port=SPIDER_MYSQL_PORT,
         user=SPIDER_MYSQL_USER,
         password=SPIDER_MYSQL_PASSWORD,
         db=SPIDER_MYSQL_DB,
     )
     self._juyuan_conn = PyMysqlPoolBase(
         host=JUY_HOST,
         port=JUY_PORT,
         user=JUY_USER,
         password=JUY_PASSWD,
         db=JUY_DB,
     )
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                       'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
     }
Example #9
class SourceAnnouncementBaseV1(object):
    """将两个爬虫表合并生成公告基础表"""
    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.his_table = 'juchao_ant'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000

        self._r_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

    def daily_update(self, deadline: datetime.datetime = None):
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(load_sql)

        items = []
        datas = self._r_spider_conn.select_all(load_sql)
        logger.info(len(datas))
        for data in datas:
            data = utils.process_secucode(data)
            if data:
                items.append(data)
        self._spider_conn._batch_save(items, self.merge_table_name, [
            'SecuCode', 'SecuAbbr', 'PubDatetime1', 'InsertDatetime1', 'Title1'
        ])

        update_sql = '''
        select A.* from juchao_kuaixun A, juchao_ant B where B.UPDATETIMEJZ > '{}' and A.code = B.SecuCode \
and A.link = B.AntDoc and A.type = '公告';  '''.format(deadline)
        datas = self._r_spider_conn.select_all(update_sql)
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item,
                                           'PDFLink', data.get("link"))
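
A hedged usage sketch for the class above: the incremental merge can be run with the default one-day lookback or replayed from an explicit deadline.

# Hypothetical invocation; requires the *_SPIDER_MYSQL_* constants to be configured
merger = SourceAnnouncementBaseV1()
merger.daily_update()                                         # rows updated in the last day
merger.daily_update(deadline=datetime.datetime(2021, 2, 18))  # replay from a fixed point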
Example #10
class JuchaoLiveNewsSpider(object):
    """巨潮快讯爬虫"""
    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        从聚源数据库获取证券代码对应的中文简称
        :param code: 无前缀的证券代码
        :return:
        """
        sql = f'''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.select_one(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """获取最终重定向后的网址"""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[-1].headers["location"]
        except IndexError:
            redit_link = link
        except Exception:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # Retry when the request times out or is blocked
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            text = resp.text
            datas = json.loads(text)
            if not datas:
                print("{} 无公告数据".format(url))
            else:
                for data in datas:
                    item = {}
                    # Fields to save: publish time, type, title, link, stock code, stock name
                    announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                    announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                    item['pub_date'] = announcementTime
                    item['type'] = data.get("type")
                    item['title'] = data.get("title")
                    page_path = data.get("pagePath")
                    if page_path is None:
                        link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                            data.get("textId"), day_string)
                    else:
                        try:
                            link = self.get_redit_link(page_path)
                        except:
                            link = None
                    if not link:
                        continue
                    item['link'] = link
                    code = data.get("code")
                    if code:
                        item['code'] = code  # 无前缀的证券代码
                        item['name'] = self.get_secu_abbr(code)
                        items.append(item)
        return items

    def start(self, start_dt: datetime.datetime = None, end_dt: datetime.datetime = None):
        """启动入口"""
        # 使用传入的结束时间否则置为今天
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)

        # Use the provided start time, otherwise default to 3 days before the end time
        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)

        # Walk backwards day by day from the end time to the start time, inclusive of both endpoints
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn._batch_save(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)
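
A usage sketch (an assumption, mirroring the defaults coded in start): run the quick-news spider either with its default three-day window or with explicit dates.

# Hypothetical invocation
spider = JuchaoLiveNewsSpider()
spider.start()  # default: from three days ago up to today
spider.start(start_dt=datetime.datetime(2020, 8, 10),
             end_dt=datetime.datetime(2020, 8, 13))  # explicit window, both days included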
Example #11


if __name__ == '__main__':
    # ding_msg('just test')

    # send_crawl_overview()

    juyuanconn = PyMysqlPoolBase(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        db=JUY_DB,
    )

    ret = fetch_A_secucode_innercode_map(juyuanconn)

    print(ret)
Example #12
class LoadOriginAnnData:
    spider_conn = PyMysqlPoolBase(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        db=SPIDER_MYSQL_DB,
    )
    if LOCAL:
        product_read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )
    else:
        product_read_spider_conn = spider_conn

    category_name_code_map = {
        'category_bcgz_szsh': ('补充更正', 19),
        'category_bndbg_szsh': ('半年报', 2),
        'category_cqdq_szsh': ('澄清致歉', 12),
        'category_dshgg_szsh': ('董事会', 18),
        'category_fxts_szsh': ('风险提示', 21),
        'category_gddh_szsh': ('股东大会', 15),
        'category_gqbd_szsh': ('股权变动', 16),
        'category_gqjl_szsh': ('股权激励', 17),
        'category_gszl_szsh': ('公司治理', 24),
        'category_gszq_szsh': ('公司债', 25),
        'category_jj_szsh': ('解禁', 9),
        'category_jshgg_szsh': ('监事会', 14),
        'category_kzzq_szsh': ('可转债', 22),
        'category_ndbg_szsh': ('年报', 4),
        'category_pg_szsh': ('配股', 7),
        'category_qtrz_szsh': ('其他融资', 23),
        'category_qyfpxzcs_szsh': ('权益分派', 11),
        'category_rcjy_szsh': ('日常经营', 10),
        'category_sf_szsh': ('首发', 8),
        'category_sjdbg_szsh': ('三季报', 3),
        'category_tbclts_szsh': ('特别处理和退市', 13),
        'category_tszlq_szsh': ('退市整理期', 20),
        'category_yjdbg_szsh': ('一季报', 1),
        'category_yjygjxz_szsh': ('业绩预告', 5),
        'category_zf_szsh': ('增发', 6),
        'category_zj_szsh': ('中介报告', 26),
        'category_others': ('其他', 27),
    }
    batch_num = 10000
    fields = [
        'secu_codes', 'category_codes', 'ann_classify', 'title', 'pdf_link',
        'pub_datetime'
    ]

    def start(self):
        sql_create_table = '''
        CREATE TABLE IF NOT EXISTS `spy_announcement_data` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '主键',
          `cninfo_announcement_id` bigint(20) NOT NULL COMMENT '巨潮公告ID',
          `secu_codes` varchar(100) COLLATE utf8_bin NOT NULL COMMENT '关联证券代码, 多个代码使用逗号(,)分隔',
          `category_codes` varchar(200) NOT NULL COMMENT '巨潮分类,多个分类代码使用(,)分隔',
          `ann_classify` tinyint(4) NOT NULL COMMENT '公告分类(1:沪深 2:港股 3:三板 4:基金 5:债券 6:监管 7:预披露)',
          `title` varchar(1000) COLLATE utf8_bin NOT NULL COMMENT '公告标题',
          `pdf_link` varchar(1000) COLLATE utf8_bin DEFAULT NULL COMMENT '公告pdf地址',
          `pub_datetime` datetime NOT NULL COMMENT '公告发布时间(巨潮公告速递栏目中的时间)',
          `create_by` int(11) NOT NULL COMMENT '创建人',
          `update_by` int(11) NOT NULL COMMENT '更新人',
          `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `update_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`),
          UNIQUE KEY `cn_ann_id` (`cninfo_announcement_id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='公告采集原始表' ;
        '''

        sql_create_table2 = '''
        CREATE TABLE IF NOT EXISTS `an_announcement_secu_ref` (
          `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '主键',
          `ann_id` int(20) NOT NULL COMMENT 'spy_announcement_data 表 id',
          `secu_id` int(20) NOT NULL COMMENT '公告关联证券',
          `create_by` int(11) NOT NULL COMMENT '创建人',
          `update_by` int(11) NOT NULL COMMENT '更新人',
          `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
          `update_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
          PRIMARY KEY (`id`), 
          UNIQUE KEY `annid_secuid` (`ann_id`, `secu_id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin ROW_FORMAT=DYNAMIC COMMENT='证券公告关联表' ;
        '''
        self.spider_conn._exec_sql(sql_create_table)
        self.spider_conn._exec_sql(sql_create_table2)

        sql_fetch_maxid = '''select max(id) from juchao_ant2 where AntTime <= '2021-02-18' ; '''
        max_id = self.product_read_spider_conn.select_one(sql_fetch_maxid).get(
            "max(id)")
        print(max_id)

        sql_load = '''
        select id as id, 
            AntId as cninfo_announcement_id, 
            SecuCode as secu_codes, 
            CategoryCode, 
            AntTitle as title, 
            AntDoc as pdf_link, 
            AntTime as pub_datetime 
            from juchao_ant2
            where AntTime <= '2021-02-18'
            and id >= {} and id < {} ;  
        '''
        items = []
        for i in range(int(max_id / self.batch_num) + 1):
            sl = sql_load.format(i * self.batch_num,
                                 i * self.batch_num + self.batch_num)
            print(sl)
            juchao_ants_datas = self.product_read_spider_conn.select_all(sl)
            print(len(juchao_ants_datas))
            for data in juchao_ants_datas:
                category_code = data.pop('CategoryCode')
                if not category_code:
                    cate_code = 27
                else:
                    try:
                        cate_code = self.category_name_code_map[category_code][1]
                    except KeyError:
                        raise ValueError(f'{data}数据异常')
                data['category_codes'] = cate_code
                data['create_by'] = 0
                data['update_by'] = 0
                data['ann_classify'] = 1
                items.append(data)
                if len(items) > 100:
                    self.spider_conn._batch_save(items,
                                                 'spy_announcement_data',
                                                 self.fields)
                    items = []

        self.spider_conn._batch_save(items, 'spy_announcement_data',
                                     self.fields)
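
A minimal usage sketch for the loader above (an assumption; no other entry point is shown here): start() creates the two target tables if needed and backfills them from juchao_ant2 in batches.

# Hypothetical invocation
loader = LoadOriginAnnData()
loader.start()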
Example #13
 def __init__(self):
     self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
     self.headers = {
         'Accept':
         '*/*',
         'Accept-Encoding':
         'gzip, deflate',
         'Accept-Language':
         'zh-CN,zh;q=0.9,en;q=0.8',
         'Cache-Control':
         'no-cache',
         'Connection':
         'keep-alive',
         'Content-Type':
         'application/x-www-form-urlencoded; charset=UTF-8',
         'Host':
         'www.cninfo.com.cn',
         'Origin':
         'http://www.cninfo.com.cn',
         'Pragma':
         'no-cache',
         'Referer':
         'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
     }
     self._spider_conn = PyMysqlPoolBase(
         host=SPIDER_MYSQL_HOST,
         port=SPIDER_MYSQL_PORT,
         user=SPIDER_MYSQL_USER,
         password=SPIDER_MYSQL_PASSWORD,
         db=SPIDER_MYSQL_DB,
     )
     self.cat_map = {
         'category_bcgz_szsh': ('补充更正', 19),
         'category_bndbg_szsh': ('半年报', 2),
         'category_cqdq_szsh': ('澄清致歉', 12),
         'category_dshgg_szsh': ('董事会', 18),
         'category_fxts_szsh': ('风险提示', 21),
         'category_gddh_szsh': ('股东大会', 15),
         'category_gqbd_szsh': ('股权变动', 16),
         'category_gqjl_szsh': ('股权激励', 17),
         'category_gszl_szsh': ('公司治理', 24),
         'category_gszq_szsh': ('公司债', 25),
         'category_jj_szsh': ('解禁', 9),
         'category_jshgg_szsh': ('监事会', 14),
         'category_kzzq_szsh': ('可转债', 22),
         'category_ndbg_szsh': ('年报', 4),
         'category_pg_szsh': ('配股', 7),
         'category_qtrz_szsh': ('其他融资', 23),
         'category_qyfpxzcs_szsh': ('权益分派', 11),
         'category_rcjy_szsh': ('日常经营', 10),
         'category_sf_szsh': ('首发', 8),
         'category_sjdbg_szsh': ('三季报', 3),
         'category_tbclts_szsh': ('特别处理和退市', 13),
         'category_tszlq_szsh': ('退市整理期', 20),
         'category_yjdbg_szsh': ('一季报', 1),
         'category_yjygjxz_szsh': ('业绩预告', 5),
         'category_zf_szsh': ('增发', 6),
         'category_zj_szsh': ('中介报告', 26),
         'category_others': ('其他', 27),
     }
Example #14
class AnnSecuRef(object):
    def __init__(self):
        self.read_spider_conn = PyMysqlPoolBase(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            db=R_SPIDER_MYSQL_DB,
        )

        self.spider_conn = PyMysqlPoolBase(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            db=SPIDER_MYSQL_DB,
        )

        self.juyuan_conn = PyMysqlPoolBase(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            db=JUY_DB,
        )

        self.batch_count = 10000
        self.codes_notfound = set()

    def fetch_bas_secumain(self) -> dict:
        # Load the whole bas_secumain table into memory
        bas_sql = '''select id, secu_code from bas_secumain where secu_category in (1, 41); '''
        bas_datas = self.read_spider_conn.select_all(bas_sql)
        bas_map = {}
        for data in bas_datas:
            bas_map[data['secu_code']] = data['id']
        return bas_map

    def update_rename_codes(self, bas_map: dict) -> dict:
        # Extend the mapping with securities whose codes were renamed
        sql_fetch_raname_secucodes = '''
                   SELECT A.SecuCode 'old_code', B.SecuCode 'new_code',
                   A.*
                   FROM gildata.LC_CodeChange A 
                   JOIN gildata.SecuMain B ON A.InnerCode = B.InnerCode AND B.SecuMarket IN (83,90) AND B.SecuCategory = 1
                   WHERE A.CodeDefine = 1 
                   AND A.SecuCode <> B.SecuCode
                   ORDER BY A.StopDate; 
               '''
        rename_datas = self.juyuan_conn.select_all(sql_fetch_raname_secucodes)
        for rename_data in rename_datas:
            new_code = rename_data['new_code']
            secu_id = bas_map[new_code]
            old_code = rename_data['old_code']
            bas_map.update({old_code: secu_id})
        return bas_map

    def get_max_spyid(self):
        # Get the current maximum id in the crawler table
        sql_get_maxid = '''select max(id) from spy_announcement_data; '''
        max_id = self.read_spider_conn.select_one(sql_get_maxid).get("max(id)")
        return max_id

    def process_spy_datas(self, origin_ann_datas: list, bas_map: dict):
        # Process the crawler rows and build the records to insert
        items = []
        for origin_data in origin_ann_datas:
            item = dict()
            item['ann_id'] = origin_data['id']
            secu_codes = origin_data['secu_codes']    # for SH/SZ announcements, secu_codes currently holds a single code
            secu_id = bas_map.get(secu_codes)
            if secu_id is None:
                logger.warning(secu_codes)
                self.codes_notfound.add(secu_codes)
                continue
            item['secu_id'] = secu_id
            item['create_by'] = 0
            item['update_by'] = 0
            # print(item)
            items.append(item)
            if len(items) > 10000:
                count = self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
                logger.debug(count)
                items = []
        self.spider_conn._batch_save(items, 'an_announcement_secu_ref', ['secu_id', ])
        logger.warning(f'未匹配证券代码: {self.codes_notfound}')

    def diff_ids(self):
        '''Compare the ids of the two tables and process only the difference.
        Run this when the two tables are out of sync.
        '''

        sql = '''select id from spy_announcement_data ; '''
        ret = self.read_spider_conn.select_all(sql)
        spy_ids = set([r.get("id") for r in ret])
        print(len(spy_ids))

        sql = '''select ann_id from an_announcement_secu_ref; '''
        ret = self.spider_conn.select_all(sql)
        ref_ids = set([r.get("ann_id") for r in ret])
        print(len(ref_ids))

        diff_ids = tuple(spy_ids - ref_ids)
        logger.info(len(diff_ids))

        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)

        sql = f'''select * from spy_announcement_data where id in {diff_ids}'''
        spy_datas = self.read_spider_conn.select_all(sql)
        logger.info(len(spy_datas))
        self.process_spy_datas(spy_datas, bas_map)

    def init_load(self):
        # Initial import, for when the ref table is empty
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        max_id = self.get_max_spyid()
        logger.info(f'Now max(id) of spy_announcement_data is {max_id}')
        for i in range(int(max_id / self.batch_count) + 1):
            _start = i * self.batch_count
            _end = i * self.batch_count + self.batch_count
            sql = f'''select id, secu_codes from spy_announcement_data where id >= {_start} and id < {_end}; '''
            origin_ann_datas = self.read_spider_conn.select_all(sql)
            self.process_spy_datas(origin_ann_datas, bas_map)

    def daily_sync(self, start_dt: datetime.datetime = None):
        # Routine sync of the ref table when new rows exist in the raw crawler table
        bas_map = self.fetch_bas_secumain()
        bas_map = self.update_rename_codes(bas_map)
        if start_dt is None:
            # On each run, default the start time to the beginning of the current day
            start_dt = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
        sql = f'''select id, secu_codes from spy_announcement_data where update_time >= '{start_dt}'; '''
        origin_ann_datas = self.read_spider_conn.select_all(sql)
        self.process_spy_datas(origin_ann_datas, bas_map)
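
A hedged usage sketch for AnnSecuRef, matching the three entry points defined above: pick the one that fits the current state of the data.

# Hypothetical invocation
ref = AnnSecuRef()
ref.init_load()     # first run, while an_announcement_secu_ref is still empty
ref.daily_sync()    # routine run: link rows updated since the start of today
ref.diff_ids()      # repair run: backfill ids missing from the ref table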