# Esempio n. 1 (scraper artifact — not part of the original source)
class SourceAnnouncementBaseV1(object):
    """Merge the two crawler tables into the announcement base table (V1).

    Rows changed in the historical table (``juchao_ant``) since a deadline are
    normalized and inserted into ``announcement_base``; afterwards matching
    live quick-news rows (``juchao_kuaixun``) backfill the ``*2`` columns,
    matched on the announcement PDF link.
    """

    def __init__(self):
        self.merge_table_name = 'announcement_base'  # merged target table
        self.his_table = 'juchao_ant'                # historical announcements
        self.live_table = 'juchao_kuaixun'           # live quick-news table
        self.batch_number = 10000                    # batch size (reserved)

        # Read-only connection to the spider DB (source of both crawler tables).
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        # Writable connection that owns the merged table.
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )

    def daily_update(self, deadline: datetime.datetime = None):
        """Incrementally merge rows updated after *deadline*.

        :param deadline: only rows with ``UPDATETIMEJZ`` newer than this are
            processed; ``None`` means 24 hours before now.
        """
        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(load_sql)

        datas = self._r_spider_conn.query(load_sql)
        logger.info(len(datas))
        # Keep only rows that survive secucode normalization.
        items = [item for item in map(utils.process_secucode, datas) if item]
        # BUG FIX: 'PDFLink' was missing from the insert column list although
        # the SELECT above aliases AntDoc as PDFLink and the backfill below
        # matches rows on PDFLink; without storing it, the Title2/PubDatetime2
        # update could never locate its target rows.
        self._spider_conn.batch_insert(
            items, self.merge_table_name,
            ['SecuCode', 'SecuAbbr', 'PubDatetime1', 'InsertDatetime1',
             'Title1', 'PDFLink'])

        update_sql = '''
        select A.* from juchao_kuaixun A, juchao_ant B where B.UPDATETIMEJZ > '{}' and A.code = B.SecuCode \
and A.link = B.AntDoc and A.type = '公告';  '''.format(deadline)
        datas = self._r_spider_conn.query(update_sql)
        for data in datas:
            # Backfill the live-news columns onto already-inserted rows.
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item, 'PDFLink', data.get("link"))
class JuchaoLiveNewsSpider(object):
    """巨潮快讯爬虫 — crawler for cninfo (juchao) quick-news announcements."""

    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        # Writable spider DB connection (stores the crawled quick news).
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        # JuYuan reference DB, used for code -> Chinese-abbreviation lookups.
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """Fetch the Chinese abbreviation for a security code from JuYuan.

        :param code: security code without exchange prefix
        :return: the ``SecuAbbr`` string
        """
        sql = f'''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.get(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """Return the final URL after following redirects, or None on failure."""
        try:
            resp = self.my_get(link)
            history = resp.history
            # The last redirect hop carries the final Location header.
            return history[-1].headers["location"]
        except IndexError:
            # No redirect happened: the original link is already final.
            return link
        except Exception:
            # Narrowed from a bare except; keep the best-effort contract of
            # signaling failure with None (the caller skips such entries).
            return None

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # Retried up to 5 times on timeout / ban / transient errors.
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        """Fetch one day's quick-news JSON and convert it to DB-ready dicts.

        :param url: the api_url already formatted with the day.
        :param day_string: 'YYYY-MM-DD', reused when building detail links.
        :return: list of items restricted to entries that carry a stock code.
        """
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            datas = json.loads(resp.text)
            if not datas:
                print("{} 无公告数据".format(url))
                return items
            for data in datas:
                # Saved fields: publish time, type, title, link, code, name.
                ts = data.get("announcementTime")
                if ts is None:
                    # ROBUSTNESS: a missing timestamp previously raised a
                    # TypeError (None / 1000); skip malformed entries instead.
                    continue
                item = {
                    'pub_date': time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime(int(ts / 1000))),
                    'type': data.get("type"),
                    'title': data.get("title"),
                }
                page_path = data.get("pagePath")
                if page_path is None:
                    # No page path: build the generic detail-page link.
                    link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                        data.get("textId"), day_string)
                else:
                    try:
                        link = self.get_redit_link(page_path)
                    except Exception:
                        link = None
                if not link:
                    continue
                item['link'] = link
                code = data.get("code")
                if code:
                    item['code'] = code  # security code without exchange prefix
                    item['name'] = self.get_secu_abbr(code)
                    items.append(item)
        return items

    def start(self,
              start_dt: datetime.datetime = None,
              end_dt: datetime.datetime = None):
        """Crawl day by day from end_dt back to start_dt (both inclusive).

        Defaults: end_dt = today (midnight), start_dt = end_dt - 3 days.
        """
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(),
                                               datetime.time.min)

        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)

        # Walk backwards one day at a time, inserting each day's batch.
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn.batch_insert(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)  # be polite to the server
class JuChaoCodeMap(object):
    """Maintain ``juchao_codemap``: stock code -> cninfo org-id mapping table."""

    def __init__(self):
        self.fields = ['code', 'orgId', 'category', 'pinyin', 'zwjc']
        self.tool_table_name = 'juchao_codemap'
        # Writable spider DB connection that owns the tool table.
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )

    def create_tools_table(self):
        """Create the code-map table if it does not exist yet (idempotent)."""
        sql = '''
        CREATE TABLE IF NOT EXISTS `juchao_codemap` (
            `id` int(11) NOT NULL AUTO_INCREMENT,
            `code` varchar(8) NOT NULL COMMENT '证券代码',
            `orgId` varchar(16) NOT NULL COMMENT '证券编码',
            `category` varchar(8) NOT NULL COMMENT '证券分类',
            `pinyin` varchar(10) NOT NULL COMMENT '证券中文名拼音',
            `zwjc` varchar(20) NOT NULL COMMENT '证券中文名',
            `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
            `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
            PRIMARY KEY (`id`),
            UNIQUE KEY `orgId_code` (`orgId`, `code`),
            KEY `update_time` (`UPDATETIMEJZ`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨潮证券编码';
        '''
        self._spider_conn.execute(sql)

    def get_stock_json(self):
        """Download the full A-share stock list from cninfo and insert it."""
        api = 'http://www.cninfo.com.cn/new/data/szse_a_stock.json?_={}'.format(
            int(time.time() * 1000))
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://uc.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://uc.cninfo.com.cn/user/optionalConfig?groupId=88937',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        # ROBUSTNESS: the original call had no timeout and could hang the job
        # forever on a stalled connection.
        resp = requests.get(api, headers=headers, timeout=30)
        if resp and resp.status_code == 200:
            text = resp.text
            try:
                py_data = json.loads(text).get("stockList")
            except Exception as exc:
                # Narrowed from a bare except; keep the original message but
                # chain the cause for easier debugging.
                print(text)
                raise Exception("resp parse error.") from exc
            self._spider_conn.batch_insert(py_data, self.tool_table_name,
                                           self.fields)

    def start(self):
        """Ensure the table exists, then refresh its contents."""
        self.create_tools_table()
        self.get_stock_json()
# Esempio n. 4 (scraper artifact — not part of the original source)
class SourceAnnouncementBase(object):
    """Merge the two crawler tables into the announcement base table (V2).

    Compared to the V1 merger this variant also resolves the announcement
    category code and the JuYuan ``InnerCode`` for every security before
    inserting into ``announcement_base2``.
    """

    def __init__(self):
        self.merge_table_name = 'announcement_base2'  # merged target table
        self.his_table = 'juchao_ant2'                # historical announcements
        self.live_table = 'juchao_kuaixun'            # live quick-news table
        self.batch_number = 10000                     # batch size (reserved)

        # Read-only connection to the spider DB (source of both crawler tables).
        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        # Writable connection that owns the merged table.
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        # JuYuan reference DB for InnerCode resolution.
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )

    def category_code_map(self):
        """Return the mapping: crawler category key -> (Chinese label, numeric code)."""
        return {
         'category_bcgz_szsh': ('补充更正', 19),
         'category_bndbg_szsh': ('半年报', 2),
         'category_cqdq_szsh': ('澄清致歉', 12),
         'category_dshgg_szsh': ('董事会', 18),
         'category_fxts_szsh': ('风险提示', 21),
         'category_gddh_szsh': ('股东大会', 15),
         'category_gqbd_szsh': ('股权变动', 16),
         'category_gqjl_szsh': ('股权激励', 17),
         'category_gszl_szsh': ('公司治理', 24),
         'category_gszq_szsh': ('公司债', 25),
         'category_jj_szsh': ('解禁', 9),
         'category_jshgg_szsh': ('监事会', 14),
         'category_kzzq_szsh': ('可转债', 22),
         'category_ndbg_szsh': ('年报', 4),
         'category_pg_szsh': ('配股', 7),
         'category_qtrz_szsh': ('其他融资', 23),
         'category_qyfpxzcs_szsh': ('权益分派', 11),
         'category_rcjy_szsh': ('日常经营', 10),
         'category_sf_szsh': ('首发', 8),
         'category_sjdbg_szsh': ('三季报', 3),
         'category_tbclts_szsh': ('特别处理和退市', 13),
         'category_tszlq_szsh': ('退市整理期', 20),
         'category_yjdbg_szsh': ('一季报', 1),
         'category_yjygjxz_szsh': ('业绩预告', 5),
         'category_zf_szsh': ('增发', 6),
         'category_zj_szsh': ('中介报告', 26),
         'category_others': ('其他', 27),
         }

    def load_innercode_map(self):
        """Load secucode -> innercode for all current securities from JuYuan."""
        sql = '''select secucode, innercode from secumain; '''
        rows = self._juyuan_conn.query(sql)
        return {row.get("secucode"): row.get("innercode") for row in rows}

    def get_old_inner_code(self, secucode: str):
        """Look up the InnerCode of a changed/delisted code; None if unknown.

        CONSISTENCY: uses ``%s`` parameter binding like the other JuYuan
        lookups in this module instead of string formatting (also avoids
        SQL-injection via a malformed code).
        """
        sql = '''select InnerCode from LC_CodeChange where secucode = %s;'''
        r = self._juyuan_conn.get(sql, secucode)
        if r:
            return r.get("InnerCode")

    def daily_update(self, deadline: datetime.datetime = None):
        """Incrementally merge rows updated after *deadline* (default: 24h ago)."""
        inner_map = self.load_innercode_map()
        category_code_map = self.category_code_map()
        # Fallback bucket for category keys missing from the mapping.
        default_category = category_code_map['category_others']

        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(f"his load sql is {load_sql}")
        datas = self._r_spider_conn.query(load_sql)
        logger.info(f"load count is {len(datas)} from his table.")
        items = []
        for data in datas:
            # BUG FIX: ``dict.get(key)[1]`` raised TypeError for any category
            # key not in the mapping (subscript on None); unknown categories
            # now fall into the '其他' (other) bucket instead.
            data['CategoryCode'] = category_code_map.get(
                data.get("CategoryCode"), default_category)[1]
            inner_code = inner_map.get(data.get("SecuCode"))  # 爬虫库中的 secucode 无前缀
            if not inner_code:
                # Fall back to the code-change history for renamed securities
                # and cache the hit for the rest of this run.
                inner_code = self.get_old_inner_code(data.get("SecuCode"))
                if not inner_code:
                    continue
                inner_map[data.get("SecuCode")] = inner_code
            data['InnerCode'] = inner_code
            data = utils.process_secucode(data)  # 暂时只要 0 3 6
            if data:
                items.append(data)

        # BUG FIX: 'PDFLink' added to the insert column list — the backfill
        # below matches rows on PDFLink, which therefore must be stored here.
        self._spider_conn.batch_insert(
            items, self.merge_table_name,
            ['SecuCode', 'SecuAbbr', 'CategoryCode', 'PubDatetime1',
             'InsertDatetime1', 'Title1', 'InnerCode', 'PDFLink'])

        update_sql = '''select A.* from juchao_kuaixun A, juchao_ant B where B.UPDATETIMEJZ > '{}' \
and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告';  '''.format(deadline)
        logger.info(f"live sql is {update_sql}")
        datas = self._r_spider_conn.query(update_sql)
        logger.info(f'load count {len(datas)} from live table.')
        for data in datas:
            # Backfill the live-news columns onto already-inserted rows.
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item, 'PDFLink', data.get("link"))