Example #1
def get_inc_num(conn: sql_base.Connection, table_name: str, field: str):
    """Count the rows in `table_name` whose `field` timestamp is newer than one day before the current date."""
    query_sql = '''
    select count(id) as inc_count from {} where {} >= date_sub(CURDATE(), interval 1 day);
    '''
    query_sql = query_sql.format(table_name, field)
    inc_count = conn.get(query_sql).get("inc_count")
    return inc_count
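
A minimal usage sketch for the helper above, assuming the Connection class and the SPIDER_MYSQL_* settings used in the later examples (both are assumed to be imported from elsewhere in the project):

if __name__ == '__main__':
    # Hypothetical wiring; Connection and the SPIDER_MYSQL_* constants are assumed imports.
    conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    # Rows written to juchao_ant2 whose UPDATETIMEJZ falls within the last day.
    print(get_inc_num(conn, 'juchao_ant2', 'UPDATETIMEJZ'))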
Example #2
class UpdateInnerCode(object):
    def __init__(self):
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.batch_num = 10000

    def max_id(self):
        sql = '''select max(id) as max_id from announcement_base2; '''
        return self._spider_conn.get(sql).get("max_id")

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(
            secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code

    def start(self, begin: int, end: int):
        inner_map = self.load_innercode_map()  # secucode has no exchange prefix
        # max_id = self.max_id()
        # print(max_id)

        base_sql = '''select id, secucode from announcement_base2 where id between {} and {} and InnerCode = 0; '''

        for i in range(begin, end):
            sql = base_sql.format(i * self.batch_num,
                                  i * self.batch_num + self.batch_num)
            print(sql)
            datas = self._spider_conn.query(sql)
            print(len(datas))
            for data in datas:
                inner_code = inner_map.get(data.get('secucode')[2:])
                if inner_code is None:
                    inner_code = self.get_old_inner_code(
                        data.get('secucode')[2:])
                    if inner_code is not None:
                        inner_map.update(
                            {data.get('secucode')[2:]: inner_code})
                        self._spider_conn.table_update(
                            'announcement_base2', {'InnerCode': inner_code},
                            "id", data.get("id"))
                else:
                    self._spider_conn.table_update('announcement_base2',
                                                   {'InnerCode': inner_code},
                                                   "id", data.get("id"))
Example #3
class JuchaoLiveNewsSpider(object):
    """Juchao (cninfo) live news spider."""
    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        Fetch the Chinese short name for a security code from the Juyuan database.
        :param code: security code without exchange prefix
        :return:
        """
        sql = '''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.get(sql, code).get("SecuAbbr")
        return name

    def get_redit_link(self, link):
        """Follow redirects and return the final URL."""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[-1].headers["location"]
        except IndexError:
            redit_link = link
        except Exception:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        # retry when the request times out or gets banned
        logger.debug(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def parse(self, url: str, day_string: str):
        items = []
        resp = self.my_get(url)
        if resp and resp.status_code == 200:
            text = resp.text
            datas = json.loads(text)
            if not datas:
                print("{} 无公告数据".format(url))
            else:
                for data in datas:
                    item = {}
                    # Fields to save: detailed publish time, type, title, link, stock code, stock name
                    announcementTime = time.localtime(
                        int(data.get("announcementTime") / 1000))
                    announcementTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                     announcementTime)
                    item['pub_date'] = announcementTime
                    item['type'] = data.get("type")
                    item['title'] = data.get("title")
                    page_path = data.get("pagePath")
                    if page_path is None:
                        link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                            data.get("textId"), day_string)
                    else:
                        try:
                            link = self.get_redit_link(page_path)
                        except Exception:
                            link = None
                    if not link:
                        continue
                    item['link'] = link
                    code = data.get("code")
                    if code:
                        item['code'] = code  # security code without exchange prefix
                        item['name'] = self.get_secu_abbr(code)
                        items.append(item)
        return items

    def start(self,
              start_dt: datetime.datetime = None,
              end_dt: datetime.datetime = None):
        """启动入口"""
        # 使用传入的结束时间否则置为今天
        if end_dt is not None:
            end_dt = datetime.datetime.combine(end_dt, datetime.time.min)
        else:
            end_dt = datetime.datetime.combine(datetime.datetime.now(),
                                               datetime.time.min)

        # Use the given start time; otherwise default to 3 days before the end time
        if start_dt is not None:
            start_dt = datetime.datetime.combine(start_dt, datetime.time.min)
        else:
            start_dt = end_dt - datetime.timedelta(days=3)

        # Walk backwards day by day from the end time to the start time, inclusive of both endpoints
        this_day = end_dt
        while this_day >= start_dt:
            day_string = this_day.strftime("%Y-%m-%d")
            items = self.parse(self.api_url.format(day_string), day_string)
            logger.info(f"{day_string}, {len(items)}")
            self._spider_conn.batch_insert(items, self.table_name, self.fields)
            this_day -= datetime.timedelta(days=1)
            time.sleep(3)
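
A hedged usage sketch for the spider: called without arguments, start() covers the three days up to today; explicit datetimes can be passed to backfill a window (the dates below are examples only).

if __name__ == '__main__':
    spider = JuchaoLiveNewsSpider()
    # Backfill a fixed window ...
    spider.start(datetime.datetime(2020, 8, 1), datetime.datetime(2020, 8, 13))
    # ... or take the defaults: the three days up to today.
    spider.start()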
Example #4
class SourceAnnouncementBase(object):
    """将两个爬虫表合并生成公告基础表"""
    def __init__(self):
        self.merge_table_name = 'announcement_base2'
        self.his_table = 'juchao_ant2'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000

        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )

    def category_code_map(self):
        return {
            'category_bcgz_szsh': ('补充更正', 19),
            'category_bndbg_szsh': ('半年报', 2),
            'category_cqdq_szsh': ('澄清致歉', 12),
            'category_dshgg_szsh': ('董事会', 18),
            'category_fxts_szsh': ('风险提示', 21),
            'category_gddh_szsh': ('股东大会', 15),
            'category_gqbd_szsh': ('股权变动', 16),
            'category_gqjl_szsh': ('股权激励', 17),
            'category_gszl_szsh': ('公司治理', 24),
            'category_gszq_szsh': ('公司债', 25),
            'category_jj_szsh': ('解禁', 9),
            'category_jshgg_szsh': ('监事会', 14),
            'category_kzzq_szsh': ('可转债', 22),
            'category_ndbg_szsh': ('年报', 4),
            'category_pg_szsh': ('配股', 7),
            'category_qtrz_szsh': ('其他融资', 23),
            'category_qyfpxzcs_szsh': ('权益分派', 11),
            'category_rcjy_szsh': ('日常经营', 10),
            'category_sf_szsh': ('首发', 8),
            'category_sjdbg_szsh': ('三季报', 3),
            'category_tbclts_szsh': ('特别处理和退市', 13),
            'category_tszlq_szsh': ('退市整理期', 20),
            'category_yjdbg_szsh': ('一季报', 1),
            'category_yjygjxz_szsh': ('业绩预告', 5),
            'category_zf_szsh': ('增发', 6),
            'category_zj_szsh': ('中介报告', 26),
            'category_others': ('其他', 27),
        }

    def spider_max_id(self, table_name: str):
        sql = f'''select max(id) as max_id from {table_name}; '''
        max_id = self._r_spider_conn.get(sql).get("max_id")
        return max_id

    def check_ids(self):
        _delta = []
        max_id = self.spider_max_id(self.his_table)
        for i in range(int(max_id / self.batch_number) + 1):
            begin_id = i * self.batch_number
            end_id = i * self.batch_number + self.batch_number
            print()
            print(begin_id, end_id)
            sql1 = '''select count(id) from {} where id between {} and {}; '''.format(
                self.his_table, begin_id, end_id)
            sql2 = '''select count(id) from {} where id between {} and {}; '''.format(
                self.merge_table_name, begin_id, end_id)
            count1 = self._r_spider_conn.get(sql1).get("count(id)")
            count2 = self._r_spider_conn.get(sql2).get("count(id)")
            print(count1, count2)
            if count1 != count2:
                sl1 = '''select id from {} where id between {} and {};'''.format(
                    self.his_table, begin_id, end_id)
                sl2 = '''select id from {} where id between {} and {};'''.format(
                    self.merge_table_name, begin_id, end_id)
                rs1 = self._r_spider_conn.query(sl1)
                ids1 = set([r.get("id") for r in rs1])
                rs2 = self._r_spider_conn.query(sl2)
                ids2 = set([r.get("id") for r in rs2])
                print(list(ids1 - ids2))
                _delta.extend(list(ids1 - ids2))

        print(_delta)
        return _delta

    def process_diff_ids(self, diff_lst):
        if not diff_lst:
            return
        # Build the IN clause manually so a single id does not render as an invalid "(x,)" literal.
        ids_str = ','.join(str(_id) for _id in diff_lst)
        category_code_map = self.category_code_map()
        _sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where id in ({}); '''.format(
            self.his_table, ids_str)
        datas = self._r_spider_conn.query(_sql)
        print(len(datas))
        for data in datas:
            data['CategoryCode'] = category_code_map.get(
                data.get("CategoryCode"))[1]
            data = utils.process_secucode(data)  # only keep codes starting with 0, 3, 6 for now
            if data:
                print(data)
                ret = self._spider_conn.table_insert(
                    self.merge_table_name, data, [
                        'SecuCode', 'SecuAbbr', 'CategoryCode', 'PubDatetime1',
                        'InsertDatetime1', 'Title1'
                    ])
                print(ret)

    def first_load(self):
        inner_map = self.load_innercode_map()
        category_code_map = self.category_code_map()

        load_sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from  {} where \
id >= {} and id < {}; '''
        max_id = self.spider_max_id(self.his_table)
        for i in range(int(max_id / self.batch_number) + 1):
            begin_id = i * self.batch_number
            end_id = i * self.batch_number + self.batch_number
            print(begin_id, end_id)
            _sql = load_sql.format(self.his_table, begin_id, end_id)
            print(_sql)
            datas = self._r_spider_conn.query(_sql)
            for data in datas:
                inner_code = inner_map.get(data.get("SecuCode"))
                if inner_code:
                    data['InnerCode'] = inner_code  # secucode in the spider DB has no exchange prefix
                else:
                    inner_code = self.get_old_inner_code(data.get("SecuCode"))
                    if inner_code:
                        data['InnerCode'] = inner_code
                        inner_map.update({data.get("SecuCode"): inner_code})
                    else:
                        continue

                data['CategoryCode'] = category_code_map.get(
                    data.get("CategoryCode"))[1]
                data = utils.process_secucode(data)  # only keep codes starting with 0, 3, 6 for now
                if data:
                    self._spider_conn.table_insert(
                        self.merge_table_name, data, [
                            'SecuCode', 'SecuAbbr', 'CategoryCode',
                            'PubDatetime1', 'InsertDatetime1', 'Title1',
                            'InnerCode'
                        ])

        # Backfill with data from the Juchao live news table
        update_sql = '''select A.* from {} A, {} B \
where A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告' \
and A.id between {} and  {}; '''
        max_id = self.spider_max_id(self.live_table)
        for j in range(int(max_id / self.batch_number) + 1):
            begin_id = j * self.batch_number
            end_id = j * self.batch_number + self.batch_number
            print(begin_id, end_id)
            _sql = update_sql.format(self.live_table, self.his_table, begin_id,
                                     end_id)
            print(_sql)
            datas = self._r_spider_conn.query(_sql)
            for data in datas:
                item = {
                    'PubDatetime2': data.get("pub_date"),
                    'InsertDatetime2': data.get("CREATETIMEJZ"),
                    'Title2': data.get("title"),
                }
                self._spider_conn.table_update(self.merge_table_name, item,
                                               'PDFLink', data.get("link"))

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(
            secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code
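
A hedged sketch of how these methods fit together, assuming the intended order is an initial full merge followed by a consistency check (the driver itself is not part of the original code):

if __name__ == '__main__':
    merger = SourceAnnouncementBase()
    merger.first_load()               # initial full merge of juchao_ant2 into announcement_base2
    missing_ids = merger.check_ids()  # ids present in juchao_ant2 but missing from announcement_base2
    if missing_ids:
        merger.process_diff_ids(missing_ids)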
Example #5
class SourceAnnouncementBase(object):
    """将两个爬虫表合并生成公告基础表"""
    def __init__(self):
        self.merge_table_name = 'announcement_base2'
        self.his_table = 'juchao_ant2'
        self.live_table = 'juchao_kuaixun'
        self.batch_number = 10000

        self._r_spider_conn = Connection(
            host=R_SPIDER_MYSQL_HOST,
            port=R_SPIDER_MYSQL_PORT,
            user=R_SPIDER_MYSQL_USER,
            password=R_SPIDER_MYSQL_PASSWORD,
            database=R_SPIDER_MYSQL_DB,
        )
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )

    def category_code_map(self):
        return {
            'category_bcgz_szsh': ('补充更正', 19),
            'category_bndbg_szsh': ('半年报', 2),
            'category_cqdq_szsh': ('澄清致歉', 12),
            'category_dshgg_szsh': ('董事会', 18),
            'category_fxts_szsh': ('风险提示', 21),
            'category_gddh_szsh': ('股东大会', 15),
            'category_gqbd_szsh': ('股权变动', 16),
            'category_gqjl_szsh': ('股权激励', 17),
            'category_gszl_szsh': ('公司治理', 24),
            'category_gszq_szsh': ('公司债', 25),
            'category_jj_szsh': ('解禁', 9),
            'category_jshgg_szsh': ('监事会', 14),
            'category_kzzq_szsh': ('可转债', 22),
            'category_ndbg_szsh': ('年报', 4),
            'category_pg_szsh': ('配股', 7),
            'category_qtrz_szsh': ('其他融资', 23),
            'category_qyfpxzcs_szsh': ('权益分派', 11),
            'category_rcjy_szsh': ('日常经营', 10),
            'category_sf_szsh': ('首发', 8),
            'category_sjdbg_szsh': ('三季报', 3),
            'category_tbclts_szsh': ('特别处理和退市', 13),
            'category_tszlq_szsh': ('退市整理期', 20),
            'category_yjdbg_szsh': ('一季报', 1),
            'category_yjygjxz_szsh': ('业绩预告', 5),
            'category_zf_szsh': ('增发', 6),
            'category_zj_szsh': ('中介报告', 26),
            'category_others': ('其他', 27),
        }

    def load_innercode_map(self):
        sql = '''select secucode, innercode from secumain; '''
        _map = dict()
        ret = self._juyuan_conn.query(sql)
        for r in ret:
            _map[r.get("secucode")] = r.get("innercode")
        return _map

    def get_old_inner_code(self, secucode: str):
        sql = '''select InnerCode from LC_CodeChange where secucode = '{}';'''.format(secucode)
        r = self._juyuan_conn.get(sql)
        if r:
            inner_code = r.get("InnerCode")
            return inner_code

    def daily_update(self, deadline: datetime.datetime = None):
        inner_map = self.load_innercode_map()
        category_code_map = self.category_code_map()

        if deadline is None:
            deadline = datetime.datetime.now() - datetime.timedelta(days=1)

        load_sql = '''select id, SecuCode, CategoryCode, SecuAbbr, AntTime as PubDatetime1, \
AntTitle as Title1, AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from {} where \
UPDATETIMEJZ > '{}'; '''.format(self.his_table, deadline)
        logger.info(f"his load sql is {load_sql}")
        datas = self._r_spider_conn.query(load_sql)
        logger.info(f"load count is {len(datas)} from his table.")
        items = []
        for data in datas:
            data['CategoryCode'] = category_code_map.get(data.get("CategoryCode"))[1]
            inner_code = inner_map.get(data.get("SecuCode"))
            if inner_code:
                data['InnerCode'] = inner_code  # secucode in the spider DB has no exchange prefix
            else:
                inner_code = self.get_old_inner_code(data.get("SecuCode"))
                if inner_code:
                    inner_map.update({data.get("SecuCode"): inner_code})
                    data['InnerCode'] = inner_code
                else:
                    continue
            data = utils.process_secucode(data)  # only keep codes starting with 0, 3, 6 for now
            if data:
                items.append(data)

        self._spider_conn.batch_insert(
            items, self.merge_table_name,
            ['SecuCode', 'SecuAbbr', 'CategoryCode', 'PubDatetime1', 'InsertDatetime1', 'Title1', 'InnerCode'])

        update_sql = '''select A.* from juchao_kuaixun A, juchao_ant B where B.UPDATETIMEJZ > '{}' \
and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告';  '''.format(deadline)
        logger.info(f"live sql is {update_sql}")
        datas = self._r_spider_conn.query(update_sql)
        logger.info(f'load count {len(datas)} from live table.')
        for data in datas:
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._spider_conn.table_update(self.merge_table_name, item, 'PDFLink', data.get("link"))