Beispiel #1
0
class FinalConstAnn(object):
    def __init__(self):
        self.dc_table_name = 'stk_quot_idx'  # 日行情指标表 从 datacenter 数据库中获取
        self.target_table_name = 'sf_const_announcement'  # 目标数据库: 公告事件常量表
        self.source_table_name = 'dc_ann_event_source_ann_detail'  # 源数据库: 公告明细表
        self.tool_table_name = 'secumain'
        self.process_table_name = 'const_process_records'
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # 数据中心读数据库
        self._dc_conn = Connection(
            host=DC_HOST,
            port=DC_PORT,
            user=DC_USER,
            password=DC_PASSWD,
            database=DC_DB,
        )

        self.today = datetime.datetime.now()
        self.today_of_lastyear = self.today - datetime.timedelta(days=365)
        self.innercode_map = None
        self.temp_fields = [
            'event_code',
            'code',
            'date',
            'd1rate',
            'd2rate',
            'd3rate',
            'd4rate',
            'd5rate',
            'd1acc',
            'd2acc',
            'd3acc',
            'd5acc',
        ]

    def innercode_map_init(self):
        if not self.innercode_map:
            self.innercode_map = self.get_inner_code_map()

    def get_inner_code_map(self):
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table_name, self.source_table_name)
        __map = {}
        try:
            res = self._yuqing_conn.query(sql)
            for r in res:
                __map[r.get("SecuCode")] = r.get("InnerCode")
            return __map
        except:
            traceback.print_exc()
            return {}

    def const_event_codes(self):
        sql = '''select distinct(EventCode) from {};'''.format(
            self.target_table_name)
        try:
            res = self._yuqing_conn.query(sql)
            event_codes = [one.get("EventCode") for one in res]
            return event_codes
        except:
            traceback.print_exc()
            return []

    def get_event_detail(self, event_code: str):
        sql = '''select PubTime, SecuCode from {} where EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            self.source_table_name, event_code, self.today_of_lastyear,
            self.today)
        print(sql)
        try:
            res = self._yuqing_conn.query(sql)
            return res
        except:
            traceback.print_exc()
            return []

    def get_fivedays_changepercactual(self, secucode: str,
                                      dt: datetime.datetime):
        innercode = self.innercode_map.get(secucode)
        if not innercode:
            return []
        sql = '''select Date, ChangePercActual from {} where InnerCode = '{}' and Date >= '{}' \
order by Date limit 5;'''.format(self.dc_table_name, innercode, dt)
        try:
            res = self._dc_conn.query(sql)
            return res
        except:
            traceback.print_exc()
            return []

    def generate_changepercactual_index(self, index_datas):
        """计算单只证券的 次\3\5日 累计涨幅"""
        cpt_scores = [
            float(data.get("ChangePercActual")) / 100 for data in index_datas
        ]
        # print("&&&&&", cpt_scores)

        days_len = len(cpt_scores)
        x = y = z = m = n = None

        if days_len == 0 or days_len == 1:
            return []

        elif days_len == 2:
            x, y = cpt_scores
        elif days_len == 3:
            x, y, z = cpt_scores
        elif days_len == 4:
            x, y, z, m = cpt_scores
        else:
            x, y, z, m, n = cpt_scores
        # print("*****", x, y, z, m, n)
        ret1 = ret2 = ret3 = None
        # 次日累计涨幅
        ret1 = (1 + x) * (1 + y) - 1
        # 3 日累计涨幅
        if z is not None:
            ret2 = (1 + x) * (1 + y) * (1 + z) - 1
            if m is not None and n is not None:
                # 5 日累计涨幅
                ret3 = (1 + x) * (1 + y) * (1 + z) * (1 + m) * (1 + n) - 1
        # print(">>>>>", ret1, ret2, ret3)
        return [ret1 * 100, ret2 * 100, ret3 * 100]

    def crate_middle_process_table(self):
        """记录中间数据的临时表, 方便校对"""
        sql = '''
CREATE TABLE IF NOT EXISTS `const_process_records`(
   `event_code` varchar(50) NOT NULL COMMENT '事件代码',
   `code` varchar(50) NOT NULL COMMENT '事件代码',
   `date` datetime NOT NULL COMMENT '日期',
   `d1rate` decimal(10,4) NOT NULL,
   `d2rate` decimal(10,4) NOT NULL,
   `d3rate` decimal(10,4) NOT NULL,
   `d4rate` decimal(10,4) NOT NULL,
   `d5rate` decimal(10,4) NOT NULL,
   `d1acc` decimal(10,4) NOT NULL,
   `d2acc` decimal(10,4) NOT NULL,
   `d3acc` decimal(10,4) NOT NULL,
   `d5acc` decimal(10,4) NOT NULL,
  `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
  `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
   UNIQUE KEY `u1` (`event_code`,`code`, `date`) 
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
        '''
        print(sql)
        self._yuqing_conn.insert(sql)

    def process_single_eventcode(self, eventcode: str):
        print("NOW IS:", eventcode)
        record = {}
        record["EventCode"] = eventcode
        # (3) 对于某一个事件来说, 近一年发生该事件的证券以及发生时间列表
        event_detail_info = self.get_event_detail(eventcode)
        print(f"EVENT DETAIL COUNT: {len(event_detail_info)}")
        event_count = len(event_detail_info)
        if event_count == 0:
            print(f"NO RVENT OF CODE {eventcode}")
            return None
        winlist_onday = []  # 针对事件全部股票列表在时间发生时间的当日胜率与次日胜率
        winlist_nextday = []

        total_rate = 0
        total_rate2 = 0  # 针对事件全部股票列表在时间发生时间的次\3\5日平均涨幅
        total_rate3 = 0
        total_rate5 = 0

        for one in event_detail_info:
            happen_dt = one.get("PubTime")
            secuCode = one.get("SecuCode")
            temp_record = dict()
            # (4) 获取单只证券在发生时间后(包括当日)的5日涨幅
            fiveday_rateinfo = self.get_fivedays_changepercactual(
                secuCode, happen_dt)

            if fiveday_rateinfo == list() or len(fiveday_rateinfo) != 5:
                print(f"{secuCode} - {happen_dt} 5 日数据不足")
                continue
            temp_record['event_code'] = eventcode
            temp_record['code'] = secuCode
            temp_record['date'] = happen_dt
            temp_record['d1rate'] = fiveday_rateinfo[0].get(
                "ChangePercActual")  # type Decimal
            temp_record['d2rate'] = fiveday_rateinfo[1].get("ChangePercActual")
            temp_record['d3rate'] = fiveday_rateinfo[2].get("ChangePercActual")
            temp_record['d4rate'] = fiveday_rateinfo[3].get("ChangePercActual")
            temp_record['d5rate'] = fiveday_rateinfo[4].get("ChangePercActual")

            winlist_onday.append(
                float(fiveday_rateinfo[0].get("ChangePercActual")))
            winlist_nextday.append(
                float(fiveday_rateinfo[1].get("ChangePercActual")))
            # (5) 计算单只证券的 次\3\5日 累计涨幅
            accumulated_rate2, accumulated_rate3, accumulated_rate5 = self.generate_changepercactual_index(
                fiveday_rateinfo)
            total_rate2 += accumulated_rate2
            total_rate3 += accumulated_rate3
            total_rate5 += accumulated_rate5
            total_rate += float(fiveday_rateinfo[0].get("ChangePercActual"))

            temp_record['d1acc'] = temp_record['d1rate']
            temp_record['d2acc'] = accumulated_rate2
            temp_record['d3acc'] = accumulated_rate3
            temp_record['d5acc'] = accumulated_rate5
            self._yuqing_conn.table_insert(self.process_table_name,
                                           temp_record)

        # (6) 计算当日胜率
        on_count = 0
        for rate in winlist_onday:
            if rate > 0:
                on_count += 1
        onday_winrate = on_count / len(winlist_onday)
        # (7) 计算次日胜率
        next_count = 0
        for rate in winlist_nextday:
            if rate > 0:
                next_count += 1
        nextday_winrate = next_count / len(winlist_nextday)
        # (8) 计算平均涨幅
        average_rate = total_rate / event_count
        average_rate2 = total_rate2 / event_count
        average_rate3 = total_rate3 / event_count
        average_rate5 = total_rate5 / event_count

        record['EventDayChgPerc'] = average_rate
        record['NextDayChgPerc'] = average_rate2
        record['ThreeDayChgPerc'] = average_rate3
        record['FiveDayChgPerc'] = average_rate5
        record['EventDayWinRatio'] = onday_winrate
        record['NextDayWinRatio'] = nextday_winrate
        print(pprint.pformat(record))
        if record is not None:
            self.save_record(record)
        return record

    def save_record(self, record):
        item = copy.deepcopy(record)
        item.pop("EventCode")
        self._yuqing_conn.table_update(self.target_table_name, item,
                                       "EventCode", record.get("EventCode"))

    def launch(self):
        self.crate_middle_process_table()
        self.innercode_map_init()

        eventcode_lst = sorted(self.const_event_codes())
        for event_code in eventcode_lst:
            print(event_code)
            self.process_single_eventcode(event_code)
class JuchaoFinanceSpider(object):
    """巨潮历史公告爬虫 """
    def __init__(self):
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.fields = ['SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc']
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://www.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self.history_table_name = 'juchao_ant_finance'  # 巨潮历史公告(财务相关)表

    def _create_table(self):
        create_sql = '''
         CREATE TABLE IF NOT EXISTS `juchao_ant_finance` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `SecuCode` varchar(8) NOT NULL COMMENT '证券代码',
          `SecuAbbr` varchar(16) NOT NULL COMMENT '证券代码',
          `Category` varchar(32) NOT NULL COMMENT '财报类型',
          `AntId` int(20) NOT NULL COMMENT '巨潮自带公告 ID',
          `AntTime` datetime NOT NULL COMMENT '发布时间',
          `AntTitle` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯标题',
          `AntDoc` varchar(256) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '公告详情页链接',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `ant_id` (`AntId`),
          KEY `ant_time` (`AntTime`),
          KEY `secucode` (`SecuCode`),
          KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨潮个股财务类公告' ; 
        '''
        self._spider_conn.insert(create_sql)

    @retrying.retry(stop_max_attempt_number=10)
    def my_post(self, link: str, post_data):
        resp = requests.post(link, headers=self.headers, data=post_data, timeout=10)
        return resp

    def start(self, start_date=None, end_date=None):
        '''网站限制 一次最多只有一年的数据'''
        self._create_table()

        if start_date is None:
            start_date = datetime.datetime.today() - datetime.timedelta(days=10)
        if end_date is None:
            end_date = datetime.datetime.today()

        se_date = "{}~{}".format(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))
        print(se_date)

        for page in range(10):
            print("page >> {}".format(page))
            for category in ('category_ndbg_szsh',
                             'category_bndbg_szsh',
                             'category_yjdbg_szsh',
                             'category_sjdbg_szsh',
                             'category_yjygjxz_szsh',
                             # 'category_yjkb_szsh',    # 自定义业绩快报类型
                             ):
                post_data = {
                    'pageNum': page,
                    'pageSize': 30,
                    'column': 'szse',
                    'tabName': 'fulltext',
                    'plate': '',
                    'stock': '',
                    'searchkey': '',
                    'secid': '',
                    # 财务包含以下几个版块: 年报、半年报、一季报、三季报、业绩预告
                    # 年报: category_ndbg_szsh
                    # 半年报: category_bndbg_szsh
                    # 一季报: category_yjdbg_szsh
                    # 三季报: category_sjdbg_szsh
                    # 业绩预告: category_yjygjxz_szsh
                    # 'category': 'category_ndbg_szsh;category_bndbg_szsh;category_yjdbg_szsh;category_sjdbg_szsh;category_yjygjxz_szsh',
                    'category': category,
                    'trade': '',
                    'seDate': se_date,
                    'sortName': '',
                    'sortType': '',
                    'isHLtitle': True,
                }
                # resp = requests.post(self.api, headers=self.headers, data=post_data, timeout=3)
                resp = self.my_post(self.api, post_data)
                if resp.status_code == 200:
                    text = resp.text
                    if text == '':
                        break

                    py_datas = json.loads(text)
                    ants = py_datas.get("announcements")
                    if ants is None:
                        break

                    for ant in ants:
                        item = dict()
                        item['SecuCode'] = ant.get('secCode')
                        item['SecuAbbr'] = ant.get('secName')
                        item['AntId'] = ant.get("announcementId")
                        item['AntTitle'] = ant.get("announcementTitle")
                        item['Category'] = category
                        time_stamp = ant.get("announcementTime") / 1000
                        item.update({'AntTime': datetime.datetime.fromtimestamp(time_stamp)})
                        item.update({'AntDoc': 'http://static.cninfo.com.cn/' + ant.get("adjunctUrl")})
                        # print(item)
                        self._spider_conn.table_insert(self.history_table_name, item)
                else:
                    print(resp)

        # 搜索业绩快报关键字
        # searchkey: 业绩快报
        for page in range(10):
            print("page >> {}".format(page))
            post_data = {
                'pageNum': page,
                'pageSize': 30,
                'column': 'szse',
                'tabName': 'fulltext',
                'plate': '',
                'stock': '',
                'searchkey': '业绩快报',
                'secid': '',
                'category': '',
                'trade': '',
                'seDate': se_date,
                'sortName': '',
                'sortType': '',
                'isHLtitle': True,
            }
            resp = self.my_post(self.api, post_data)
            if resp.status_code == 200:
                text = resp.text
                if text == '':
                    break

                py_datas = json.loads(text)
                ants = py_datas.get("announcements")
                if ants is None:
                    break

                for ant in ants:
                    item = dict()
                    item['SecuCode'] = ant.get('secCode')
                    item['SecuAbbr'] = ant.get('secName')
                    item['AntId'] = ant.get("announcementId")
                    item['AntTitle'] = ant.get("announcementTitle")
                    item['Category'] = 'category_yjkb_szsh'
                    time_stamp = ant.get("announcementTime") / 1000
                    item.update({'AntTime': datetime.datetime.fromtimestamp(time_stamp)})
                    item.update({'AntDoc': 'http://static.cninfo.com.cn/' + ant.get("adjunctUrl")})
                    print(item)
                    self._spider_conn.table_insert(self.history_table_name, item)
            else:
                print(resp)