Beispiel #1
0
    def __init__(self,
                 start_time: datetime.datetime = None,
                 end_time: datetime.datetime = None,
                 ):
        self.source_table = 'dc_ann_event_source_ann_detail'
        self.target_table = 'sf_secu_announcement_detail'
        self.tool_table = "secumain"
        self.trading_table = 'tradingday'
        self.codes_map = {}
        self.industry_map = {}
        self.med_inf_map = {}

        self.start_time = start_time
        self.end_time = end_time
        self.trading_days = None
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # 主题猎手数据库
        self._theme_conn = Connection(
            host=THE_HOST,
            port=THE_PORT,
            user=THE_USER,
            password=THE_PASSWD,
            database=THE_DB,
        )
 def __init__(self):
     self.source_table = 'dc_ann_event_source_ann_detail'
     self.target_table = 'sf_secu_announcement_summary'
     self.tool_table = "secumain"
     self.trading_table = 'tradingday'
     self.const_table = 'sf_const_announcement'
     self.codes_map = {}
     self.sent_map = {}
     self._yuqing_conn = Connection(
         host=YQ_HOST,
         port=YQ_PORT,
         user=YQ_USER,
         password=YQ_PASSWD,
         database=YQ_DB,
     )
Beispiel #3
0
 def __init__(self):
     self.merge_table_name = 'announcement_base'
     self.batch_number = 1000
     self._yuqing_conn = Connection(
         host=YQ_HOST,
         port=YQ_PORT,
         user=YQ_USER,
         password=YQ_PASSWD,
         database=YQ_DB,
     )
     self._r_spider_conn = Connection(
         host=SPIDER_MYSQL_HOST2,
         port=SPIDER_MYSQL_PORT2,
         user=SPIDER_MYSQL_USER2,
         password=SPIDER_MYSQL_PASSWORD2,
         database=SPIDER_MYSQL_DB2,
     )
Beispiel #4
0
    def __init__(self):
        self.dc_table_name = 'stk_quot_idx'  # 日行情指标表 从 datacenter 数据库中获取
        self.target_table_name = 'sf_const_announcement'  # 目标数据库: 公告事件常量表
        self.source_table_name = 'dc_ann_event_source_ann_detail'  # 源数据库: 公告明细表
        self.tool_table_name = 'secumain'
        self.process_table_name = 'const_process_records'
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # 数据中心读数据库
        self._dc_conn = Connection(
            host=DC_HOST,
            port=DC_PORT,
            user=DC_USER,
            password=DC_PASSWD,
            database=DC_DB,
        )

        self.today = datetime.datetime.now()
        self.today_of_lastyear = self.today - datetime.timedelta(days=365)
        self.innercode_map = None
        self.temp_fields = [
            'event_code',
            'code',
            'date',
            'd1rate',
            'd2rate',
            'd3rate',
            'd4rate',
            'd5rate',
            'd1acc',
            'd2acc',
            'd3acc',
            'd5acc',
        ]
Beispiel #5
0
 def __init__(self):
     self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
     self.fields = [
         'SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc'
     ]
     self.headers = {
         'Accept':
         '*/*',
         'Accept-Encoding':
         'gzip, deflate',
         'Accept-Language':
         'zh-CN,zh;q=0.9,en;q=0.8',
         'Cache-Control':
         'no-cache',
         'Connection':
         'keep-alive',
         'Content-Type':
         'application/x-www-form-urlencoded; charset=UTF-8',
         'Host':
         'www.cninfo.com.cn',
         'Origin':
         'http://www.cninfo.com.cn',
         'Pragma':
         'no-cache',
         'Referer':
         'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
     }
     self._spider_conn = Connection(
         host=SPIDER_MYSQL_HOST,
         port=SPIDER_MYSQL_PORT,
         user=SPIDER_MYSQL_USER,
         password=SPIDER_MYSQL_PASSWORD,
         database=SPIDER_MYSQL_DB,
     )
     self.history_table_name = 'juchao_ant'  # 巨潮历史公告表
 def __init__(self):
     self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
     self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
     self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
     self.table_name = 'juchao_kuaixun'
     self.name = '巨潮快讯'
     self._spider_conn = Connection(
         host=SPIDER_MYSQL_HOST,
         port=SPIDER_MYSQL_PORT,
         user=SPIDER_MYSQL_USER,
         password=SPIDER_MYSQL_PASSWORD,
         database=SPIDER_MYSQL_DB,
     )
     self._juyuan_conn = Connection(
         host=JUY_HOST,
         port=JUY_PORT,
         user=JUY_USER,
         password=JUY_PASSWD,
         database=JUY_DB,
     )
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                       'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
     }
Beispiel #7
0
class SourceAnnouncementBase(object):
    """将两个爬虫表合并生成公告基础表"""
    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.batch_number = 1000
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        self._r_spider_conn = Connection(
            host=SPIDER_MYSQL_HOST2,
            port=SPIDER_MYSQL_PORT2,
            user=SPIDER_MYSQL_USER2,
            password=SPIDER_MYSQL_PASSWORD2,
            database=SPIDER_MYSQL_DB2,
        )

    def launch(self):
        deadline = datetime.datetime.now() - datetime.timedelta(hours=5)

        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, AntTitle as Title1, \
AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from juchao_ant where UPDATETIMEJZ > '{}'; '''.format(
            deadline)

        datas = self._r_spider_conn.query(load_sql)
        datas = [utils.process_secucode(data) for data in datas]
        datas = [data for data in datas if data is not None]
        for data in datas:
            print(data)
            self._yuqing_conn.table_insert(self.merge_table_name, data)

        update_sql = '''select A.* from juchao_kuaixun A, juchao_ant B where A.pub_date > '{}' \
and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告';  '''.format(
            deadline)
        datas = self._r_spider_conn.query(update_sql)

        for data in datas:
            print(data)
            item = {
                'PubDatetime2': data.get("pub_date"),
                'InsertDatetime2': data.get("CREATETIMEJZ"),
                'Title2': data.get("title"),
            }
            self._yuqing_conn.table_update(self.merge_table_name, item,
                                           'PDFLink', data.get("link"))
class FinalAntSummary(object):
    def __init__(self):
        self.source_table = 'dc_ann_event_source_ann_detail'
        self.target_table = 'sf_secu_announcement_summary'
        self.tool_table = "secumain"
        self.trading_table = 'tradingday'
        self.const_table = 'sf_const_announcement'
        self.codes_map = {}
        self.sent_map = {}
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )

    def get_inner_code_map(self):
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table,
            self.source_table,
        )
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.codes_map[r.get('SecuCode')] = r.get('InnerCode')

    def get_sentiment_map(self):
        sql = '''select EventCode, Sentiment from {} ; '''.format(
            self.const_table)
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.sent_map[r.get('EventCode')] = r.get("Sentiment")

    def get_nearest_trading_day(self, day: datetime.datetime):
        """获取距离当前时间最近的向前一个交易日 包括当前的日期"""
        while True:
            sql = '''select IfTradingDay from {} where Date = '{}' and SecuMarket = 83 ; '''.format(
                self.trading_table, day)
            is_trading_day = self._yuqing_conn.get(sql).get("IfTradingDay")
            if is_trading_day == 1:
                return day
            day -= datetime.timedelta(days=1)

    def get_update_endtime(self):
        sql = '''select max(PubTime) as max_pub from {};  '''.format(
            self.source_table)
        try:
            max_pub = self._yuqing_conn.get(sql).get("max_pub")
        except:
            max_pub = None
        return max_pub

    def get_update_starttime(self):
        sql = '''select max(TradeDate) as max_trd from {}; '''.format(
            self.target_table)
        try:
            max_trd = self._yuqing_conn.get(sql).get("max_trd")
        except:
            max_trd = None
        return max_trd

    def daily_update(self):
        update_end = self.get_update_endtime()
        update_start = self.get_update_starttime()
        print(update_start, update_end)

        self.get_inner_code_map()
        self.get_sentiment_map()

        sql = '''select AnnID, SecuCode, PubTime, EventCode, Title, PDFLink from {} where PubTime  between '{}' and '{}'; '''.format(
            self.source_table, update_start, update_end)
        datas = self._yuqing_conn.query(sql)
        for data in datas:
            ann_id = data.get("AnnID")
            pub_time = data.get("PubTime")
            event_code = data.get("EventCode")
            title = data.get("Title")
            link = data.get("PDFLink")
            secu_code = data.get("SecuCode")

            inner_code = self.codes_map.get(secu_code)
            pub_day = datetime.datetime(pub_time.year, pub_time.month,
                                        pub_time.day)
            trade_day = self.get_nearest_trading_day(pub_day)
            sentiment = self.sent_map.get(event_code)
            item = {}
            item['SecuCode'] = secu_code
            item['InnerCode'] = inner_code
            item['TradeDate'] = trade_day
            item['Sentiment'] = sentiment
            item['EventCode'] = event_code
            item['AnnID'] = ann_id
            item['AnnTitle'] = title
            item['Website'] = link
            print(item)
            self._yuqing_conn.table_insert(self.target_table, item)
Beispiel #9
0
class FinalAntDetail(object):
    def __init__(self,
                 start_time: datetime.datetime = None,
                 end_time: datetime.datetime = None,
                 ):
        self.source_table = 'dc_ann_event_source_ann_detail'
        self.target_table = 'sf_secu_announcement_detail'
        self.tool_table = "secumain"
        self.trading_table = 'tradingday'
        self.codes_map = {}
        self.industry_map = {}
        self.med_inf_map = {}

        self.start_time = start_time
        self.end_time = end_time
        self.trading_days = None
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # 主题猎手数据库
        self._theme_conn = Connection(
            host=THE_HOST,
            port=THE_PORT,
            user=THE_USER,
            password=THE_PASSWD,
            database=THE_DB,
        )

    def get_inner_code_map(self):
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table, self.source_table,
        )
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.codes_map[r.get('SecuCode')] = r.get('InnerCode')

    def get_day(self, dt: datetime.datetime):
        return datetime.datetime(dt.year, dt.month, dt.day)

    def get_industry_code_map(self):
        sql = '''select A.code as IndustryCode, A.name as IndustryName, B.code as SecuCode, \
B.name as SecuAbbr from block A, block_code B where A.type = 1 and A.id = B.bid ;'''
        ret = self._theme_conn.query(sql)
        for r in ret:
            self.industry_map[r.get("SecuCode")] = r.get("IndustryCode")

    def get_all_trading_days(self, start_dt: datetime.datetime, end_dt:datetime.datetime):
        """
        获取起止时间之间的全部交易日并且排序
        :param start_dt:
        :param end_dt:
        :return:
        """
        sql = '''select Date from  {} where SecuMarket = 83 and IfTradingDay = 1 and Date between '{}' and '{}'; '''.format(
            self.trading_table, start_dt, end_dt
        )
        trading_days = self._yuqing_conn.query(sql)
        trading_days = sorted([trading_day.get("Date") for trading_day in trading_days])
        return trading_days

    def get_after_five_trading_days(self, dt: datetime.datetime):
        """当日(包括当日之后的)5 个交易日
        将一段时间内的全部交易日全部拿出来放在一个列表中排序
        取某一天的时间的索引(若不存在取之后最近的一个)然后顺次取之后第五个索引即可
        未击中时间范围时重新计算
        """
        while True:
            try:
                _index = self.trading_days.index(dt)
            except ValueError:
                dt += datetime.timedelta(days=1)
            else:
                break
        return self.trading_days[_index], self.trading_days[_index+4]

    def get_med_inf_map(self):
        sql = '''select MedName, InfluenceWeight from dc_const_media_info ; '''
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.med_inf_map[r.get("MedName")] = int(r.get("InfluenceWeight"))

    def get_meds_scores(self, meds: list):
        """
        计算媒体得分 得分有 1 3 10 100
        无名媒体(None) 计 1 分
        :param meds:
        :return:
        """
        total_score = 0

        r_meds = []
        for med in meds:
            if med is None:
                total_score += 1
            else:
                r_meds.append(med)

        if len(r_meds) != 0:
            scores = [self.med_inf_map.get(r_med, 1) for r_med in r_meds]
            total_score += sum(scores)
        return total_score

    def get_news_num(self, secu_code: str, event_code: str, min_trading_day: datetime.datetime,
                     max_trading_day: datetime.datetime):
        """
        统计新闻发布时间在公告发布时间之后的所有关联篇数, 最多统计发布日之后的所有关联篇数,
        最多统计发布日(包括当日)之后的 5 个交易日之间的新闻,超过 5 个交易日就不需要去更新这条记录了.
        select * from dc_ann_event_source_news_detail A where A.SecuCode = '{}' and A.EventCode = '{}'
        and PubTime between {} amd {} ;
        """
        sql = '''select MedName from dc_ann_event_source_news_detail where SecuCode = '{}' \
and EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            secu_code, event_code, min_trading_day, max_trading_day)
        datas = self._yuqing_conn.query(sql)
        count = len(datas)
        total_scores = 0
        if count != 0:
            meds = [data.get("MedName") for data in datas]  # dict
            total_scores = self.get_meds_scores(meds)
        return count, total_scores

    def get_post_num(self, secu_code: str, event_code: str, min_trading_day: datetime.datetime,
                     max_trading_day: datetime.datetime):
        sql = '''select count(*) as count from dc_ann_event_source_guba_detail where SecuCode = '{}' \
and EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            secu_code, event_code, min_trading_day, max_trading_day)
        count = self._yuqing_conn.get(sql).get("count")
        return count

    def process_data(self, data: dict):
        secu_code = data.get("SecuCode")
        inner_code = self.codes_map.get(secu_code)
        event_code = data.get("EventCode")
        pub_time = data.get("PubTime")
        pub_date = self.get_day(pub_time)
        link = data.get("PDFLink")
        industry_code = self.industry_map.get(secu_code)
        min_trading_day, max_trading_day = self.get_after_five_trading_days(pub_date)
        item = {}
        item['SecuCode'] = secu_code
        item['InnerCode'] = inner_code
        item['EventCode'] = event_code
        item['PubDatetime'] = pub_time
        item['PubDate'] = pub_date
        item['Website'] = link
        item['IndustryCode'] = industry_code
        news_num, scores = self.get_news_num(secu_code, event_code, min_trading_day, max_trading_day)
        item['NewsNum'] = news_num
        item['Influence'] = scores
        item['PostNum'] = self.get_post_num(secu_code, event_code, min_trading_day, max_trading_day)
        return item

    def get_update_start(self):
        try:
            sql = '''select max(PubDatetime) as max_dt from {} ; '''.format(self.target_table)
            max_dt = self._yuqing_conn.get(sql).get("max_dt")
        except:
            max_dt = None
        return max_dt

    def get_update_end(self):
        return datetime.datetime.now()

    def daily_update(self):
        update_start = self.get_update_start()
        update_start = update_start - datetime.timedelta(days=7)
        update_end = self.get_update_end()
        print(update_start, update_end)

        self.get_inner_code_map()
        self.get_industry_code_map()
        self.get_med_inf_map()

        if not self.trading_days:
            self.trading_days = self.get_all_trading_days(update_start, update_end+datetime.timedelta(days=30))

        sql = '''select SecuCode, EventCode, PubTime, PDFLink from {} where PubTime between '{}' and '{}'; '''.format(
            self.source_table, update_start, update_end)
        datas = self._yuqing_conn.query(sql)
        for data in datas:
            item = self.process_data(data)
            print(item)
            self._yuqing_conn.table_insert(self.target_table, item)
Beispiel #10
0
class FinalConstAnn(object):
    def __init__(self):
        self.dc_table_name = 'stk_quot_idx'  # 日行情指标表 从 datacenter 数据库中获取
        self.target_table_name = 'sf_const_announcement'  # 目标数据库: 公告事件常量表
        self.source_table_name = 'dc_ann_event_source_ann_detail'  # 源数据库: 公告明细表
        self.tool_table_name = 'secumain'
        self.process_table_name = 'const_process_records'
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # 数据中心读数据库
        self._dc_conn = Connection(
            host=DC_HOST,
            port=DC_PORT,
            user=DC_USER,
            password=DC_PASSWD,
            database=DC_DB,
        )

        self.today = datetime.datetime.now()
        self.today_of_lastyear = self.today - datetime.timedelta(days=365)
        self.innercode_map = None
        self.temp_fields = [
            'event_code',
            'code',
            'date',
            'd1rate',
            'd2rate',
            'd3rate',
            'd4rate',
            'd5rate',
            'd1acc',
            'd2acc',
            'd3acc',
            'd5acc',
        ]

    def innercode_map_init(self):
        if not self.innercode_map:
            self.innercode_map = self.get_inner_code_map()

    def get_inner_code_map(self):
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table_name, self.source_table_name)
        __map = {}
        try:
            res = self._yuqing_conn.query(sql)
            for r in res:
                __map[r.get("SecuCode")] = r.get("InnerCode")
            return __map
        except:
            traceback.print_exc()
            return {}

    def const_event_codes(self):
        sql = '''select distinct(EventCode) from {};'''.format(
            self.target_table_name)
        try:
            res = self._yuqing_conn.query(sql)
            event_codes = [one.get("EventCode") for one in res]
            return event_codes
        except:
            traceback.print_exc()
            return []

    def get_event_detail(self, event_code: str):
        sql = '''select PubTime, SecuCode from {} where EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            self.source_table_name, event_code, self.today_of_lastyear,
            self.today)
        print(sql)
        try:
            res = self._yuqing_conn.query(sql)
            return res
        except:
            traceback.print_exc()
            return []

    def get_fivedays_changepercactual(self, secucode: str,
                                      dt: datetime.datetime):
        innercode = self.innercode_map.get(secucode)
        if not innercode:
            return []
        sql = '''select Date, ChangePercActual from {} where InnerCode = '{}' and Date >= '{}' \
order by Date limit 5;'''.format(self.dc_table_name, innercode, dt)
        try:
            res = self._dc_conn.query(sql)
            return res
        except:
            traceback.print_exc()
            return []

    def generate_changepercactual_index(self, index_datas):
        """计算单只证券的 次\3\5日 累计涨幅"""
        cpt_scores = [
            float(data.get("ChangePercActual")) / 100 for data in index_datas
        ]
        # print("&&&&&", cpt_scores)

        days_len = len(cpt_scores)
        x = y = z = m = n = None

        if days_len == 0 or days_len == 1:
            return []

        elif days_len == 2:
            x, y = cpt_scores
        elif days_len == 3:
            x, y, z = cpt_scores
        elif days_len == 4:
            x, y, z, m = cpt_scores
        else:
            x, y, z, m, n = cpt_scores
        # print("*****", x, y, z, m, n)
        ret1 = ret2 = ret3 = None
        # 次日累计涨幅
        ret1 = (1 + x) * (1 + y) - 1
        # 3 日累计涨幅
        if z is not None:
            ret2 = (1 + x) * (1 + y) * (1 + z) - 1
            if m is not None and n is not None:
                # 5 日累计涨幅
                ret3 = (1 + x) * (1 + y) * (1 + z) * (1 + m) * (1 + n) - 1
        # print(">>>>>", ret1, ret2, ret3)
        return [ret1 * 100, ret2 * 100, ret3 * 100]

    def crate_middle_process_table(self):
        """记录中间数据的临时表, 方便校对"""
        sql = '''
CREATE TABLE IF NOT EXISTS `const_process_records`(
   `event_code` varchar(50) NOT NULL COMMENT '事件代码',
   `code` varchar(50) NOT NULL COMMENT '事件代码',
   `date` datetime NOT NULL COMMENT '日期',
   `d1rate` decimal(10,4) NOT NULL,
   `d2rate` decimal(10,4) NOT NULL,
   `d3rate` decimal(10,4) NOT NULL,
   `d4rate` decimal(10,4) NOT NULL,
   `d5rate` decimal(10,4) NOT NULL,
   `d1acc` decimal(10,4) NOT NULL,
   `d2acc` decimal(10,4) NOT NULL,
   `d3acc` decimal(10,4) NOT NULL,
   `d5acc` decimal(10,4) NOT NULL,
  `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
  `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
   UNIQUE KEY `u1` (`event_code`,`code`, `date`) 
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
        '''
        print(sql)
        self._yuqing_conn.insert(sql)

    def process_single_eventcode(self, eventcode: str):
        print("NOW IS:", eventcode)
        record = {}
        record["EventCode"] = eventcode
        # (3) 对于某一个事件来说, 近一年发生该事件的证券以及发生时间列表
        event_detail_info = self.get_event_detail(eventcode)
        print(f"EVENT DETAIL COUNT: {len(event_detail_info)}")
        event_count = len(event_detail_info)
        if event_count == 0:
            print(f"NO RVENT OF CODE {eventcode}")
            return None
        winlist_onday = []  # 针对事件全部股票列表在时间发生时间的当日胜率与次日胜率
        winlist_nextday = []

        total_rate = 0
        total_rate2 = 0  # 针对事件全部股票列表在时间发生时间的次\3\5日平均涨幅
        total_rate3 = 0
        total_rate5 = 0

        for one in event_detail_info:
            happen_dt = one.get("PubTime")
            secuCode = one.get("SecuCode")
            temp_record = dict()
            # (4) 获取单只证券在发生时间后(包括当日)的5日涨幅
            fiveday_rateinfo = self.get_fivedays_changepercactual(
                secuCode, happen_dt)

            if fiveday_rateinfo == list() or len(fiveday_rateinfo) != 5:
                print(f"{secuCode} - {happen_dt} 5 日数据不足")
                continue
            temp_record['event_code'] = eventcode
            temp_record['code'] = secuCode
            temp_record['date'] = happen_dt
            temp_record['d1rate'] = fiveday_rateinfo[0].get(
                "ChangePercActual")  # type Decimal
            temp_record['d2rate'] = fiveday_rateinfo[1].get("ChangePercActual")
            temp_record['d3rate'] = fiveday_rateinfo[2].get("ChangePercActual")
            temp_record['d4rate'] = fiveday_rateinfo[3].get("ChangePercActual")
            temp_record['d5rate'] = fiveday_rateinfo[4].get("ChangePercActual")

            winlist_onday.append(
                float(fiveday_rateinfo[0].get("ChangePercActual")))
            winlist_nextday.append(
                float(fiveday_rateinfo[1].get("ChangePercActual")))
            # (5) 计算单只证券的 次\3\5日 累计涨幅
            accumulated_rate2, accumulated_rate3, accumulated_rate5 = self.generate_changepercactual_index(
                fiveday_rateinfo)
            total_rate2 += accumulated_rate2
            total_rate3 += accumulated_rate3
            total_rate5 += accumulated_rate5
            total_rate += float(fiveday_rateinfo[0].get("ChangePercActual"))

            temp_record['d1acc'] = temp_record['d1rate']
            temp_record['d2acc'] = accumulated_rate2
            temp_record['d3acc'] = accumulated_rate3
            temp_record['d5acc'] = accumulated_rate5
            self._yuqing_conn.table_insert(self.process_table_name,
                                           temp_record)

        # (6) 计算当日胜率
        on_count = 0
        for rate in winlist_onday:
            if rate > 0:
                on_count += 1
        onday_winrate = on_count / len(winlist_onday)
        # (7) 计算次日胜率
        next_count = 0
        for rate in winlist_nextday:
            if rate > 0:
                next_count += 1
        nextday_winrate = next_count / len(winlist_nextday)
        # (8) 计算平均涨幅
        average_rate = total_rate / event_count
        average_rate2 = total_rate2 / event_count
        average_rate3 = total_rate3 / event_count
        average_rate5 = total_rate5 / event_count

        record['EventDayChgPerc'] = average_rate
        record['NextDayChgPerc'] = average_rate2
        record['ThreeDayChgPerc'] = average_rate3
        record['FiveDayChgPerc'] = average_rate5
        record['EventDayWinRatio'] = onday_winrate
        record['NextDayWinRatio'] = nextday_winrate
        print(pprint.pformat(record))
        if record is not None:
            self.save_record(record)
        return record

    def save_record(self, record):
        item = copy.deepcopy(record)
        item.pop("EventCode")
        self._yuqing_conn.table_update(self.target_table_name, item,
                                       "EventCode", record.get("EventCode"))

    def launch(self):
        self.crate_middle_process_table()
        self.innercode_map_init()

        eventcode_lst = sorted(self.const_event_codes())
        for event_code in eventcode_lst:
            print(event_code)
            self.process_single_eventcode(event_code)
class JuchaoLiveNewsSpider(object):
    """巨潮快讯爬虫"""
    def __init__(self):
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """
        从聚源数据库获取证券代码对应的中文简称
        :param code: 无前缀的证券代码
        :return:
        """
        sql = f'''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.get(sql, code).get("SecuAbbr")
        return name

    def _create_table(self):
        """
        在爬虫数据库中进行巨潮快讯(juchao_kuaixun)的建表操作
        若已经建表则忽略
        :return:
        """
        sql = '''
         CREATE TABLE IF NOT EXISTS `{}` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `code` varchar(8) DEFAULT NULL COMMENT '证券代码',
          `name` varchar(16) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '证券简称', 
          `pub_date` datetime NOT NULL COMMENT '发布时间',
          `title` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯标题',
          `type` varchar(16) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯类别',
          `link` varchar(256) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '公告详情页链接',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `date_title` (`pub_date`, `title`),
          KEY `pub_date` (`pub_date`),
          KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='{}'; 
        '''.format(self.table_name, self.name)
        self._spider_conn.execute(sql)

    def get_redit_link(self, link):
        """获取最终重定向后的网址"""
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[len(redit_list) - 1].headers["location"]
        except IndexError:
            redit_link = link
        except:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        """请求 超时\被ban 时重试"""
        print(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def start(self):
        """启动入口"""
        self._create_table()
        end_day = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)
        # 历史(只有最近半年的)
        # start_day = datetime.datetime(2020, 6, 1)  # 确定一个历史的开始时间节点
        start_day = end_day    # 定时增量后每次只重刷当天的数据即可

        _day = start_day
        while _day <= end_day:
            _day_str = _day.strftime("%Y-%m-%d")
            resp = self.my_get(self.api_url.format(_day_str))
            if resp and resp.status_code == 200:
                text = resp.text
                datas = json.loads(text)
                if not datas:
                    print("{} 无公告数据".format(_day_str))
                else:
                    for data in datas:
                        print(data)
                        item = {}
                        # 需要保存的字段: 快讯的发布详细时间、类型、标题、地址、股票代码、股票名称
                        announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                        announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                        item['pub_date'] = announcementTime

                        item['type'] = data.get("type")
                        item['title'] = data.get("title")
                        page_path = data.get("pagePath")
                        if page_path is None:
                            link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                                data.get("textId"), _day_str)
                        else:
                            try:
                                link = self.get_redit_link(page_path)
                            except:
                                link = None
                        if not link:
                            continue
                        item['link'] = link
                        code = data.get("code")
                        if code:
                            item['code'] = code    # 无前缀的证券代码
                            item['name'] = self.get_secu_abbr(code)
                        print(item)
                        self._spider_conn.table_insert(self.table_name, item)
            _day += datetime.timedelta(days=1)
class JuchaoFinanceSpider(object):
    """巨潮历史公告爬虫 """
    def __init__(self):
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.fields = ['SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc']
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://www.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self.history_table_name = 'juchao_ant_finance'  # 巨潮历史公告(财务相关)表

    def _create_table(self):
        create_sql = '''
         CREATE TABLE IF NOT EXISTS `juchao_ant_finance` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `SecuCode` varchar(8) NOT NULL COMMENT '证券代码',
          `SecuAbbr` varchar(16) NOT NULL COMMENT '证券代码',
          `Category` varchar(32) NOT NULL COMMENT '财报类型',
          `AntId` int(20) NOT NULL COMMENT '巨潮自带公告 ID',
          `AntTime` datetime NOT NULL COMMENT '发布时间',
          `AntTitle` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯标题',
          `AntDoc` varchar(256) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '公告详情页链接',
          `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
          `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
          PRIMARY KEY (`id`),
          UNIQUE KEY `ant_id` (`AntId`),
          KEY `ant_time` (`AntTime`),
          KEY `secucode` (`SecuCode`),
          KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨潮个股财务类公告' ; 
        '''
        self._spider_conn.insert(create_sql)

    @retrying.retry(stop_max_attempt_number=10)
    def my_post(self, link: str, post_data):
        resp = requests.post(link, headers=self.headers, data=post_data, timeout=10)
        return resp

    def start(self, start_date=None, end_date=None):
        '''网站限制 一次最多只有一年的数据'''
        self._create_table()

        if start_date is None:
            start_date = datetime.datetime.today() - datetime.timedelta(days=10)
        if end_date is None:
            end_date = datetime.datetime.today()

        se_date = "{}~{}".format(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))
        print(se_date)

        for page in range(10):
            print("page >> {}".format(page))
            for category in ('category_ndbg_szsh',
                             'category_bndbg_szsh',
                             'category_yjdbg_szsh',
                             'category_sjdbg_szsh',
                             'category_yjygjxz_szsh',
                             # 'category_yjkb_szsh',    # 自定义业绩快报类型
                             ):
                post_data = {
                    'pageNum': page,
                    'pageSize': 30,
                    'column': 'szse',
                    'tabName': 'fulltext',
                    'plate': '',
                    'stock': '',
                    'searchkey': '',
                    'secid': '',
                    # 财务包含以下几个版块: 年报、半年报、一季报、三季报、业绩预告
                    # 年报: category_ndbg_szsh
                    # 半年报: category_bndbg_szsh
                    # 一季报: category_yjdbg_szsh
                    # 三季报: category_sjdbg_szsh
                    # 业绩预告: category_yjygjxz_szsh
                    # 'category': 'category_ndbg_szsh;category_bndbg_szsh;category_yjdbg_szsh;category_sjdbg_szsh;category_yjygjxz_szsh',
                    'category': category,
                    'trade': '',
                    'seDate': se_date,
                    'sortName': '',
                    'sortType': '',
                    'isHLtitle': True,
                }
                # resp = requests.post(self.api, headers=self.headers, data=post_data, timeout=3)
                resp = self.my_post(self.api, post_data)
                if resp.status_code == 200:
                    text = resp.text
                    if text == '':
                        break

                    py_datas = json.loads(text)
                    ants = py_datas.get("announcements")
                    if ants is None:
                        break

                    for ant in ants:
                        item = dict()
                        item['SecuCode'] = ant.get('secCode')
                        item['SecuAbbr'] = ant.get('secName')
                        item['AntId'] = ant.get("announcementId")
                        item['AntTitle'] = ant.get("announcementTitle")
                        item['Category'] = category
                        time_stamp = ant.get("announcementTime") / 1000
                        item.update({'AntTime': datetime.datetime.fromtimestamp(time_stamp)})
                        item.update({'AntDoc': 'http://static.cninfo.com.cn/' + ant.get("adjunctUrl")})
                        # print(item)
                        self._spider_conn.table_insert(self.history_table_name, item)
                else:
                    print(resp)

        # 搜索业绩快报关键字
        # searchkey: 业绩快报
        for page in range(10):
            print("page >> {}".format(page))
            post_data = {
                'pageNum': page,
                'pageSize': 30,
                'column': 'szse',
                'tabName': 'fulltext',
                'plate': '',
                'stock': '',
                'searchkey': '业绩快报',
                'secid': '',
                'category': '',
                'trade': '',
                'seDate': se_date,
                'sortName': '',
                'sortType': '',
                'isHLtitle': True,
            }
            resp = self.my_post(self.api, post_data)
            if resp.status_code == 200:
                text = resp.text
                if text == '':
                    break

                py_datas = json.loads(text)
                ants = py_datas.get("announcements")
                if ants is None:
                    break

                for ant in ants:
                    item = dict()
                    item['SecuCode'] = ant.get('secCode')
                    item['SecuAbbr'] = ant.get('secName')
                    item['AntId'] = ant.get("announcementId")
                    item['AntTitle'] = ant.get("announcementTitle")
                    item['Category'] = 'category_yjkb_szsh'
                    time_stamp = ant.get("announcementTime") / 1000
                    item.update({'AntTime': datetime.datetime.fromtimestamp(time_stamp)})
                    item.update({'AntDoc': 'http://static.cninfo.com.cn/' + ant.get("adjunctUrl")})
                    print(item)
                    self._spider_conn.table_insert(self.history_table_name, item)
            else:
                print(resp)
Beispiel #13
0
class JuchaoHistorySpider(object):
    """巨潮历史公告爬虫 """
    def __init__(self):
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.fields = [
            'SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc'
        ]
        self.headers = {
            'Accept':
            '*/*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control':
            'no-cache',
            'Connection':
            'keep-alive',
            'Content-Type':
            'application/x-www-form-urlencoded; charset=UTF-8',
            'Host':
            'www.cninfo.com.cn',
            'Origin':
            'http://www.cninfo.com.cn',
            'Pragma':
            'no-cache',
            'Referer':
            'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self.history_table_name = 'juchao_ant'  # 巨潮历史公告表

    def start(self, start_date=None):
        if start_date is None:
            start_date = datetime.datetime.today() - datetime.timedelta(
                days=10)

        end_date = datetime.datetime.today()
        se_date = "{}~{}".format(start_date.strftime("%Y-%m-%d"),
                                 end_date.strftime("%Y-%m-%d"))
        print(se_date)

        for page in range(1000):
            print("page >> {}".format(page))
            post_data = {
                'pageNum': page,
                'pageSize': 30,
                'column': 'szse',
                'tabName': 'fulltext',
                'plate': '',
                'stock': '',
                'searchkey': '',
                'secid': '',
                'category': '',
                'trade': '',
                'seDate': se_date,
                'sortName': '',
                'sortType': '',
                'isHLtitle': True,
            }
            resp = requests.post(self.api,
                                 headers=self.headers,
                                 data=post_data,
                                 timeout=3)
            if resp.status_code == 200:
                text = resp.text
                if text == '':
                    break

                py_datas = json.loads(text)
                ants = py_datas.get("announcements")
                if ants is None:
                    break

                for ant in ants:
                    item = dict()
                    item['SecuCode'] = ant.get('secCode')
                    item['SecuAbbr'] = ant.get('secName')
                    item['AntId'] = ant.get("announcementId")
                    item['AntTitle'] = ant.get("announcementTitle")
                    time_stamp = ant.get("announcementTime") / 1000
                    item.update({
                        'AntTime':
                        datetime.datetime.fromtimestamp(time_stamp)
                    })
                    item.update({
                        'AntDoc':
                        'http://static.cninfo.com.cn/' + ant.get("adjunctUrl")
                    })
                    print(item)
                    self._spider_conn.table_insert(self.history_table_name,
                                                   item)
            else:
                print(resp)