def __init__(self, start_time: datetime.datetime = None, end_time: datetime.datetime = None, ):
    """Set up table names, lookup caches and DB connections.

    :param start_time: optional lower bound of the processing window
    :param end_time: optional upper bound of the processing window
    """
    # Source / target / helper table names.
    self.source_table = 'dc_ann_event_source_ann_detail'
    self.target_table = 'sf_secu_announcement_detail'
    self.tool_table = "secumain"
    self.trading_table = 'tradingday'
    # Lazily-populated lookup caches (code -> inner code / industry / media weight).
    self.codes_map = {}
    self.industry_map = {}
    self.med_inf_map = {}
    # Processing window and cached sorted trading-day list.
    self.start_time = start_time
    self.end_time = end_time
    self.trading_days = None
    # Public-opinion (yuqing) database.
    self._yuqing_conn = Connection(
        host=YQ_HOST,
        port=YQ_PORT,
        user=YQ_USER,
        password=YQ_PASSWD,
        database=YQ_DB,
    )
    # Theme-hunter database.
    self._theme_conn = Connection(
        host=THE_HOST,
        port=THE_PORT,
        user=THE_USER,
        password=THE_PASSWD,
        database=THE_DB,
    )
def __init__(self):
    """Set up table names, lookup caches and the yuqing DB connection."""
    # Source / target / helper table names.
    self.source_table = 'dc_ann_event_source_ann_detail'
    self.target_table = 'sf_secu_announcement_summary'
    self.tool_table = "secumain"
    self.trading_table = 'tradingday'
    self.const_table = 'sf_const_announcement'
    # Caches: SecuCode -> InnerCode, EventCode -> Sentiment.
    self.codes_map = {}
    self.sent_map = {}
    # Public-opinion (yuqing) database.
    self._yuqing_conn = Connection(
        host=YQ_HOST,
        port=YQ_PORT,
        user=YQ_USER,
        password=YQ_PASSWD,
        database=YQ_DB,
    )
def __init__(self):
    """Set up the merge target table name, batch size and DB connections."""
    self.merge_table_name = 'announcement_base'
    self.batch_number = 1000
    # Public-opinion (yuqing) database: merge target.
    self._yuqing_conn = Connection(
        host=YQ_HOST,
        port=YQ_PORT,
        user=YQ_USER,
        password=YQ_PASSWD,
        database=YQ_DB,
    )
    # Read-only spider database: merge source.
    self._r_spider_conn = Connection(
        host=SPIDER_MYSQL_HOST2,
        port=SPIDER_MYSQL_PORT2,
        user=SPIDER_MYSQL_USER2,
        password=SPIDER_MYSQL_PASSWORD2,
        database=SPIDER_MYSQL_DB2,
    )
def __init__(self):
    """Set up table names, DB connections and the one-year lookback window."""
    self.dc_table_name = 'stk_quot_idx'  # daily quote indicator table (datacenter DB)
    self.target_table_name = 'sf_const_announcement'  # target: announcement event constants
    self.source_table_name = 'dc_ann_event_source_ann_detail'  # source: announcement details
    self.tool_table_name = 'secumain'
    self.process_table_name = 'const_process_records'
    # Public-opinion (yuqing) database.
    self._yuqing_conn = Connection(
        host=YQ_HOST,
        port=YQ_PORT,
        user=YQ_USER,
        password=YQ_PASSWD,
        database=YQ_DB,
    )
    # Datacenter read-only database.
    self._dc_conn = Connection(
        host=DC_HOST,
        port=DC_PORT,
        user=DC_USER,
        password=DC_PASSWD,
        database=DC_DB,
    )
    # Lookback window: the past 365 days.
    self.today = datetime.datetime.now()
    self.today_of_lastyear = self.today - datetime.timedelta(days=365)
    self.innercode_map = None  # SecuCode -> InnerCode, filled lazily
    # Columns of the intermediate verification table.
    self.temp_fields = [
        'event_code', 'code', 'date',
        'd1rate', 'd2rate', 'd3rate', 'd4rate', 'd5rate',
        'd1acc', 'd2acc', 'd3acc', 'd5acc',
    ]
def __init__(self):
    """Set up the query endpoint, request headers and spider DB connection."""
    self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
    # Fields persisted for each announcement row.
    self.fields = [
        'SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc'
    ]
    # Browser-like headers; the endpoint rejects bare clients.
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
    }
    self._spider_conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    self.history_table_name = 'juchao_ant'  # Juchao historical announcement table
def __init__(self):
    """Set up the quick-news endpoints, table name and DB connections."""
    # Human-facing page and the JSON API backing it.
    self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
    self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
    # Fields persisted for each quick-news item.
    self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
    self.table_name = 'juchao_kuaixun'
    self.name = '巨潮快讯'
    # Spider database (write target).
    self._spider_conn = Connection(
        host=SPIDER_MYSQL_HOST,
        port=SPIDER_MYSQL_PORT,
        user=SPIDER_MYSQL_USER,
        password=SPIDER_MYSQL_PASSWORD,
        database=SPIDER_MYSQL_DB,
    )
    # Juyuan database (read-only, for security abbreviations).
    self._juyuan_conn = Connection(
        host=JUY_HOST,
        port=JUY_PORT,
        user=JUY_USER,
        password=JUY_PASSWD,
        database=JUY_DB,
    )
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    }
class SourceAnnouncementBase(object):
    """Merge the two spider tables into the announcement base table."""

    def __init__(self):
        self.merge_table_name = 'announcement_base'
        self.batch_number = 1000
        # Public-opinion (yuqing) database: merge target.
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # Read-only spider database: merge source.
        self._r_spider_conn = Connection(
            host=SPIDER_MYSQL_HOST2,
            port=SPIDER_MYSQL_PORT2,
            user=SPIDER_MYSQL_USER2,
            password=SPIDER_MYSQL_PASSWORD2,
            database=SPIDER_MYSQL_DB2,
        )

    def launch(self):
        """Copy rows updated in the last 5 hours from juchao_ant, then patch
        matching quick-news (juchao_kuaixun) publish info onto them."""
        deadline = datetime.datetime.now() - datetime.timedelta(hours=5)
        load_sql = '''select id, SecuCode, SecuAbbr, AntTime as PubDatetime1, AntTitle as Title1, \
AntDoc as PDFLink, CREATETIMEJZ as InsertDatetime1 from juchao_ant where UPDATETIMEJZ > '{}'; '''.format(
            deadline)
        # Normalise the security code on each row; rows the helper rejects
        # come back as None and are skipped.
        for row in self._r_spider_conn.query(load_sql):
            row = utils.process_secucode(row)
            if row is None:
                continue
            print(row)
            self._yuqing_conn.table_insert(self.merge_table_name, row)
        update_sql = '''select A.* from juchao_kuaixun A, juchao_ant B where A.pub_date > '{}' \
and A.code = B.SecuCode and A.link = B.AntDoc and A.type = '公告'; '''.format(
            deadline)
        # Quick-news rows matched by PDF link enrich the merged record.
        for row in self._r_spider_conn.query(update_sql):
            print(row)
            patch = {
                'PubDatetime2': row.get("pub_date"),
                'InsertDatetime2': row.get("CREATETIMEJZ"),
                'Title2': row.get("title"),
            }
            self._yuqing_conn.table_update(self.merge_table_name, patch,
                                           'PDFLink', row.get("link"))
class FinalAntSummary(object):
    """Build the announcement summary table (sf_secu_announcement_summary)
    from the announcement detail source table."""

    def __init__(self):
        self.source_table = 'dc_ann_event_source_ann_detail'  # source: announcement details
        self.target_table = 'sf_secu_announcement_summary'    # target: summary table
        self.tool_table = "secumain"                          # security master (code -> InnerCode)
        self.trading_table = 'tradingday'                     # trading calendar
        self.const_table = 'sf_const_announcement'            # event constants (sentiment)
        self.codes_map = {}  # SecuCode -> InnerCode cache
        self.sent_map = {}   # EventCode -> Sentiment cache
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )

    def get_inner_code_map(self):
        """Cache SecuCode -> InnerCode for every code present in the source table."""
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table, self.source_table, )
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.codes_map[r.get('SecuCode')] = r.get('InnerCode')

    def get_sentiment_map(self):
        """Cache EventCode -> Sentiment from the constants table."""
        sql = '''select EventCode, Sentiment from {} ; '''.format(
            self.const_table)
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.sent_map[r.get('EventCode')] = r.get("Sentiment")

    def get_nearest_trading_day(self, day: datetime.datetime):
        """Walk backwards (including *day* itself) to the nearest trading day
        on the calendar with SecuMarket = 83."""
        while True:
            sql = '''select IfTradingDay from {} where Date = '{}' and SecuMarket = 83 ; '''.format(
                self.trading_table, day)
            is_trading_day = self._yuqing_conn.get(sql).get("IfTradingDay")
            if is_trading_day == 1:
                return day
            day -= datetime.timedelta(days=1)

    def get_update_endtime(self):
        """Latest publish time in the source table, or None when it is empty."""
        sql = '''select max(PubTime) as max_pub from {}; '''.format(
            self.source_table)
        try:
            max_pub = self._yuqing_conn.get(sql).get("max_pub")
        except Exception:  # was a bare except; narrowed so Ctrl-C still works
            max_pub = None
        return max_pub

    def get_update_starttime(self):
        """Latest trade date already written to the target, or None when empty."""
        sql = '''select max(TradeDate) as max_trd from {}; '''.format(
            self.target_table)
        try:
            max_trd = self._yuqing_conn.get(sql).get("max_trd")
        except Exception:  # was a bare except; narrowed so Ctrl-C still works
            max_trd = None
        return max_trd

    def daily_update(self):
        """Insert one summary row per announcement published between the last
        processed trade date and the newest publish time in the source."""
        update_end = self.get_update_endtime()
        update_start = self.get_update_starttime()
        print(update_start, update_end)
        if update_end is None:
            # Empty source table: nothing to process. The old code built a
            # "BETWEEN ... AND 'None'" query here.
            return
        if update_start is None:
            # First run (empty target): bootstrap from the epoch so the whole
            # source table is summarised instead of issuing a malformed query.
            update_start = datetime.datetime(1970, 1, 1)
        self.get_inner_code_map()
        self.get_sentiment_map()
        sql = '''select AnnID, SecuCode, PubTime, EventCode, Title, PDFLink from {} where PubTime between '{}' and '{}'; '''.format(
            self.source_table, update_start, update_end)
        datas = self._yuqing_conn.query(sql)
        for data in datas:
            ann_id = data.get("AnnID")
            pub_time = data.get("PubTime")
            event_code = data.get("EventCode")
            title = data.get("Title")
            link = data.get("PDFLink")
            secu_code = data.get("SecuCode")
            inner_code = self.codes_map.get(secu_code)
            # Map the publish time to the nearest previous trading day.
            pub_day = datetime.datetime(pub_time.year, pub_time.month, pub_time.day)
            trade_day = self.get_nearest_trading_day(pub_day)
            sentiment = self.sent_map.get(event_code)
            item = {
                'SecuCode': secu_code,
                'InnerCode': inner_code,
                'TradeDate': trade_day,
                'Sentiment': sentiment,
                'EventCode': event_code,
                'AnnID': ann_id,
                'AnnTitle': title,
                'Website': link,
            }
            print(item)
            self._yuqing_conn.table_insert(self.target_table, item)
class FinalAntDetail(object):
    """Build the announcement detail table (sf_secu_announcement_detail):
    per announcement, count related news/posts within the following five
    trading days and score media influence."""

    def __init__(self, start_time: datetime.datetime = None, end_time: datetime.datetime = None, ):
        """:param start_time: optional lower bound of the processing window
        :param end_time: optional upper bound of the processing window"""
        self.source_table = 'dc_ann_event_source_ann_detail'
        self.target_table = 'sf_secu_announcement_detail'
        self.tool_table = "secumain"
        self.trading_table = 'tradingday'
        self.codes_map = {}     # SecuCode -> InnerCode
        self.industry_map = {}  # SecuCode -> IndustryCode
        self.med_inf_map = {}   # MedName -> influence weight
        self.start_time = start_time
        self.end_time = end_time
        self.trading_days = None  # sorted trading-day cache
        # Public-opinion (yuqing) database.
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # Theme-hunter database.
        self._theme_conn = Connection(
            host=THE_HOST,
            port=THE_PORT,
            user=THE_USER,
            password=THE_PASSWD,
            database=THE_DB,
        )

    def get_inner_code_map(self):
        """Cache SecuCode -> InnerCode for every code present in the source table."""
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table, self.source_table, )
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.codes_map[r.get('SecuCode')] = r.get('InnerCode')

    def get_day(self, dt: datetime.datetime):
        """Truncate a datetime to midnight of the same day."""
        return datetime.datetime(dt.year, dt.month, dt.day)

    def get_industry_code_map(self):
        """Cache SecuCode -> IndustryCode from the theme DB (block type 1)."""
        sql = '''select A.code as IndustryCode, A.name as IndustryName, B.code as SecuCode, \
B.name as SecuAbbr from block A, block_code B where A.type = 1 and A.id = B.bid ;'''
        ret = self._theme_conn.query(sql)
        for r in ret:
            self.industry_map[r.get("SecuCode")] = r.get("IndustryCode")

    def get_all_trading_days(self, start_dt: datetime.datetime, end_dt: datetime.datetime):
        """Return all trading days (SecuMarket = 83) between the two dates, sorted.

        :param start_dt: window start (inclusive)
        :param end_dt: window end (inclusive)
        :return: sorted list of trading-day datetimes
        """
        sql = '''select Date from {} where SecuMarket = 83 and IfTradingDay = 1 and Date between '{}' and '{}'; '''.format(
            self.trading_table, start_dt, end_dt)
        trading_days = self._yuqing_conn.query(sql)
        trading_days = sorted([trading_day.get("Date") for trading_day in trading_days])
        return trading_days

    def get_after_five_trading_days(self, dt: datetime.datetime):
        """Return (first, fifth) trading day starting at *dt* inclusive.

        If *dt* is not a trading day, the next trading day is used as the
        anchor. Raises ValueError when *dt* is beyond the cached calendar
        (the previous implementation looped forever here); raises IndexError
        when fewer than five trading days remain in the cache.
        """
        if not self.trading_days:
            raise ValueError("trading_days cache is empty")
        last_day = self.trading_days[-1]
        while True:
            try:
                _index = self.trading_days.index(dt)
            except ValueError:
                dt += datetime.timedelta(days=1)
                if dt > last_day:
                    raise ValueError("no trading day on or after the given date")
            else:
                break
        return self.trading_days[_index], self.trading_days[_index + 4]

    def get_med_inf_map(self):
        """Cache MedName -> influence weight from dc_const_media_info."""
        sql = '''select MedName, InfluenceWeight from dc_const_media_info ; '''
        ret = self._yuqing_conn.query(sql)
        for r in ret:
            self.med_inf_map[r.get("MedName")] = int(r.get("InfluenceWeight"))

    def get_meds_scores(self, meds: list):
        """Total media influence score for a list of media names.

        Known media use their configured weight; unknown media score 1;
        anonymous media (None) also score 1.
        """
        total_score = 0
        r_meds = []
        for med in meds:
            if med is None:
                total_score += 1
            else:
                r_meds.append(med)
        if r_meds:
            total_score += sum(self.med_inf_map.get(r_med, 1) for r_med in r_meds)
        return total_score

    def get_news_num(self, secu_code: str, event_code: str,
                     min_trading_day: datetime.datetime,
                     max_trading_day: datetime.datetime):
        """Count related news published within the 5-trading-day window after
        the announcement and compute their total media influence score.

        :return: (news count, total influence score)
        """
        sql = '''select MedName from dc_ann_event_source_news_detail where SecuCode = '{}' \
and EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            secu_code, event_code, min_trading_day, max_trading_day)
        datas = self._yuqing_conn.query(sql)
        count = len(datas)
        total_scores = 0
        if count != 0:
            meds = [data.get("MedName") for data in datas]
            total_scores = self.get_meds_scores(meds)
        return count, total_scores

    def get_post_num(self, secu_code: str, event_code: str,
                     min_trading_day: datetime.datetime,
                     max_trading_day: datetime.datetime):
        """Count related forum (guba) posts within the same window."""
        sql = '''select count(*) as count from dc_ann_event_source_guba_detail where SecuCode = '{}' \
and EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            secu_code, event_code, min_trading_day, max_trading_day)
        count = self._yuqing_conn.get(sql).get("count")
        return count

    def process_data(self, data: dict):
        """Assemble one target-table row from a source announcement row."""
        secu_code = data.get("SecuCode")
        inner_code = self.codes_map.get(secu_code)
        event_code = data.get("EventCode")
        pub_time = data.get("PubTime")
        pub_date = self.get_day(pub_time)
        link = data.get("PDFLink")
        industry_code = self.industry_map.get(secu_code)
        min_trading_day, max_trading_day = self.get_after_five_trading_days(pub_date)
        item = {
            'SecuCode': secu_code,
            'InnerCode': inner_code,
            'EventCode': event_code,
            'PubDatetime': pub_time,
            'PubDate': pub_date,
            'Website': link,
            'IndustryCode': industry_code,
        }
        news_num, scores = self.get_news_num(secu_code, event_code,
                                             min_trading_day, max_trading_day)
        item['NewsNum'] = news_num
        item['Influence'] = scores
        item['PostNum'] = self.get_post_num(secu_code, event_code,
                                            min_trading_day, max_trading_day)
        return item

    def get_update_start(self):
        """Latest PubDatetime already in the target, or None when empty."""
        try:
            sql = '''select max(PubDatetime) as max_dt from {} ; '''.format(self.target_table)
            max_dt = self._yuqing_conn.get(sql).get("max_dt")
        except Exception:  # was a bare except; narrowed so Ctrl-C still works
            max_dt = None
        return max_dt

    def get_update_end(self):
        """Upper bound of the update window: now."""
        return datetime.datetime.now()

    def daily_update(self):
        """Re-process announcements published in the update window and insert
        the resulting detail rows."""
        update_start = self.get_update_start()
        update_end = self.get_update_end()
        if update_start is None:
            # First run (empty target table): the old code crashed here on
            # None - timedelta. Default to a one-week window.
            update_start = update_end - datetime.timedelta(days=7)
        else:
            # Re-scan a 7-day overlap so late-arriving news/posts are counted.
            update_start = update_start - datetime.timedelta(days=7)
        print(update_start, update_end)
        self.get_inner_code_map()
        self.get_industry_code_map()
        self.get_med_inf_map()
        if not self.trading_days:
            # Extend 30 days past the window so the +5-trading-day lookup works.
            self.trading_days = self.get_all_trading_days(
                update_start, update_end + datetime.timedelta(days=30))
        sql = '''select SecuCode, EventCode, PubTime, PDFLink from {} where PubTime between '{}' and '{}'; '''.format(
            self.source_table, update_start, update_end)
        datas = self._yuqing_conn.query(sql)
        for data in datas:
            item = self.process_data(data)
            print(item)
            self._yuqing_conn.table_insert(self.target_table, item)
class FinalConstAnn(object):
    """Refresh the announcement-event constants table (sf_const_announcement)
    with win ratios and average returns computed over the past year."""

    def __init__(self):
        self.dc_table_name = 'stk_quot_idx'  # daily quote indicator table (datacenter DB)
        self.target_table_name = 'sf_const_announcement'  # target: event constants
        self.source_table_name = 'dc_ann_event_source_ann_detail'  # source: announcement details
        self.tool_table_name = 'secumain'
        self.process_table_name = 'const_process_records'
        # Public-opinion (yuqing) database.
        self._yuqing_conn = Connection(
            host=YQ_HOST,
            port=YQ_PORT,
            user=YQ_USER,
            password=YQ_PASSWD,
            database=YQ_DB,
        )
        # Datacenter read-only database.
        self._dc_conn = Connection(
            host=DC_HOST,
            port=DC_PORT,
            user=DC_USER,
            password=DC_PASSWD,
            database=DC_DB,
        )
        # Lookback window: the past 365 days.
        self.today = datetime.datetime.now()
        self.today_of_lastyear = self.today - datetime.timedelta(days=365)
        self.innercode_map = None  # SecuCode -> InnerCode, filled lazily
        # Columns of the intermediate verification table.
        self.temp_fields = [
            'event_code', 'code', 'date',
            'd1rate', 'd2rate', 'd3rate', 'd4rate', 'd5rate',
            'd1acc', 'd2acc', 'd3acc', 'd5acc',
        ]

    def innercode_map_init(self):
        """Populate the inner-code map once, on first use."""
        if not self.innercode_map:
            self.innercode_map = self.get_inner_code_map()

    def get_inner_code_map(self):
        """Return {SecuCode: InnerCode} for codes present in the source table."""
        sql = '''select SecuCode, InnerCode from {} where SecuCode in (select distinct(SecuCode) from {}); '''.format(
            self.tool_table_name, self.source_table_name)
        __map = {}
        try:
            res = self._yuqing_conn.query(sql)
            for r in res:
                __map[r.get("SecuCode")] = r.get("InnerCode")
            return __map
        except Exception:  # was a bare except; narrowed so Ctrl-C still works
            traceback.print_exc()
            return {}

    def const_event_codes(self):
        """Return the distinct event codes present in the constants table."""
        sql = '''select distinct(EventCode) from {};'''.format(
            self.target_table_name)
        try:
            res = self._yuqing_conn.query(sql)
            event_codes = [one.get("EventCode") for one in res]
            return event_codes
        except Exception:
            traceback.print_exc()
            return []

    def get_event_detail(self, event_code: str):
        """Return (PubTime, SecuCode) rows for an event over the past year."""
        sql = '''select PubTime, SecuCode from {} where EventCode = '{}' and PubTime between '{}' and '{}' ;'''.format(
            self.source_table_name, event_code, self.today_of_lastyear, self.today)
        print(sql)
        try:
            res = self._yuqing_conn.query(sql)
            return res
        except Exception:
            traceback.print_exc()
            return []

    def get_fivedays_changepercactual(self, secucode: str, dt: datetime.datetime):
        """Return up to 5 daily (Date, ChangePercActual) rows for a security
        starting at *dt*; [] for unknown codes or on query failure."""
        innercode = self.innercode_map.get(secucode)
        if not innercode:
            return []
        sql = '''select Date, ChangePercActual from {} where InnerCode = '{}' and Date >= '{}' \
order by Date limit 5;'''.format(self.dc_table_name, innercode, dt)
        try:
            res = self._dc_conn.query(sql)
            return res
        except Exception:
            traceback.print_exc()
            return []

    def generate_changepercactual_index(self, index_datas):
        """Compute cumulative compounded returns for one security.

        :param index_datas: daily rows carrying a "ChangePercActual" percentage
        :return: [next-day, 3-day, 5-day] cumulative changes, each already
            scaled back to a percentage. Fewer than two input days yields [].
            Windows that cannot be computed from the given days are None —
            the previous implementation raised TypeError (None * 100) for any
            input shorter than five days.
        """
        rates = [float(data.get("ChangePercActual")) / 100 for data in index_datas]
        if len(rates) < 2:
            return []
        # Running compounded return: prod(1 + r_i) - 1, in input order (same
        # multiplication order as the original, so results match bit-for-bit).
        cumulative = []
        acc = 1.0
        for rate in rates:
            acc *= 1 + rate
            cumulative.append(acc - 1)
        ret1 = cumulative[1]
        ret2 = cumulative[2] if len(cumulative) >= 3 else None
        ret3 = cumulative[4] if len(cumulative) >= 5 else None
        return [v * 100 if v is not None else None for v in (ret1, ret2, ret3)]

    def crate_middle_process_table(self):
        """Create the intermediate verification table (idempotent)."""
        sql = '''
        CREATE TABLE IF NOT EXISTS `const_process_records`(
        `event_code` varchar(50) NOT NULL COMMENT '事件代码',
        `code` varchar(50) NOT NULL COMMENT '事件代码',
        `date` datetime NOT NULL COMMENT '日期',
        `d1rate` decimal(10,4) NOT NULL,
        `d2rate` decimal(10,4) NOT NULL,
        `d3rate` decimal(10,4) NOT NULL,
        `d4rate` decimal(10,4) NOT NULL,
        `d5rate` decimal(10,4) NOT NULL,
        `d1acc` decimal(10,4) NOT NULL,
        `d2acc` decimal(10,4) NOT NULL,
        `d3acc` decimal(10,4) NOT NULL,
        `d5acc` decimal(10,4) NOT NULL,
        `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
        `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
        UNIQUE KEY `u1` (`event_code`,`code`, `date`)
        )ENGINE=InnoDB DEFAULT CHARSET=utf8;
        '''
        print(sql)
        self._yuqing_conn.insert(sql)

    def process_single_eventcode(self, eventcode: str):
        """Compute win ratios and average returns for one event code, persist
        them, and return the record (None when no usable data)."""
        print("NOW IS:", eventcode)
        record = {"EventCode": eventcode}
        # (3) securities that triggered this event in the past year, with times
        event_detail_info = self.get_event_detail(eventcode)
        print(f"EVENT DETAIL COUNT: {len(event_detail_info)}")
        event_count = len(event_detail_info)
        if event_count == 0:
            print(f"NO EVENT OF CODE {eventcode}")  # typo fix: was "RVENT"
            return None
        winlist_onday = []    # event-day returns (for event-day win ratio)
        winlist_nextday = []  # next-day returns (for next-day win ratio)
        total_rate = 0
        total_rate2 = 0  # summed next/3/5-day cumulative returns
        total_rate3 = 0
        total_rate5 = 0
        for one in event_detail_info:
            happen_dt = one.get("PubTime")
            secuCode = one.get("SecuCode")
            temp_record = dict()
            # (4) 5-day returns starting at (and including) the event day
            fiveday_rateinfo = self.get_fivedays_changepercactual(
                secuCode, happen_dt)
            if len(fiveday_rateinfo) != 5:
                print(f"{secuCode} - {happen_dt} 5 日数据不足")
                continue
            temp_record['event_code'] = eventcode
            temp_record['code'] = secuCode
            temp_record['date'] = happen_dt
            temp_record['d1rate'] = fiveday_rateinfo[0].get("ChangePercActual")  # Decimal
            temp_record['d2rate'] = fiveday_rateinfo[1].get("ChangePercActual")
            temp_record['d3rate'] = fiveday_rateinfo[2].get("ChangePercActual")
            temp_record['d4rate'] = fiveday_rateinfo[3].get("ChangePercActual")
            temp_record['d5rate'] = fiveday_rateinfo[4].get("ChangePercActual")
            winlist_onday.append(float(fiveday_rateinfo[0].get("ChangePercActual")))
            winlist_nextday.append(float(fiveday_rateinfo[1].get("ChangePercActual")))
            # (5) next/3/5-day cumulative returns for this security
            accumulated_rate2, accumulated_rate3, accumulated_rate5 = \
                self.generate_changepercactual_index(fiveday_rateinfo)
            total_rate2 += accumulated_rate2
            total_rate3 += accumulated_rate3
            total_rate5 += accumulated_rate5
            total_rate += float(fiveday_rateinfo[0].get("ChangePercActual"))
            temp_record['d1acc'] = temp_record['d1rate']
            temp_record['d2acc'] = accumulated_rate2
            temp_record['d3acc'] = accumulated_rate3
            temp_record['d5acc'] = accumulated_rate5
            self._yuqing_conn.table_insert(self.process_table_name, temp_record)
        if not winlist_onday:
            # Every security lacked 5-day quote data; the old code raised
            # ZeroDivisionError below in this case.
            print(f"NO USABLE QUOTE DATA FOR {eventcode}")
            return None
        # (6) event-day win ratio
        onday_winrate = sum(1 for rate in winlist_onday if rate > 0) / len(winlist_onday)
        # (7) next-day win ratio
        nextday_winrate = sum(1 for rate in winlist_nextday if rate > 0) / len(winlist_nextday)
        # (8) average returns
        # NOTE(review): denominators use event_count, which still counts the
        # securities skipped above — preserved from the original; verify intent.
        record['EventDayChgPerc'] = total_rate / event_count
        record['NextDayChgPerc'] = total_rate2 / event_count
        record['ThreeDayChgPerc'] = total_rate3 / event_count
        record['FiveDayChgPerc'] = total_rate5 / event_count
        record['EventDayWinRatio'] = onday_winrate
        record['NextDayWinRatio'] = nextday_winrate
        print(pprint.pformat(record))
        self.save_record(record)
        return record

    def save_record(self, record):
        """Update the constants row keyed by EventCode with the computed stats."""
        item = copy.deepcopy(record)
        item.pop("EventCode")
        self._yuqing_conn.table_update(self.target_table_name, item,
                                       "EventCode", record.get("EventCode"))

    def launch(self):
        """Entry point: ensure the scratch table exists, then refresh every
        known event code."""
        self.crate_middle_process_table()
        self.innercode_map_init()
        eventcode_lst = sorted(self.const_event_codes())
        for event_code in eventcode_lst:
            print(event_code)
            self.process_single_eventcode(event_code)
class JuchaoLiveNewsSpider(object):
    """Juchao (cninfo) live quick-news spider."""

    def __init__(self):
        # Human-facing page and the JSON API backing it.
        self.web_url = 'http://www.cninfo.com.cn/new/commonUrl/quickNews?url=/disclosure/quickNews&queryDate=2020-08-13'
        self.api_url = 'http://www.cninfo.com.cn/new/quickNews/queryQuickNews?queryDate={}&type='
        # Fields persisted for each quick-news item.
        self.fields = ['code', 'name', 'link', 'title', 'type', 'pub_date']
        self.table_name = 'juchao_kuaixun'
        self.name = '巨潮快讯'
        # Spider database (write target).
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        # Juyuan database (read-only, for security abbreviations).
        self._juyuan_conn = Connection(
            host=JUY_HOST,
            port=JUY_PORT,
            user=JUY_USER,
            password=JUY_PASSWD,
            database=JUY_DB,
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        }

    def get_secu_abbr(self, code):
        """Fetch the Chinese abbreviation of a security from the Juyuan DB.

        :param code: security code without market prefix
        :return: the abbreviation (SecuAbbr)
        """
        sql = f'''select SecuAbbr from secumain where secucode=%s;'''
        name = self._juyuan_conn.get(sql, code).get("SecuAbbr")
        return name

    def _create_table(self):
        """Create the juchao_kuaixun table in the spider database.

        A no-op when the table already exists.
        """
        sql = '''
        CREATE TABLE IF NOT EXISTS `{}` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `code` varchar(8) DEFAULT NULL COMMENT '证券代码',
        `name` varchar(16) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '证券简称',
        `pub_date` datetime NOT NULL COMMENT '发布时间',
        `title` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯标题',
        `type` varchar(16) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯类别',
        `link` varchar(256) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '公告详情页链接',
        `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
        `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
        PRIMARY KEY (`id`),
        UNIQUE KEY `date_title` (`pub_date`, `title`),
        KEY `pub_date` (`pub_date`),
        KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='{}';
        '''.format(self.table_name, self.name)
        self._spider_conn.execute(sql)

    def get_redit_link(self, link):
        """Follow redirects and return the final URL.

        Returns the input link unchanged when there was no redirect, and
        None on any other failure.
        """
        resp = self.my_get(link)
        redit_list = resp.history
        try:
            redit_link = redit_list[len(redit_list) - 1].headers["location"]
        except IndexError:
            # No redirect happened: keep the original link.
            redit_link = link
        except:
            return None
        return redit_link

    @retry(stop_max_attempt_number=5)
    def my_get(self, link):
        """GET with retry (up to 5 attempts) for timeouts / bans."""
        print(f'get in {link} .. ')
        resp = requests.get(link, headers=self.headers, timeout=5)
        return resp

    def start(self):
        """Entry point: crawl quick-news day by day and insert each item."""
        self._create_table()
        end_day = datetime.datetime.combine(datetime.datetime.now(), datetime.time.min)
        # Historical backfill (the site only keeps about half a year):
        # start_day = datetime.datetime(2020, 6, 1)  # pick a historical start date
        start_day = end_day  # incremental runs only re-crawl today's data
        _day = start_day
        while _day <= end_day:
            _day_str = _day.strftime("%Y-%m-%d")
            resp = self.my_get(self.api_url.format(_day_str))
            if resp and resp.status_code == 200:
                text = resp.text
                datas = json.loads(text)
                if not datas:
                    print("{} 无公告数据".format(_day_str))
                else:
                    for data in datas:
                        print(data)
                        item = {}
                        # Persisted fields: exact publish time, type, title,
                        # link, security code and security name.
                        # announcementTime is epoch milliseconds.
                        announcementTime = time.localtime(int(data.get("announcementTime") / 1000))
                        announcementTime = time.strftime("%Y-%m-%d %H:%M:%S", announcementTime)
                        item['pub_date'] = announcementTime
                        item['type'] = data.get("type")
                        item['title'] = data.get("title")
                        page_path = data.get("pagePath")
                        if page_path is None:
                            # No page path: build the detail URL from the text id.
                            link = '''http://www.cninfo.com.cn/new/disclosure/detail?stock=&announcementId={}&announcementTime={}'''.format(
                                data.get("textId"), _day_str)
                        else:
                            # Resolve the redirect chain; skip the item when
                            # the final link cannot be determined.
                            try:
                                link = self.get_redit_link(page_path)
                            except:
                                link = None
                            if not link:
                                continue
                        item['link'] = link
                        code = data.get("code")
                        if code:
                            item['code'] = code  # security code without market prefix
                            item['name'] = self.get_secu_abbr(code)
                        print(item)
                        self._spider_conn.table_insert(self.table_name, item)
            _day += datetime.timedelta(days=1)
class JuchaoFinanceSpider(object):
    """Juchao (cninfo) spider for historical finance-related announcements
    (annual/semi-annual/quarterly reports and earnings forecasts/flashes)."""

    def __init__(self):
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.fields = ['SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc']
        # Browser-like headers; the endpoint rejects bare clients.
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://www.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self.history_table_name = 'juchao_ant_finance'  # finance-announcement table

    def _create_table(self):
        """Create the juchao_ant_finance table (idempotent)."""
        # NOTE(review): the DDL comment on `SecuAbbr` says '证券代码' (code),
        # which looks like a copy-paste slip for '证券简称' (abbreviation);
        # left unchanged because the comment lives in the database schema.
        create_sql = '''
        CREATE TABLE IF NOT EXISTS `juchao_ant_finance` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `SecuCode` varchar(8) NOT NULL COMMENT '证券代码',
        `SecuAbbr` varchar(16) NOT NULL COMMENT '证券代码',
        `Category` varchar(32) NOT NULL COMMENT '财报类型',
        `AntId` int(20) NOT NULL COMMENT '巨潮自带公告 ID',
        `AntTime` datetime NOT NULL COMMENT '发布时间',
        `AntTitle` varchar(128) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '资讯标题',
        `AntDoc` varchar(256) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL COMMENT '公告详情页链接',
        `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
        `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
        PRIMARY KEY (`id`),
        UNIQUE KEY `ant_id` (`AntId`),
        KEY `ant_time` (`AntTime`),
        KEY `secucode` (`SecuCode`),
        KEY `update_time` (`UPDATETIMEJZ`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='巨潮个股财务类公告' ;
        '''
        self._spider_conn.insert(create_sql)

    @retrying.retry(stop_max_attempt_number=10)
    def my_post(self, link: str, post_data):
        """POST with retry (up to 10 attempts) for timeouts / bans."""
        resp = requests.post(link, headers=self.headers, data=post_data, timeout=10)
        return resp

    def _build_post_data(self, page: int, se_date: str, category: str = '', searchkey: str = ''):
        """Assemble the query form for one result page.

        :param page: page number passed through as pageNum
        :param se_date: "YYYY-MM-DD~YYYY-MM-DD" date range
        :param category: announcement category filter ('' for none)
        :param searchkey: full-text keyword filter ('' for none)
        """
        return {
            'pageNum': page,
            'pageSize': 30,
            'column': 'szse',
            'tabName': 'fulltext',
            'plate': '',
            'stock': '',
            'searchkey': searchkey,
            'secid': '',
            'category': category,
            'trade': '',
            'seDate': se_date,
            'sortName': '',
            'sortType': '',
            'isHLtitle': True,
        }

    def _store_page(self, post_data, category: str):
        """Fetch one result page and insert its announcements.

        :param post_data: form built by _build_post_data
        :param category: category label stored on each row
        :return: False when the server signals no more data (caller should
            stop paging this query), True otherwise. HTTP errors are printed
            and treated as "keep paging", matching the original behaviour.
        """
        resp = self.my_post(self.api, post_data)
        if resp.status_code != 200:
            print(resp)
            return True
        text = resp.text
        if text == '':
            return False
        ants = json.loads(text).get("announcements")
        if ants is None:
            return False
        for ant in ants:
            item = {
                'SecuCode': ant.get('secCode'),
                'SecuAbbr': ant.get('secName'),
                'AntId': ant.get("announcementId"),
                'AntTitle': ant.get("announcementTitle"),
                'Category': category,
                # announcementTime is epoch milliseconds.
                'AntTime': datetime.datetime.fromtimestamp(ant.get("announcementTime") / 1000),
                'AntDoc': 'http://static.cninfo.com.cn/' + ant.get("adjunctUrl"),
            }
            self._spider_conn.table_insert(self.history_table_name, item)
        return True

    def start(self, start_date=None, end_date=None):
        '''Crawl finance announcements; the site caps each query at one year.

        :param start_date: window start (default: 10 days ago)
        :param end_date: window end (default: today)
        '''
        self._create_table()
        if start_date is None:
            start_date = datetime.datetime.today() - datetime.timedelta(days=10)
        if end_date is None:
            end_date = datetime.datetime.today()
        se_date = "{}~{}".format(start_date.strftime("%Y-%m-%d"),
                                 end_date.strftime("%Y-%m-%d"))
        print(se_date)
        # Finance categories: annual, semi-annual, Q1, Q3, earnings forecast.
        categories = (
            'category_ndbg_szsh',     # annual report
            'category_bndbg_szsh',    # semi-annual report
            'category_yjdbg_szsh',    # Q1 report
            'category_sjdbg_szsh',    # Q3 report
            'category_yjygjxz_szsh',  # earnings forecast
        )
        for page in range(10):
            print("page >> {}".format(page))
            for category in categories:
                post_data = self._build_post_data(page, se_date, category=category)
                if not self._store_page(post_data, category):
                    break
        # Earnings flashes have no dedicated category on the site: search by
        # keyword and tag rows with a custom category label.
        for page in range(10):
            print("page >> {}".format(page))
            post_data = self._build_post_data(page, se_date, searchkey='业绩快报')
            if not self._store_page(post_data, 'category_yjkb_szsh'):
                break
class JuchaoHistorySpider(object):
    """Juchao (cninfo) historical announcement spider."""

    def __init__(self):
        # Query endpoint and the fields persisted for each announcement.
        self.api = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
        self.fields = [
            'SecuCode', 'SecuAbbr', 'AntId', 'AntTime', 'AntTitle', 'AntDoc'
        ]
        # Browser-like headers; the endpoint rejects bare clients.
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.cninfo.com.cn',
            'Origin': 'http://www.cninfo.com.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        }
        self._spider_conn = Connection(
            host=SPIDER_MYSQL_HOST,
            port=SPIDER_MYSQL_PORT,
            user=SPIDER_MYSQL_USER,
            password=SPIDER_MYSQL_PASSWORD,
            database=SPIDER_MYSQL_DB,
        )
        self.history_table_name = 'juchao_ant'  # Juchao historical announcement table

    def start(self, start_date=None):
        """Crawl announcements from *start_date* (default: 10 days ago) up to
        today, paging until the server runs out of data."""
        if start_date is None:
            start_date = datetime.datetime.today() - datetime.timedelta(days=10)
        end_date = datetime.datetime.today()
        se_date = "{}~{}".format(start_date.strftime("%Y-%m-%d"),
                                 end_date.strftime("%Y-%m-%d"))
        print(se_date)
        for page in range(1000):
            print("page >> {}".format(page))
            payload = {
                'pageNum': page,
                'pageSize': 30,
                'column': 'szse',
                'tabName': 'fulltext',
                'plate': '',
                'stock': '',
                'searchkey': '',
                'secid': '',
                'category': '',
                'trade': '',
                'seDate': se_date,
                'sortName': '',
                'sortType': '',
                'isHLtitle': True,
            }
            resp = requests.post(self.api, headers=self.headers, data=payload, timeout=3)
            if resp.status_code != 200:
                # Non-200 responses are printed and the next page attempted.
                print(resp)
                continue
            body = resp.text
            if body == '':
                break
            notices = json.loads(body).get("announcements")
            if notices is None:
                break
            for notice in notices:
                # announcementTime is epoch milliseconds.
                stamp = notice.get("announcementTime") / 1000
                row = {
                    'SecuCode': notice.get('secCode'),
                    'SecuAbbr': notice.get('secName'),
                    'AntId': notice.get("announcementId"),
                    'AntTitle': notice.get("announcementTitle"),
                    'AntTime': datetime.datetime.fromtimestamp(stamp),
                    'AntDoc': 'http://static.cninfo.com.cn/' + notice.get("adjunctUrl"),
                }
                print(row)
                self._spider_conn.table_insert(self.history_table_name, row)