def sz_start(self): self._spider_init() for page in range(1, 8): logger.info("page is {}".format(page)) datas = self._make_sz_params(page) resp = requests.post(self.sz_url, headers=self.headers, data=datas) # print(resp) if resp.status_code == 200: ret = resp.text py_ret = json.loads(ret) announcements = py_ret.get("data") items = [] for a in announcements: # print(a) item = dict() item['market'] = 90 # 深交所 item['title'] = a.get("doctitle") item['link'] = a.get("docpuburl") item['time'] = self.trans_dt(a.get('docpubtime')) # eg. http://www.szse.cn/disclosure/notice/general/t20200430_576647.json content_json_url = urljoin("http://www.szse.cn", a.get("docpubjsonurl")) content = self.parse_json_content(content_json_url) item['content'] = content items.append(item) print(items) ret = self._batch_save(self.spider_client, items, self.announcement_table, self.firelds) else: self.error_pages.append(page)
def diff_task(): """对比两天清单差异的任务""" now = lambda: time.time() start_time = now() SzGener().start() logger.info( f"用时: {now() - start_time} 秒") # (end)大概是 80s (dispose)大概是 425s
def crawl(self): # (1)建表 self._create_table() # (2) 下载昨天和前天的数据文件 _today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min) _yester_day = _today - datetime.timedelta(days=1) print(_yester_day) _before_yester_day = _today - datetime.timedelta(days=2) print(_before_yester_day) _yester_file = self.load_xlsx(_yester_day) print(">>>> ") _before_yester_day_file = self.load_xlsx(_before_yester_day) print("<<<<") # (3) 将数据存入数据库 save_ret1 = self.read_xls(_yester_file, _yester_day) save_ret2 = self.read_xls(_before_yester_day_file, _before_yester_day) logger.info("入库结果: {} >>> {}, {} >>> {}".format( _yester_day, save_ret1, _before_yester_day, save_ret2)) local_str = "本地" if LOCAL else "远程" self.ding("{}: 深交所融资融券清单爬虫入库结果: {} >>{}, {} >>{} ".format( local_str, _yester_day, save_ret1, _before_yester_day, save_ret2))
def start(self): msg = '' local_str = "本地测试: " if LOCAL else "远程: " msg += local_str msg += '上交所数据生成:\n' _today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min) _yester_day = _today - datetime.timedelta(days=1) _before_yester_day = _today - datetime.timedelta(days=2) print(_yester_day, "***", _before_yester_day) msg += '时间点: {} 与 {}\n'.format(_yester_day, _before_yester_day) _type_str_map = { 10: "融资", 20: "融券", } for _type in (10, 20): logger.info(_type) # 昨日的清单 _yester_day_list = self.get_spider_dt_list(_yester_day, _type) # 前日的清单 _before_yester_day_list = self.get_spider_dt_list( _before_yester_day, _type) print(len(_yester_day_list), len(_before_yester_day_list)) if _yester_day and _before_yester_day: to_add = set(_yester_day_list) - set(_before_yester_day_list) to_delete = set(_before_yester_day_list) - set( _yester_day_list) logger.info("需新增数据: {}, 需删除数据: {}".format(to_add, to_delete)) msg += "{}: 需新增数据: {}, 需删除数据: {}\n".format( _type_str_map.get(_type), to_add, to_delete) if to_add: for one in to_add: # 数据 时间 融资融券类型 移入移出类型 self._update(one, _yester_day, _type, 1) if to_delete: for one in to_delete: self._update(one, _yester_day, _type, 0) msg += '一致性检查: \n' # 状态计算清单 dc_list_10 = set(sorted(self.product_dt_datas(83, 10))) # 真正清单 spider_list_10 = set(sorted(self.get_spider_latest_list(83, 10))) msg += "融资一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{} \n".format( dc_list_10 - spider_list_10, spider_list_10 - dc_list_10) dc_list_20 = set(sorted(self.product_dt_datas(83, 20))) spider_list_20 = set(sorted(self.get_spider_latest_list(83, 20))) msg += "融券一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{} \n".format( dc_list_20 - spider_list_20, spider_list_20 - dc_list_20) print(msg) self.ding(msg)
def start(self): self._create_table() self.sh_start() self.sz_start() logger.info(self.error_urls) logger.info(self.error_pages)
def get_spider_dt_list(self, dt, category): """获取爬虫库中具体某一天的清单""" self._spider_init() sql_dt = '''select max(ListDate) as mx from {} where ListDate <= '{}' and SecuMarket =83 and TargetCategory = {}; '''.format(self.spider_table_name, dt, category) dt_ = self.spider_client.select_one(sql_dt).get("mx") logger.info("距离 {} 最近的之前的一天是{}".format(dt, dt_)) if dt_: sql = '''select InnerCode from {} where ListDate = '{}' and SecuMarket = 83 and TargetCategory = {}; '''.format(self.spider_table_name, dt_, category) ret = self.spider_client.select_all(sql) ret = [r.get("InnerCode") for r in ret] return ret else: return []
def check2(): # 融券 latest_list_spider = self.dt_datas(_today, 2) latest_list_spider = set(sorted(latest_list_spider)) target = self._init_pool(self.product_cfg) sql = 'select InnerCode from {} where SecuMarket = 90 and TargetFlag = 1 and TargetCategory = 20; '.format( self.target_table_name) ret = target.select_all(sql) dc_list = set(sorted([r.get("InnerCode") for r in ret])) r1 = dc_list - latest_list_spider r2 = latest_list_spider - dc_list msg = "融券一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{}".format( r1, r2) logger.info(msg) return msg
def history_diff(self, dt1, dt2, type=1): """ 将历史数据中某两天的数据进行 diff dt1 是变更发生的时间 dt2 是变更发生的前一天 type: 1 是融资 2 是融券 """ data1 = self.dt_datas(dt1, type) data1 = set(sorted(data1)) data2 = self.dt_datas(dt2, type) data2 = set(sorted(data2)) to_add_set = data1 - data2 to_delete_set = data2 - data1 logger.info("要增加的标的: {}".format(to_add_set)) logger.info("要剔除的标的: {}".format(to_delete_set)) return to_add_set, to_delete_set
def _create_table(self): """对公告爬虫建表 """ self._spider_init() sql = ''' CREATE TABLE IF NOT EXISTS `{}` ( `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID', `market` int(11) DEFAULT NULL COMMENT '证券市场', `title` varchar(200) DEFAULT NULL COMMENT '公告标题', `link` varchar(200) DEFAULT NULL COMMENT '公告链接', `time` datetime NOT NULL COMMENT '公告发布时间', `content` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '公告内容', `keyword` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT '公告关键词', `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP, `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`id`), UNIQUE KEY `un2` (`market`, `link`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='交易所融资融券公告信息'; '''.format(self.announcement_table) self.spider_client.insert(sql) logger.info("爬虫公告表建表成功")
def start(self): msg = '' # 建表[远程没有建表的权限] if LOCAL: self._create_table() # 将聚源数据库的数据导出 if FIRST: self.load_juyuan() logger.info("已经导出聚源数据库") self.parse_announcemen_byhuman() _today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min) _yester_day = _today - datetime.timedelta(days=1) _before_yester_day = _today - datetime.timedelta(days=2) # print(_yester_day) # print(_before_yester_day) logger.info("开始处理融资数据") msg += self.gene_records(_yester_day, _before_yester_day, 1) msg += '\n' logger.info("开始处理融券数据") msg += self.gene_records(_yester_day, _before_yester_day, 2) msg += '\n\n' msg += self.monitor() # print(msg) self.ding(msg)
def _create_table(self): sql = ''' CREATE TABLE IF NOT EXISTS `{}` ( `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID', `SecuMarket` int(11) DEFAULT NULL COMMENT '证券市场', `InnerCode` int(11) NOT NULL COMMENT '证券内部编码', `InDate` datetime NOT NULL COMMENT '调入日期', `OutDate` datetime DEFAULT NULL COMMENT '调出日期', `TargetCategory` int(11) NOT NULL COMMENT '标的类别', `TargetFlag` int(11) DEFAULT NULL COMMENT '标的状态', `ChangeReasonDesc` varchar(2000) DEFAULT NULL COMMENT '变更原因描述', `UpdateTime` datetime NOT NULL COMMENT '数据源更新时间', `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP, `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, PRIMARY KEY (`id`), UNIQUE KEY `IX_MT_TargetSecurities` (`SecuMarket`, `InnerCode`,`TargetCategory`,`InDate`,`TargetFlag`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='融资融券标的证券变更记录'; '''.format(self.target_table_name) target = self._init_pool(self.product_cfg) target.insert(sql) target.dispose() logger.info("尝试建表")
def check1(): # 融资 latest_list_spider = self.dt_datas(_today, 1) latest_list_spider = set(sorted(latest_list_spider)) target = self._init_pool(self.product_cfg) sql = 'select InnerCode from {} where SecuMarket = 90 and TargetFlag = 1 and TargetCategory = 10; '.format( self.target_table_name) ret = target.select_all(sql) dc_list = set(sorted([r.get("InnerCode") for r in ret])) r1 = dc_list - latest_list_spider r2 = latest_list_spider - dc_list ''' {204806} set() ''' # delete from stk_mttargetsecurities where InnerCode = '204806'; msg = "融资一致性 check : dc_list - latest_list_spider >> {},latest_list_spider - dc_list>>{}".format( r1, r2) logger.info(msg) return msg
def gene_records(self, dt1, dt2, type): """ 生成数据库的一条 "融资" 变更记录 dt1 是较大的时间点 dt2 是较小的时间点 是 dt1 的前一天 """ fields = [ "SecuMarket", "InnerCode", "InDate", "OutDate", "TargetCategory", "TargetFlag", "ChangeReasonDesc" ] target = self._init_pool(self.product_cfg) to_add_set, to_delete_set = self.history_diff(dt1, dt2, type) logger.info("{} 和 {} 的 diff 结果: add: {} , delete: {}".format( dt1, dt2, to_add_set, to_delete_set)) local_str = "本地" if LOCAL else "远程" type_str = "融资" if type == 1 else "融券" msg = "深交所: {}:{} 变更记录生成: {} 和 {} 的 diff 结果: add: {} , delete: {}".format( local_str, type_str, dt1, dt2, to_add_set, to_delete_set) if to_add_set: for inner_code in to_add_set: if type == 1: item = { "SecuMarket": 90, "InnerCode": inner_code, "InDate": dt1, 'TargetCategory': 10, 'TargetFlag': 1, 'ChangeReasonDesc': '', 'UpdateTime': datetime.datetime.now(), } else: item = { "SecuMarket": 90, "InnerCode": inner_code, "InDate": dt1, 'TargetCategory': 20, 'TargetFlag': 1, 'ChangeReasonDesc': '', 'UpdateTime': datetime.datetime.now(), } count = self._save(target, item, self.target_table_name, fields) logger.info("type: {}, add 记录条数 {}".format(type, count)) if type == 1: base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory = 10 and TargetFlag = 1; ''' else: base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory = 20 and TargetFlag = 1; ''' if to_delete_set: for inner_code in to_delete_set: sql = base_sql.format(self.target_table_name, dt1, inner_code) ret = target.update(sql) logger.info("type: {}, update 记录条数是 {}".format(type, ret)) try: target.dispose() except: logger.warning("dispose error") raise return msg
def read_xls(self, file: str, dt: datetime.datetime): """ 读取单个时间点的文件 :param file: 文件路径 :param dt: 文件对应的日期 :return: """ wb = xlrd.open_workbook(file) detail = wb.sheet_by_name("融资融券标的证券信息") rows = detail.nrows - 1 logger.info("总数据量 {}".format(rows)) # 根据行数检测出没有数据的文件 if rows < 10: logger.warning("当日无数据更新 {}".format(dt)) # return True return '当日无数据更新' heads = detail.row_values(0) # print("表头信息", heads) # ['证券代码', '证券简称', '融资标的', '融券标的', '当日可融资', '当日可融券', '融券卖出价格限制'] fields = [ 'SecuMarket', 'SecuCode', 'InnerCode', 'SecuAbbr', 'SerialNumber', 'ListDate', 'FinanceBool', # 融资标的 'FinanceBuyToday', # 当日可融资 'SecurityBool', # 融券标的 'SecuritySellToday', # 当日可融券 'SecuritySellLimit', # 融券卖出价格限制 ] # list_date = datetime.datetime.strptime(str(dt), "%Y%m%d") items = [] for i in range(1, rows + 1): data = detail.row_values(i) # print(data) item = dict() item['SecuMarket'] = 90 # 深交所 secu_code = data[0] item['SecuCode'] = secu_code item['InnerCode'] = self.get_inner_code(secu_code) item['SecuAbbr'] = data[1] item['SerialNumber'] = i item['ListDate'] = dt item['FinanceBool'] = 1 if data[2] == "Y" else 0 # 融资标的 item['FinanceBuyToday'] = 1 if data[4] == "Y" else 0 # 当日可融资 item['SecurityBool'] = 1 if data[3] == 'Y' else 0 # 融券标的 item['SecuritySellToday'] = 1 if data[5] == "Y" else 0 # 当日可融券 item['SecuritySellLimit'] = 1 if data[6] == "Y" else 0 # 融券卖出价格限制 print(item) items.append(item) # client = self._init_pool(self.spider_cfg) for item in items: self.spider_conn.table_insert(self.history_table_name, item, fields) # self._save(client, item, self.history_table_name, fields) # try: # client.dispose() # except: # logger.warning("dispose error") # raise # return True return '数据更新成功'
def sz_history_task(): now = lambda: time.time() start_dt = now() SzListSpider().crawl() logger.info("耗时: {} 秒".format(now() - start_dt))
def parse_announcemen_byhuman(self): """从公告中提取更改信息 """ base_sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory in (10, 20) and TargetFlag = 1; ''' target = self._init_pool(self.product_cfg) # # (1) http://www.szse.cn/disclosure/notice/general/t20200415_575996.html # # 本所于2020年4月16日起将 南京华东电子信息科技股份有限公司股票(证券代码:000727) 调出融资融券标的证券名单 # inner_code = self.get_inner_code('000727') # print(inner_code) # 401 # (2) http://www.szse.cn/disclosure/notice/general/t20200428_576534.html # 本所于2020年4月29日起将 华映科技(集团)股份有限公司股票(证券代码:000536) 调出融资融券标的证券名单 inner_code = self.get_inner_code("000536") # 220 dt = datetime.datetime(2020, 4, 29) sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory in (10, 20) and TargetFlag = 1; '''.format( self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # (3) http://www.szse.cn/disclosure/notice/general/t20200429_576571.html # 本所于2020年4月30日起将 苏州胜利精密制造科技股份有限公司股票(证券代码:002426) 调出融资融券标的证券名单。 inner_code = self.get_inner_code('002426') # print(inner_code) # 10476 dt = datetime.datetime(2020, 4, 30) sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory in (10, 20) and TargetFlag = 1; '''.format( self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # (4) http://www.szse.cn/disclosure/notice/general/t20200429_576572.html # 本所于2020年4月30日起将 江西特种电机股份有限公司股票(证券代码:002176) 调出融资融券标的证券名单 dt = datetime.datetime(2020, 4, 30) inner_code = self.get_inner_code('002176') # print(inner_code) # 6139 sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory in (10, 20) and TargetFlag = 1; '''.format( self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # # (5) http://www.szse.cn/disclosure/notice/general/t20200430_576649.html # # 本所于2020年5月6日起将 深圳市奋达科技股份有限公司股票(证券代码:002681) 调出融资融券标的证券名单。 dt = datetime.datetime(2020, 5, 6) inner_code = self.get_inner_code('002681') # print(inner_code) # 16668 sql = '''update {} set OutDate = '{}', TargetFlag = 2 where SecuMarket = 90 and InnerCode = {}\ and TargetCategory in (10, 20) and TargetFlag = 1; '''.format( self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # # (6) http://www.szse.cn/disclosure/notice/general/t20200430_576648.html # # 本所于2020年5月6日起将 大连晨鑫网络科技股份有限公司股票(证券代码:002447) 调出融资融券标的证券名单 dt = datetime.datetime(2020, 5, 6) inner_code = self.get_inner_code("002447") # print(inner_code) # 10493 sql = base_sql.format(self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # (7) http://www.szse.cn/disclosure/notice/general/t20200430_576646.html # 本所于2020年5月6日起将 藏格控股股份有限公司股票(证券代码:000408) 调出融资融券标的证券名单。 dt = datetime.datetime(2020, 5, 6) inner_code = self.get_inner_code("000408") # print(inner_code) # 155 sql = base_sql.format(self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() # (8) http://www.szse.cn/disclosure/notice/general/t20200430_576647.html # 本所于2020年5月6日起将该 深圳市同洲电子股份有限公司股票(证券代码:002052) 调出融资融券标的证券名单。 dt = datetime.datetime(2020, 5, 6) inner_code = self.get_inner_code("002052") # print(inner_code) # 4347 sql = base_sql.format(self.target_table_name, dt, inner_code) ret = target.update(sql) print(ret) target.end() try: target.dispose() except: logger.info("dispose error") raise
SzGener().start() logger.info( f"用时: {now() - start_time} 秒") # (end)大概是 80s (dispose)大概是 425s if __name__ == "__main__": scheduler = BlockingScheduler() # 确保重启时可以执行一次 diff_task() scheduler.add_job(diff_task, 'cron', hour='16, 23', max_instances=10, id="diff_task") logger.info( 'Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')) try: scheduler.start() except (KeyboardInterrupt, SystemExit): pass except Exception as e: logger.info(f"本次任务执行出错{e}") sys.exit(0) '''部署 docker build -f Dockerfile_szdiff -t registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 . docker push registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 sudo docker pull registry.cn-shenzhen.aliyuncs.com/jzdev/jzdata/margin_sz_diff:v1 # remote sudo docker run --log-opt max-size=10m --log-opt max-file=3 -itd \ --env LOCAL=0 \