class Spider(MainSpider): def __init__(self): self.task_id = "shanxi" self.site_name = "山西法院诉讼服务网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"channelId": "307", "listsize": "238", "pagego": "1"} url = "http://www.shanxify.gov.cn/ktggPage.jspx" log.info("开始抓取==============山西法院诉讼服务网") log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(str( form['pagego']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============山西法院诉讼服务网,第{}页".format( str(form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["pagego"] = i log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info( "开始存储==============抓取山西法院诉讼服务网,第{}页".format(i)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(i), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(str(form['pagego'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取山西法院诉讼服务网结束") def parse_html(self, html): # 解析html doc = pq(html) total_page = int(doc('a.zt_02').text()[-3:]) lis = doc('div.text ul li a').items() # 创建对象列表 object_list = list() for x in lis: # 创建item字典 item = dict() self.http.http_session(x.attr.href, "post", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 将获取的html写入文件 file_out(t_way, str(htm)) content = doc('div.text') item["taskid"] = self.task_id item["release_date"] = content('h2').text()[3:13] item["title"] = content('h1').text() item["bulletin_way"] = t_way item["court_y"] = "".join( re.findall("(在.*院)", content('h1').text())).replace("在", "") item["court_t"] = "".join( re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace( "开庭", "") item["start_court_t"] = x.text()[:16] if u"刑事" in item["title"]: item["defendant"] = "".join( re.findall("(审理.*)", content('p').text().replace("\xa0\xa0", ""))).replace( "审理", "") else: item["plaintiff"] = "".join( re.findall("(审理.*诉)", content('p').text().replace( "\xa0\xa0", ""))).replace("审理", "").replace("诉", "") item["defendant"] = "".join( re.findall("(诉.*等)", content('p').text().replace( "\xa0\xa0", ""))).replace("诉", "").replace("等", "") item['site_name'] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "sanxi" self.site_name = "陕西法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): page = 1 url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, page)) self.http.http_requst(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() # for i in range(2, int(total_page) + 1): try: page = i url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取=============={},第{}页".format( self.site_name, page)) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, page), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, page), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) page = doc('div.paginationControl a').eq(5).attr.href total_page = "".join(re.findall("\d{1,3}", page)) lis = doc('span.left').items() object_list = list() for x in lis: self.http.http_session("http://sxfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) content = doc('div.detail') # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(content)) item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", content('div.sth_a').text())) item["title"] = content('div.b_title').text() item["bulletin_way"] = t_way item["court_y"] = "陕西省高级人民法院" item["court_t"] = "".join( re.findall("(在.{1,10}公开)", content('div').text())).replace("在", "").replace( "公开", "") item["court_part"] = "".join( re.findall("(在.{1,10}公开)", content('div').text())).replace("在", "").replace( "公开", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "guizhou" self.site_name = "贵州法院公众服务平台" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""} url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx" log.info("开始抓取==============贵州法院公众服务平台") log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["page"] = i log.info("开始抓取==============贵州法院公众服务平台,第{}页".format( str(form['page']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取贵州法院公众服务平台结束") def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) for page in doc('a').items(): if page.text() == "末页": total_page = "".join(re.findall("\d{1,3}", page.attr.onclick)) lis = doc('table.tabData a').items() object_list = list() for x in lis: self.http.http_session(x.attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.print-box') item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("发表日期:20\d{2}-\d{1,2}-\d{1,2}", content.text())).replace("发表日期:", "") item["title"] = x.attr.title item["bulletin_way"] = t_way item["court_y"] = content('h3').text() item["court_t"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["start_court_t"] = "".join( re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})", content('p').text())).replace("年", "-").replace( "月", "-").replace("日", "").replace("时", ":") item["court_num"] = "".join( re.findall("(审理.*案件)", content('p').text())).replace("审理", "").replace("案件", "") item["court_part"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 # break return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "sichuan" self.site_name = "四川法院司法公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""} url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['page']))) self.http.set_charset("unicode") self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["page"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 # break self.mysql_client.session_close() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['page']))) object_list = list() case_list = json_data["data"] for case in case_list: if "开庭公告" in html.unescape(case["ggbt"]): item = dict() item["release_date"] = case["clsj"] formdata = { "ggsdid": "{}".format(str(case['ggsdid'])), "ssfy": "{}".format(str(case['fydm'])) } ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do" self.http.http_session(ur, "post", data=formdata, headers=self.headers) json = self.http.parse_json()["data"] item["taskid"] = self.task_id item["release_date"] = html.unescape(json.get("CLSJ")) item["title"] = html.unescape(json.get("GGBT")) item["court_y"] = get_content(json.get("SSFYMC")) # 法院 content = html.unescape(json.get("GGNR")) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(content)) item["court_t"] = "".join(re.findall("法院.{1,10}庭", content)).replace( "法院", "") item["court_num"] = html.unescape(json.get("AH")) # 案号 item["trial_cause"] = html.unescape( json.get("CBRXM").strip()) # 审判人员 item['bulletin_way'] = t_way item["site_name"] = self.site_name b = BulletinCourt(**item) object_list.append(b) return object_list
class Spider(MainSpider): def __init__(self): self.task_id = "qinghai" self.site_name = "青海法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers self.http.set_charset("gbk") def parse(self): form = { "p": "1", "LocationID": "0700000000", "sub": "" } url = "http://qhfy.chinacourt.org/fygg/index.php" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['p']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page)+1): try: form["p"] = i log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['p']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取{},第{}页异常".format(self.site_name, ( form['p'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) # print(doc("td.td_pagebar").text()) total_page = "".join(re.findall("共\s.*\s页", doc("td.td_pagebar").text())).replace( "共", "").replace("页", "").strip() lis = doc('td.td_line').items() object_list = list() for x in lis: if "开庭" in x.text(): self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) content = doc item = dict() item["taskid"] = self.task_id item["release_date"] = "".join(re.findall("\d{4}-\d{2}-\d{2}", content("p").text())) item["title"] = x.text() t_way = self.task_id + str(time.time()) + '.txt' item["bulletin_way"] = t_way item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text())) item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text()) ).replace("在", "").replace("公开", "").replace("依法", "") # item["start_court_t"] = "".join(re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title)) item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text()) ).replace("在", "").replace("公开", "").replace("依法", "") item["site_name"] = self.site_name # print(item) if eval(item["release_date"].replace("-", "")) > eval("20180101"): file_out(t_way, str(htm)) # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "hunan" self.site_name = "湖南法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): page = 1 url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取==============湖南法院网") log.info("开始抓取==============湖南法院网,第{}页".format(page)) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============山西法院诉讼服务网,第{}页".format(page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): page = i try: log.info("开始抓取==============湖南法院网,第{}页".format(page)) url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format( page) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info( "开始存储==============山西法院诉讼服务网,第{}页".format(page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取湖南法院网结束") def parse_html(self, html): # 解析html doc = pq(html) page_lis = doc('a').items() for pag in page_lis: if pag.text() == "尾页": total_page = "".join(re.findall("(\d*.shtml)", pag.attr.href)).replace( ".shtml", "") lis = doc('div.font14 li').items() # 创建对象列表 object_list = list() for x in lis: # 创建item字典 item = dict() item["release_date"] = x('span.right').text() self.http.http_session("http://hunanfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 将获取的html写入文件 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.detail') item["taskid"] = self.task_id item["title"] = content('div.detail_bigtitle').text() item["court_y"] = "".join( re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "") item["court_t"] = "".join( re.findall("刑.*庭", content('div.detail_txt').text())) item["start_court_t"] = "".join( re.findall("本院定于\d{4}年.{1,5}日", content('div.detail_txt').text())).replace( "年", "-").replace("月", "-").replace("日", "").replace( "本院定于", "") item["court_num"] = "".join( re.findall("审理.*号", content('div.detail_txt').text())).replace( "审理", "") item["trial_cause"] = "".join( re.findall("合议庭成员.*\s", content('div.detail_txt').text())).replace( "合议庭成员:", "").replace("\n", "") item["court_part"] = "".join( re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "") item['site_name'] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "guangdong" self.site_name = "广东法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "action": "gotoggxxcx", "gglx": "ktgg", "flag": "first" } url = "http://www.gdcourts.gov.cn/web/search" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['flag']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['flag']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(html.encode("utf8"))) doc = pq(html) lis = doc('div.doclist tr').items() object_list = list() x_lis = list() for x in lis: x_lis.append(x) text_lis = list() for i in x_lis[1:]: text_lis = list() for text in i('td').items(): text_lis.append(text.text()) item = dict() item["taskid"] = self.task_id item["bulletin_way"] = t_way item["court_num"] = text_lis[0] item["court_pur"] = text_lis[1] item["court_part"] = text_lis[2] item["start_court_t"] = text_lis[3] item["court_end_t"] = text_lis[4] item["court_status"] = text_lis[5] item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # # 返回对象列表和总页数 return object_list
class Spider(MainSpider): def __init__(self): self.task_id = "jiangxi" self.site_name = "江西庭审公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): date = get_today_date() form = { 'isGeneral': 'Y', 'belongOrgId': '', 'liveStatus': '001', 'page.pageSize': '20', 'page.pageNo': '1', 'gopenCourtDate': date + ' 00:00:00', 'page.orderBy': 'openCourtDate', 'page.order': 'asc', 'caseType': '', 'searchWord': '' } url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action" log.info("开始抓取==============江西庭审公开网") log.info("开始抓取==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["page.pageNo"] = i log.info("开始抓取==============江西庭审公开网,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json江西庭审公开网,第{}页异常".format( str(form['page.pageNo'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException( "抓取json江西庭审公开网,第{}页异常".format(str(form['page.pageNo'])), self.task_id, url, self.site_name) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["message"]["result"] for case in case_list: item = dict() item["taskid"] = self.task_id item["release_date"] = case.get("lastBroadcastTimeString") # 发布日期 item["title"] = get_content(case.get("caseName")) # 标题 item["court_y"] = get_content(case.get("belongOrgName")) # 法院 item["court_t"] = get_content(case.get("openCourtAddr")) # 法庭 item["start_court_t"] = get_content( case.get("openCourtDateString")) # 开庭日期 item["court_num"] = get_content(case.get("caseNo")) # 案号 item["case_type"] = get_content(case.get("caseTypeString")) # 案件类型 item["court_case"] = get_content( case.get("causePlacedOnFile")) # 案由 item["trial_cause"] = get_content( case.get("underJustice")).strip() # 审判人员 try: dex = case["litigants"].index("被告:") item["plaintiff"] = case["litigants"][:dex].replace( "原告:", "")[:-1] # 原告 item["defendant"] = case["litigants"][dex:].replace("被告:", "") # 被告 except: item["plaintiff"] = "" item["defendant"] = case.get("litigants") item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["message"]["totalPages"] return int(total_page) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0
class Spider(MainSpider): def __init__(self): self.task_id = "yunan" self.site_name = "云南法院司法信息网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"channelId": "858", "listsize": "673", "pagego": "1"} url = "http://www.ynfy.gov.cn/ktggPage.jspx" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['pagego']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() form["listsize"] = total_page for i in range(2, int(total_page) + 1): try: form["pagego"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['pagego']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) total_page = 10 for page in doc('div.turn_page a.zt_02').items(): if int(page.text()) > total_page: total_page = int(page.text()) lis = doc('ul.sswy_news li').items() object_list = list() for x in lis: self.http.http_session(x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.ywzw_con_inner') item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("\d{4}-\d{2}-\d{2}", content('p.p_source ').text())) item["title"] = x('a').attr.title item["bulletin_way"] = t_way item["court_y"] = content('h3.h3_title').text() item["court_t"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["start_court_t"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title)) item["court_part"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 # break return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "hubei" self.site_name = "湖北省高级人民法院" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"folderNo": "0401", "pageIndex": "1"} url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder" log.info("开始抓取==============湖北省高级人民法院") log.info("开始抓取==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["pageIndex"] = i log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取湖北省高级人民法院结束") def added_parse(self): pass def parse_html(self, html): # 解析html # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(html)) doc = pq(html) total_page = "".join( re.findall("共.*页\s上", doc('span').text().replace("\n", "")))[1:3] lis = doc('table.newlisttable tr').items() object_list = list() for content in lis: item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("(\(.*\))", content('td').text()))[1:-1] item["title"] = content('a').text() item["bulletin_way"] = t_way item["court_y"] = "湖北省高级人民法院" if content( 'p').text()[:4] == "本院定于" else content('p').text()[:4] item["court_t"] = "".join( re.findall("(在.*判庭)", content('p').text())).replace("在", "") item["start_court_t"] = "".join( re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})", content('p').text())).replace("年", "-").replace( "月", "-").replace("日", "") item["plaintiff"] = "".join( re.findall("(原告:.*;)", content('p').text())).replace("原告:", "") item["defendant"] = "".join( re.findall("(被告:.*的)", content('p').text())).replace("被告:", "").replace("的", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "gansu" self.site_name = "甘肃省高级人民法院司法公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "channelId": "307", "listsize": "100", "pagecur": "0", "pagego": "add" } url = "http://gsgf.gssfgk.com/ktggPage.jspx" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['pagecur']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagecur']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() form["listsize"] = total_page for i in range(1, int(total_page) + 1): try: form["pagecur"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['pagecur']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagecur']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) page_list = doc('a.zt_02').items() total_page = 10 for page in page_list: if int(page.text()) > total_page: total_page = int(page.text()) lis = doc('div.text ul li a').items() object_list = list() for x in lis: item = dict() self.http.http_session(x.attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.text') item["taskid"] = self.task_id item["release_date"] = content('h2').text()[3:13] item["title"] = content('h1').text() item["bulletin_way"] = t_way item["court_y"] = "".join( re.findall("(在.*法院)", content('h1').text())).replace("在", "") item["court_t"] = "".join( re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace( "开庭", "") item["start_court_t"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text())) item["plaintiff"] = "".join( re.findall("(审理.*诉)", content("p").text())).replace("审理", "").replace("诉", "") item["site_name"] = self.site_name date = get_today_date() if eval("".join(re.findall("\d{4}-\d{2}-\d{2}", x.text())).replace( "-", "")) > eval(date): # 生成文件路径 file_out(t_way, str(htm)) # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "zhejiang" self.site_name = "浙江法院公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "pageno": "1", "pagesize": "10", "cbfy": "全部", "dsr": "", "spz": "", "jarq1": "", "jarq2": "" } url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['pageno']))) self.http.http_session(url, "post", data=form, headers=self.headers) # if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['pageno']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["pageno"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['pageno']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['pageno']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["list"] for case in case_list: item = dict() item["taskid"] = self.task_id item["court_y"] = get_content(case.get("FY")) # 法院 item["court_t"] = get_content(case.get("FT")) # 法庭 item["start_court_t"] = get_content(case.get("KTRQSTRING")) # 开庭日期 item["court_num"] = get_content(case.get("AH")) # 案号 item["court_case"] = get_content(case.get("AY")) # 案由 item["trial_cause"] = get_content(case.get("SPZ")).strip() # 审判人员 item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way item["undertake_dep"] = get_content(case.get("CBBM")) item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "") item["defendant"] = get_content(case.get("BG")).replace("被告:", "") item["schedule_time"] = get_content(case.get("PQRQ")) b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["total"] return int(total_page) // 10 except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0
class Spider(MainSpider): def __init__(self): self.task_id = "fujian" self.site_name = "福建省高级人民法院法院公告" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): url = "https://www.fjcourt.gov.cn/page/public/courtreport.html" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, 1)) self.http.http_requst(url, "get", headers=self.headers, verify=False) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page, VIEWSTATE = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, 1)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): form = { "__VIEWSTATE": VIEWSTATE, "__VIEWSTATEGENERATOR": "54969BDC", "__EVENTTARGET": "ctl00$cplContent$AspNetPager1", } try: form["__EVENTARGUMENT"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['__EVENTARGUMENT']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page, VIEWSTATE = self.parse_html( html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['__EVENTARGUMENT']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['__EVENTARGUMENT'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, 1), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): doc = pq(html) total_page = 10 for page in doc('a.pagination').items(): if page.text() == ">>": total_page = int("".join(re.findall("\d{2,3}", page.attr.href))) VIEWSTATE = doc("div.aspNetHidden input").attr.value lis = doc('ul.module-case-items li').items() object_list = list() for x in lis: self.http.http_session("https://www.fjcourt.gov.cn" + x('a').attr.href, "get", headers=self.headers, verify=False) htm = self.http.parse_html() doc = pq(htm) # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) content = doc('div.article-wrap') item = dict() item["taskid"] = self.task_id item["title"] = content('p.article-hd-title').text() item["bulletin_way"] = t_way item["court_y"] = content('span.article-author').text() item["court_t"] = "".join( re.findall("(在.*公开)", content('div.article-content').text())).replace( "在", "").replace("公开", "") item["start_court_t"] = x('span.cir-time').text().replace( "[", "").replace("]", "") item["court_part"] = "".join( re.findall("(在.*公开)", content('div.article-content').text())).replace( "在", "").replace("公开", "") item["site_name"] = self.site_name pub_time = (item["start_court_t"].replace("-", "")) date = get_today_date() if eval(pub_time) > eval(date): # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page, VIEWSTATE
class Spider(MainSpider): def __init__(self): self.task_id = "hainan" self.site_name = "天涯法律网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): today_date = get_today_date() next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:] form = { "currentPageNo": "1", "pageSize": "10", "startDate": today_date, "endDate": next_year_today_date, "caseNo": "", "litigant": "", "judge": "", "caseDesc": "", "siteId": "f7afc746-8577-4cd4-a410-884027df5bab" } url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.http.http_session(url, "post", data=form, headers=self.headers) # if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["currentPageNo"] = i log.info("开始抓取=============={},第{}页".format(self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['currentPageNo']) ), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['page.pageNo']) ), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["data"] for case in case_list: item = dict() item["taskid"] = self.task_id item["release_date"] = get_content(case.get("createDate")) item["court_y"] = get_content(case.get("belongOrgName")) # 法院 item["court_t"] = get_content(case.get("trialCourt")) # 法庭 item["start_court_t"] = get_content(case.get("courtTime")) # 开庭日期 item["court_num"] = get_content(case.get("caseNo")) # 案号 item["court_case"] = get_content(case.get("caseDesc")) # 案由 item["trial_cause"] = get_content(case.get("judge")).strip() # 审判人员 item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["pages"] return int(total_page) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0