import html
import re
import time
import traceback

# MainSpider, HttpRequest, headers, log, SpiderException, BulletinCourt,
# get_content, file_out and get_today_date are project-internal helpers
# assumed to be importable from the surrounding crawler framework.


class Spider(MainSpider):

    def __init__(self):
        self.task_id = "sichuan"
        self.site_name = "四川法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""}
        url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list"
        log.info("Start crawling {}".format(self.site_name))
        log.info("Start crawling {}, page {}".format(self.site_name, form['page']))
        self.http.set_charset("unicode")
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("Start storing {}, page {}".format(self.site_name, form['page']))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page"] = i
                    log.info("Start crawling {}, page {}".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("Start storing {}, page {}".format(self.site_name, form['page']))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "Error fetching JSON from {}, page {}".format(
                                self.site_name, form['page']),
                            self.task_id, url, self.site_name)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going live.
                # break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "Error fetching JSON from {}, page {}".format(
                    self.site_name, form['page']),
                self.task_id, url, self.site_name)
        log.info("Finished crawling {}".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON list page; keep only hearing
        # announcements ("开庭公告") and fetch each one's detail record.
        log.info("Start parsing {}, page {}".format(self.site_name, form['page']))
        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" in html.unescape(case["ggbt"]):
                item = dict()
                formdata = {
                    "ggsdid": str(case['ggsdid']),
                    "ssfy": str(case['fydm'])
                }
                ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
                self.http.http_session(ur, "post", data=formdata, headers=self.headers)
                detail = self.http.parse_json()["data"]  # detail payload for this announcement
                item["taskid"] = self.task_id
                item["release_date"] = html.unescape(detail.get("CLSJ"))
                item["title"] = html.unescape(detail.get("GGBT"))
                item["court_y"] = get_content(detail.get("SSFYMC"))  # court
                content = html.unescape(detail.get("GGNR"))
                t_way = self.task_id + str(time.time()) + '.txt'
                file_out(t_way, str(content))
                item["court_t"] = "".join(
                    re.findall("法院.{1,10}庭", content)).replace("法院", "")  # tribunal
                item["court_num"] = html.unescape(detail.get("AH"))  # case number
                item["trial_cause"] = html.unescape(detail.get("CBRXM").strip())  # judge
                item['bulletin_way'] = t_way
                item["site_name"] = self.site_name
                b = BulletinCourt(**item)
                object_list.append(b)
        return object_list
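    # parse() above calls self.get_total_page(), but this spider's original
    # source omitted the method. A minimal sketch follows, assuming the list
    # response reports a total record count under a "count" key (hypothetical;
    # the real payload may differ) and pages of `limit` = 9 records, mirroring
    # the get_total_page helpers in the sibling spiders below.
    def get_total_page(self, json_data):
        try:
            total = int(json_data["count"])  # "count" is an assumed key
            # Ceil-divide by the page size so a final partial page survives.
            return -(-total // 9)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0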
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "zhejiang"
        self.site_name = "浙江法院公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        form = {
            "pageno": "1",
            "pagesize": "10",
            "cbfy": "全部",
            "dsr": "",
            "spz": "",
            "jarq1": "",
            "jarq2": ""
        }
        url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch"
        log.info("Start crawling {}".format(self.site_name))
        log.info("Start crawling {}, page {}".format(self.site_name, form['pageno']))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("Start storing {}, page {}".format(self.site_name, form['pageno']))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["pageno"] = i
                    log.info("Start crawling {}, page {}".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("Start storing {}, page {}".format(self.site_name, form['pageno']))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "Error fetching JSON from {}, page {}".format(
                                self.site_name, form['pageno']),
                            self.task_id, url, self.site_name)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "Error fetching JSON from {}, page {}".format(
                    self.site_name, form['pageno']),
                self.task_id, url, self.site_name)
        log.info("Finished crawling {}".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON list page.
        log.info("Start parsing {}, page {}".format(self.site_name, form['pageno']))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["list"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["court_y"] = get_content(case.get("FY"))  # court
            item["court_t"] = get_content(case.get("FT"))  # tribunal
            item["start_court_t"] = get_content(case.get("KTRQSTRING"))  # hearing date
            item["court_num"] = get_content(case.get("AH"))  # case number
            item["court_case"] = get_content(case.get("AY"))  # cause of action
            item["trial_cause"] = get_content(case.get("SPZ")).strip()  # judges
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            item["undertake_dep"] = get_content(case.get("CBBM"))  # undertaking department
            item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "")
            item["defendant"] = get_content(case.get("BG")).replace("被告:", "")
            item["schedule_time"] = get_content(case.get("PQRQ"))
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # Total pages: "total" is the record count and pagesize is 10,
        # so ceil-divide to keep a final partial page.
        try:
            total = int(json_data["total"])
            return -(-total // 10)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "jiangxi"
        self.site_name = "江西庭审公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        date = get_today_date()
        form = {
            'isGeneral': 'Y',
            'belongOrgId': '',
            'liveStatus': '001',
            'page.pageSize': '20',
            'page.pageNo': '1',
            'gopenCourtDate': date + ' 00:00:00',
            'page.orderBy': 'openCourtDate',
            'page.order': 'asc',
            'caseType': '',
            'searchWord': ''
        }
        url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action"
        log.info("Start crawling {}".format(self.site_name))
        log.info("Start crawling {}, page {}".format(self.site_name, form['page.pageNo']))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("Start storing {}, page {}".format(self.site_name, form['page.pageNo']))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page.pageNo"] = i
                    log.info("Start crawling {}, page {}".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("Start storing {}, page {}".format(
                            self.site_name, form['page.pageNo']))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "Error fetching JSON from {}, page {}".format(
                                self.site_name, form['page.pageNo']),
                            self.task_id, url, self.site_name)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "Error fetching JSON from {}, page {}".format(
                    self.site_name, form['page.pageNo']),
                self.task_id, url, self.site_name)

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON list page.
        log.info("Start parsing {}, page {}".format(self.site_name, form['page.pageNo']))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["message"]["result"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = case.get("lastBroadcastTimeString")  # release date
            item["title"] = get_content(case.get("caseName"))  # title
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("openCourtAddr"))  # tribunal
            item["start_court_t"] = get_content(case.get("openCourtDateString"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["case_type"] = get_content(case.get("caseTypeString"))  # case type
            item["court_case"] = get_content(case.get("causePlacedOnFile"))  # cause of action
            item["trial_cause"] = get_content(case.get("underJustice")).strip()  # judges
            try:
                # litigants arrive as a single "原告:...被告:..." string;
                # split on the defendant marker.
                dex = case["litigants"].index("被告:")
                item["plaintiff"] = case["litigants"][:dex].replace("原告:", "")[:-1]
                item["defendant"] = case["litigants"][dex:].replace("被告:", "")
            except (KeyError, ValueError, AttributeError):
                # Marker missing or litigants absent: store the raw value.
                item["plaintiff"] = ""
                item["defendant"] = case.get("litigants")
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # Total number of pages, as reported by the API.
        try:
            total_page = json_data["message"]["totalPages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "hainan"
        self.site_name = "天涯法律网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        today_date = get_today_date()
        # Search window: from today to the same date next year.
        next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:]
        form = {
            "currentPageNo": "1",
            "pageSize": "10",
            "startDate": today_date,
            "endDate": next_year_today_date,
            "caseNo": "",
            "litigant": "",
            "judge": "",
            "caseDesc": "",
            "siteId": "f7afc746-8577-4cd4-a410-884027df5bab"
        }
        url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList"
        log.info("Start crawling {}".format(self.site_name))
        log.info("Start crawling {}, page {}".format(self.site_name, form['currentPageNo']))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("Start storing {}, page {}".format(self.site_name, form['currentPageNo']))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["currentPageNo"] = i
                    log.info("Start crawling {}, page {}".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("Start storing {}, page {}".format(
                            self.site_name, form['currentPageNo']))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "Error fetching JSON from {}, page {}".format(
                                self.site_name, form['currentPageNo']),
                            self.task_id, url, self.site_name)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "Error fetching JSON from {}, page {}".format(
                    self.site_name, form['currentPageNo']),
                self.task_id, url, self.site_name)
        log.info("Finished crawling {}".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON list page.
        log.info("Start parsing {}, page {}".format(self.site_name, form['currentPageNo']))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = get_content(case.get("createDate"))  # release date
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("trialCourt"))  # tribunal
            item["start_court_t"] = get_content(case.get("courtTime"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["court_case"] = get_content(case.get("caseDesc"))  # cause of action
            item["trial_cause"] = get_content(case.get("judge")).strip()  # judge
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # Total number of pages, as reported by the API.
        try:
            total_page = json_data["pages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0