def parse(self):
    """Crawl the Xinjiang court litigation-service site.

    Fetches page 1, then pages 2..p_total-1; for each list entry fetches the
    detail page, dumps the raw html to a timestamped task file, and persists
    one BulletinCourt row per entry via the mysql session.
    """
    log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get',
                           headers=self.http.headers)
    r = self.http.parse_html()
    log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    # enumerate() instead of p_list.index(p): constant-time per item and
    # correct even when two entries compare equal.
    for strip, p in enumerate(p_list, start=1):
        d_url = p['det_url']
        log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(
            page='1', strip=str(strip)))
        self.http.http_session(d_url, 'get', headers=self.http.headers)
        det_mess = self.http.parse_html()
        log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(
            page='1', strip=str(strip)))
        self.parse_info(det_mess, p)
        t_way = self.taskid + str(time.time()) + '.txt'
        file_out(t_way, p['html'])
        p['bulletin_way'] = t_way
        p.pop('det_url')
        p.pop('html')
        p['taskid'] = self.taskid
        b = BulletinCourt(**p)
        b_list.append(b)
    log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    # NOTE(review): range(2, p_total) never fetches page p_total itself —
    # confirm page_total() returns count+1, otherwise the last page is skipped.
    for total in range(2, p_total):
        try:
            log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
            self.http.http_session(self.url.format(page=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            # log text fixed: original read '解析重新疆法院…'
            log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for strip, p in enumerate(p_list, start=1):
                d_url = p['det_url']
                # log text fixed: original read '开始重新疆法院…'
                log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(
                    page=str(total), strip=str(strip)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(
                    page=str(total), strip=str(strip)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
    self.mysql_client.session_close()
    log.info('抓取新疆法院诉讼服务网结束')
def parse_html(self, html):
    """Parse the hearing-schedule table, dump the raw page to a task file,
    and return a list of BulletinCourt objects (one per data row)."""
    # build the dump-file path and write the raw page
    t_way = self.task_id + str(time.time()) + '.txt'
    file_out(t_way, str(html.encode("utf8")))
    doc = pq(html)
    # list(...) replaces the original manual append-copy loop
    rows = list(doc('div.doclist tr').items())
    object_list = list()
    for row in rows[1:]:  # rows[0] is the table header
        # one comprehension instead of re-initializing and appending per cell
        text_lis = [cell.text() for cell in row('td').items()]
        item = dict()
        item["taskid"] = self.task_id
        item["bulletin_way"] = t_way
        item["court_num"] = text_lis[0]
        item["court_pur"] = text_lis[1]
        item["court_part"] = text_lis[2]
        item["start_court_t"] = text_lis[3]
        item["court_end_t"] = text_lis[4]
        item["court_status"] = text_lis[5]
        item["site_name"] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list
    return object_list
def parse_list(self, json_data, form):
    """Parse one page of the hearing-announcement JSON feed.

    json_data: decoded JSON whose "list" key holds the case dicts.
    form: request form data; form['pageno'] is used only for logging.
    Returns a list of BulletinCourt objects; the raw JSON is dumped to a
    timestamped task file whose path is stored in each row's bulletin_way.
    """
    log.info("开始解析{}第{}页".format(self.site_name, (form['pageno'])))
    t_way = self.task_id + str(time.time()) + '.txt'
    file_out(t_way, str(json_data))
    object_list = list()
    case_list = json_data["list"]
    for case in case_list:
        item = dict()
        item["taskid"] = self.task_id
        item["court_y"] = get_content(case.get("FY"))  # court
        item["court_t"] = get_content(case.get("FT"))  # tribunal
        item["start_court_t"] = get_content(case.get("KTRQSTRING"))  # hearing date
        item["court_num"] = get_content(case.get("AH"))  # case number
        item["court_case"] = get_content(case.get("AY"))  # cause of action
        item["trial_cause"] = get_content(case.get("SPZ")).strip()  # judges
        item["site_name"] = self.site_name  # site name
        item['bulletin_way'] = t_way
        item["undertake_dep"] = get_content(case.get("CBBM"))  # handling department
        item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "")
        item["defendant"] = get_content(case.get("BG")).replace("被告:", "")
        item["schedule_time"] = get_content(case.get("PQRQ"))  # scheduling date
        b = BulletinCourt(**item)
        object_list.append(b)
    return object_list
def parse_html(self, html):
    """Parse the Hunan court list page, fetch every detail page, and return
    (BulletinCourt list, total page count extracted from the '尾页' link)."""
    doc = pq(html)
    page_lis = doc('a').items()
    # fallback default: without it a page lacking a '尾页' link raised
    # NameError at the final return
    total_page = "1"
    for pag in page_lis:
        if pag.text() == "尾页":
            total_page = "".join(re.findall("(\d*.shtml)", pag.attr.href)).replace(
                ".shtml", "")
    lis = doc('div.font14 li').items()
    # build the object list
    object_list = list()
    for x in lis:
        item = dict()
        item["release_date"] = x('span.right').text()
        self.http.http_session("http://hunanfy.chinacourt.org" + x('a').attr.href,
                               "get", headers=self.headers)
        htm = self.http.parse_html()
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        doc = pq(htm)
        content = doc('div.detail')
        item["taskid"] = self.task_id
        # fix: bulletin_way was never set, so the dump-file path above was
        # written but never recorded on the DB row
        item["bulletin_way"] = t_way
        item["title"] = content('div.detail_bigtitle').text()
        item["court_y"] = "".join(
            re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "")
        item["court_t"] = "".join(
            re.findall("刑.*庭", content('div.detail_txt').text()))
        item["start_court_t"] = "".join(
            re.findall("本院定于\d{4}年.{1,5}日", content('div.detail_txt').text())).replace(
            "年", "-").replace("月", "-").replace("日", "").replace("本院定于", "")
        item["court_num"] = "".join(
            re.findall("审理.*号", content('div.detail_txt').text())).replace("审理", "")
        item["trial_cause"] = "".join(
            re.findall("合议庭成员.*\s", content('div.detail_txt').text())).replace(
            "合议庭成员:", "").replace("\n", "")
        item["court_part"] = "".join(
            re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "")
        item['site_name'] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_html(self, html):
    """Parse a list page, fetch each detail page by POST, and return
    (BulletinCourt list, total page count)."""
    doc = pq(html)
    # total page count is the trailing digits of the 'a.zt_02' pager text
    total_page = int(doc('a.zt_02').text()[-3:])
    lis = doc('div.text ul li a').items()
    # build the object list
    object_list = list()
    for x in lis:
        item = dict()
        self.http.http_session(x.attr.href, "post", headers=self.headers)
        htm = self.http.parse_html()
        doc = pq(htm)
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        content = doc('div.text')
        item["taskid"] = self.task_id
        item["release_date"] = content('h2').text()[3:13]
        item["title"] = content('h1').text()
        item["bulletin_way"] = t_way
        item["court_y"] = "".join(
            re.findall("(在.*院)", content('h1').text())).replace("在", "")
        item["court_t"] = "".join(
            re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace(
            "开庭", "")
        item["start_court_t"] = x.text()[:16]
        # criminal cases have only a defendant; civil cases have both parties
        if u"刑事" in item["title"]:
            item["defendant"] = "".join(
                re.findall("(审理.*)", content('p').text().replace("\xa0\xa0", ""))).replace(
                "审理", "")
        else:
            item["plaintiff"] = "".join(
                re.findall("(审理.*诉)", content('p').text().replace(
                    "\xa0\xa0", ""))).replace("审理", "").replace("诉", "")
            item["defendant"] = "".join(
                re.findall("(诉.*等)", content('p').text().replace(
                    "\xa0\xa0", ""))).replace("诉", "").replace("等", "")
        item['site_name'] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_html(self, html):
    """Parse the announcement table, fetch each detail page, and return
    (BulletinCourt list, total page count extracted from the '末页' link)."""
    doc = pq(html)
    # fallback default: without it a page lacking a '末页' link raised
    # NameError at the final return
    total_page = "1"
    for page in doc('a').items():
        if page.text() == "末页":
            total_page = "".join(re.findall("\d{1,3}", page.attr.onclick))
    lis = doc('table.tabData a').items()
    object_list = list()
    for x in lis:
        self.http.http_session(x.attr.href, "get", headers=self.headers)
        htm = self.http.parse_html()
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        doc = pq(htm)
        content = doc('div.print-box')
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = "".join(
            re.findall("发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                       content.text())).replace("发表日期:", "")
        item["title"] = x.attr.title
        item["bulletin_way"] = t_way
        item["court_y"] = content('h3').text()
        item["court_t"] = "".join(
            re.findall("(在.*依法)",
                       content('p').text())).replace("在", "").replace("依法", "")
        item["start_court_t"] = "".join(
            re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                       content('p').text())).replace("年", "-").replace(
            "月", "-").replace("日", "").replace("时", ":")
        item["court_num"] = "".join(
            re.findall("(审理.*案件)",
                       content('p').text())).replace("审理", "").replace("案件", "")
        item["court_part"] = "".join(
            re.findall("(在.*依法)",
                       content('p').text())).replace("在", "").replace("依法", "")
        item["site_name"] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_html(self, html):
    """Parse the list page, fetch each detail page, and return
    (BulletinCourt list, total page count); only items dated after today
    are kept."""
    doc = pq(html)
    page_list = doc('a.zt_02').items()
    # total_page is the largest numeric pager link, minimum 10
    total_page = 10
    for page in page_list:
        if int(page.text()) > total_page:
            total_page = int(page.text())
    lis = doc('div.text ul li a').items()
    object_list = list()
    for x in lis:
        item = dict()
        self.http.http_session(x.attr.href, "get", headers=self.headers)
        htm = self.http.parse_html()
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        doc = pq(htm)
        content = doc('div.text')
        item["taskid"] = self.task_id
        item["release_date"] = content('h2').text()[3:13]
        item["title"] = content('h1').text()
        item["bulletin_way"] = t_way
        item["court_y"] = "".join(
            re.findall("(在.*法院)", content('h1').text())).replace("在", "")
        item["court_t"] = "".join(
            re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace(
            "开庭", "")
        item["start_court_t"] = "".join(
            re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text()))
        item["plaintiff"] = "".join(
            re.findall("(审理.*诉)",
                       content("p").text())).replace("审理", "").replace("诉", "")
        item["site_name"] = self.site_name
        date = get_today_date()
        # int() replaces eval(): same numeric comparison without executing
        # scraped text; the redundant second file_out() call that re-wrote the
        # same dump file inside this branch was removed.
        if int("".join(re.findall("\d{4}-\d{2}-\d{2}", x.text())).replace(
                "-", "")) > int(date):
            # map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_html(self, html):
    """Parse the Fujian court list page and return
    (BulletinCourt list, total page count, ASP.NET __VIEWSTATE value);
    only items dated after today are kept."""
    doc = pq(html)
    # total_page comes from the '>>' pager link, minimum 10
    total_page = 10
    for page in doc('a.pagination').items():
        if page.text() == ">>":
            total_page = int("".join(re.findall("\d{2,3}", page.attr.href)))
    # hidden form state needed by the caller for the next POST
    VIEWSTATE = doc("div.aspNetHidden input").attr.value
    lis = doc('ul.module-case-items li').items()
    object_list = list()
    for x in lis:
        self.http.http_session("https://www.fjcourt.gov.cn" + x('a').attr.href,
                               "get", headers=self.headers, verify=False)
        htm = self.http.parse_html()
        doc = pq(htm)
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        content = doc('div.article-wrap')
        item = dict()
        item["taskid"] = self.task_id
        item["title"] = content('p.article-hd-title').text()
        item["bulletin_way"] = t_way
        item["court_y"] = content('span.article-author').text()
        item["court_t"] = "".join(
            re.findall("(在.*公开)", content('div.article-content').text())).replace(
            "在", "").replace("公开", "")
        item["start_court_t"] = x('span.cir-time').text().replace(
            "[", "").replace("]", "")
        item["court_part"] = "".join(
            re.findall("(在.*公开)", content('div.article-content').text())).replace(
            "在", "").replace("公开", "")
        item["site_name"] = self.site_name
        pub_time = (item["start_court_t"].replace("-", ""))
        date = get_today_date()
        # int() replaces eval(): same numeric comparison without executing
        # scraped, untrusted text
        if int(pub_time) > int(date):
            # map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
    # return the object list, the total page count and the viewstate
    return object_list, total_page, VIEWSTATE
def parse_html(self, html):
    """Parse the list page, fetch each detail page, and return
    (BulletinCourt list, total page count)."""
    doc = pq(html)
    # total_page is the largest numeric pager link, minimum 10
    total_page = 10
    for page in doc('div.turn_page a.zt_02').items():
        if int(page.text()) > total_page:
            total_page = int(page.text())
    lis = doc('ul.sswy_news li').items()
    object_list = list()
    for x in lis:
        self.http.http_session(x('a').attr.href, "get", headers=self.headers)
        htm = self.http.parse_html()
        # dump the raw detail page to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(htm))
        doc = pq(htm)
        content = doc('div.ywzw_con_inner')
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = "".join(
            re.findall("\d{4}-\d{2}-\d{2}", content('p.p_source ').text()))
        item["title"] = x('a').attr.title
        item["bulletin_way"] = t_way
        item["court_y"] = content('h3.h3_title').text()
        item["court_t"] = "".join(
            re.findall("(在.*依法)",
                       content('p').text())).replace("在", "").replace("依法", "")
        item["start_court_t"] = "".join(
            re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title))
        item["court_part"] = "".join(
            re.findall("(在.*依法)",
                       content('p').text())).replace("在", "").replace("依法", "")
        item["site_name"] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse(self):
    """Crawl the Tianjin court site: up to 29 pages, stopping early when
    is_c() reports no further content. Each entry's detail page is fetched,
    dumped to a task file, and stored as a BulletinCourt row."""
    log.info('开始抓取天津法院网')
    ct = 1
    while ct < 30:
        log.info('开始抓取天津法院网第{page}页信息'.format(page=str(ct)))
        self.http.http_session(self.url.format(page=str(ct)), 'get',
                               headers=self.http.headers)
        try:
            r = self.http.parse_html()
            log.info('解析天津法院网第{page}页信息'.format(page=str(ct)))
            p_list = self.parse_list(r)
            # continuation flag: 0 means this was the last page
            ic = self.is_c(r)
            object_list = list()
            # enumerate() fixes the original's inconsistent numbering
            # (index(i)+1 in one log, index(i) in others) and is O(1) per item
            for strip, i in enumerate(p_list, start=1):
                try:
                    log.info('开始抓取天津法院网第{page},第{strip}条信息'.format(
                        page=str(ct), strip=str(strip)))
                    d_url = 'http://tjfy.chinacourt.org' + i['det_url']
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    rl = self.http.parse_html()
                    log.info('解析天津法院网第{page},第{strip}条信息'.format(
                        page=str(ct), strip=str(strip)))
                    self.parse_info(rl, i)
                    log.info('写出天津法院网第{page},第{strip}条信息'.format(
                        page=str(ct), strip=str(strip)))
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, i['html'])
                    i['bulletin_way'] = t_way
                    i.pop('det_url')
                    i.pop('html')
                    b = BulletinCourt(**i)
                    object_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, self.url)
            # the stray strip= argument was dropped here: the format string has
            # no {strip}, and referencing the loop variable after the loop
            # raised NameError on an empty page
            log.info('存储天津法院网第{page}页数据'.format(page=str(ct)))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            if ic == 0:
                break
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
        ct += 1
    self.mysql_client.session_close()
    log.info('开始抓取天津法院网结束')
def parse_html(self, html):
    """Parse the Hubei high-court announcement table (all data is on the list
    page itself — no detail fetch) and return
    (BulletinCourt list, total page count)."""
    # build the dump-file path and write the raw page
    t_way = self.task_id + str(time.time()) + '.txt'
    file_out(t_way, str(html))
    doc = pq(html)
    # page count sits inside '共 N 页 上' in the pager span text
    total_page = "".join(
        re.findall("共.*页\s上", doc('span').text().replace("\n", "")))[1:3]
    lis = doc('table.newlisttable tr').items()
    object_list = list()
    for content in lis:
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = "".join(
            re.findall("(\(.*\))", content('td').text()))[1:-1]
        item["title"] = content('a').text()
        item["bulletin_way"] = t_way
        # rows whose body starts with '本院定于' belong to the provincial high
        # court itself; otherwise the first 4 chars name the court
        item["court_y"] = "湖北省高级人民法院" if content(
            'p').text()[:4] == "本院定于" else content('p').text()[:4]
        item["court_t"] = "".join(
            re.findall("(在.*判庭)", content('p').text())).replace("在", "")
        item["start_court_t"] = "".join(
            re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                       content('p').text())).replace("年", "-").replace(
            "月", "-").replace("日", "")
        item["plaintiff"] = "".join(
            re.findall("(原告:.*;)", content('p').text())).replace("原告:", "")
        item["defendant"] = "".join(
            re.findall("(被告:.*的)", content('p').text())).replace("被告:",
                                                                  "").replace("的", "")
        item["site_name"] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_html(self, html):
    """Parse the Shaanxi court list page, fetch each detail page, and return
    (BulletinCourt list, total page count)."""
    doc = pq(html)
    # the 6th pagination link carries the last-page number in its href
    page = doc('div.paginationControl a').eq(5).attr.href
    total_page = "".join(re.findall("\d{1,3}", page))
    lis = doc('span.left').items()
    object_list = list()
    for x in lis:
        self.http.http_session("http://sxfy.chinacourt.org" + x('a').attr.href,
                               "get", headers=self.headers)
        htm = self.http.parse_html()
        doc = pq(htm)
        content = doc('div.detail')
        # dump the detail content to a timestamped task file
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(content))
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = "".join(
            re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                       content('div.sth_a').text()))
        item["title"] = content('div.b_title').text()
        item["bulletin_way"] = t_way
        item["court_y"] = "陕西省高级人民法院"
        item["court_t"] = "".join(
            re.findall("(在.{1,10}公开)", content('div').text())).replace(
            "在", "").replace("公开", "")
        item["court_part"] = "".join(
            re.findall("(在.{1,10}公开)", content('div').text())).replace(
            "在", "").replace("公开", "")
        item["site_name"] = self.site_name
        # map the item dict onto the ORM object
        b = BulletinCourt(**item)
        object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse_list(self, json_data, form):
    """Parse one page of the Jiangxi trial-broadcast JSON feed into
    BulletinCourt rows; the raw JSON is dumped to a timestamped task file."""
    log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo'])))
    t_way = self.task_id + str(time.time()) + '.txt'
    file_out(t_way, str(json_data))
    object_list = list()
    case_list = json_data["message"]["result"]
    for case in case_list:
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = case.get("lastBroadcastTimeString")  # release date
        item["title"] = get_content(case.get("caseName"))  # title
        item["court_y"] = get_content(case.get("belongOrgName"))  # court
        item["court_t"] = get_content(case.get("openCourtAddr"))  # tribunal
        item["start_court_t"] = get_content(
            case.get("openCourtDateString"))  # hearing date
        item["court_num"] = get_content(case.get("caseNo"))  # case number
        item["case_type"] = get_content(case.get("caseTypeString"))  # case type
        item["court_case"] = get_content(
            case.get("causePlacedOnFile"))  # cause of action
        item["trial_cause"] = get_content(
            case.get("underJustice")).strip()  # judges
        try:
            # split "原告:...被告:..." into the two parties
            dex = case["litigants"].index("被告:")
            item["plaintiff"] = case["litigants"][:dex].replace(
                "原告:", "")[:-1]  # plaintiff
            item["defendant"] = case["litigants"][dex:].replace("被告:", "")  # defendant
        except Exception:
            # was a bare `except:`; narrowed so Ctrl-C etc. still propagate.
            # Fallback: no '被告:' marker (or litigants missing/None) — keep
            # the raw value as defendant.
            item["plaintiff"] = ""
            item["defendant"] = case.get("litigants")
        item["site_name"] = self.site_name  # site name
        item['bulletin_way'] = t_way
        b = BulletinCourt(**item)
        object_list.append(b)
    return object_list
def parse_list(self, json_data, form):
    """Parse one page of the hearing JSON feed ("data" list) into
    BulletinCourt rows; the raw JSON is dumped to a timestamped task file.

    form: request form data; form['currentPageNo'] is used only for logging.
    """
    log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo'])))
    t_way = self.task_id + str(time.time()) + '.txt'
    file_out(t_way, str(json_data))
    object_list = list()
    case_list = json_data["data"]
    for case in case_list:
        item = dict()
        item["taskid"] = self.task_id
        item["release_date"] = get_content(case.get("createDate"))
        item["court_y"] = get_content(case.get("belongOrgName"))  # court
        item["court_t"] = get_content(case.get("trialCourt"))  # tribunal
        item["start_court_t"] = get_content(case.get("courtTime"))  # hearing date
        item["court_num"] = get_content(case.get("caseNo"))  # case number
        item["court_case"] = get_content(case.get("caseDesc"))  # cause of action
        item["trial_cause"] = get_content(case.get("judge")).strip()  # judges
        item["site_name"] = self.site_name  # site name
        item['bulletin_way'] = t_way
        b = BulletinCourt(**item)
        object_list.append(b)
    return object_list
def parse_list(self, json_data, form):
    """Parse one page of the announcement JSON feed, keeping only entries
    whose title contains '开庭公告'; each kept entry triggers a detail POST.

    Returns a list of BulletinCourt rows.
    """
    log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))
    object_list = list()
    case_list = json_data["data"]
    for case in case_list:
        if "开庭公告" in html.unescape(case["ggbt"]):
            item = dict()
            item["release_date"] = case["clsj"]
            formdata = {
                "ggsdid": "{}".format(str(case['ggsdid'])),
                "ssfy": "{}".format(str(case['fydm']))
            }
            ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
            self.http.http_session(ur, "post", data=formdata,
                                   headers=self.headers)
            # renamed from `json`: the local shadowed the json module name
            detail = self.http.parse_json()["data"]
            item["taskid"] = self.task_id
            item["release_date"] = html.unescape(detail.get("CLSJ"))
            item["title"] = html.unescape(detail.get("GGBT"))
            item["court_y"] = get_content(detail.get("SSFYMC"))  # court
            content = html.unescape(detail.get("GGNR"))
            # dump the announcement body to a timestamped task file
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(content))
            item["court_t"] = "".join(
                re.findall("法院.{1,10}庭", content)).replace("法院", "")
            item["court_num"] = html.unescape(detail.get("AH"))  # case number
            item["trial_cause"] = html.unescape(
                detail.get("CBRXM").strip())  # judges
            item['bulletin_way'] = t_way
            item["site_name"] = self.site_name
            b = BulletinCourt(**item)
            object_list.append(b)
    # fix: the original had no return, so callers always received None while
    # every sibling parse_list returns its object list
    return object_list
def parse_html(self, html):
    """Parse the Qinghai court list page, fetch each '开庭' detail page, and
    return (BulletinCourt list, total page count); only items released after
    2018-01-01 are kept."""
    doc = pq(html)
    # page count sits inside '共 N 页' in the pager cell text
    total_page = "".join(re.findall("共\s.*\s页", doc("td.td_pagebar").text())).replace(
        "共", "").replace("页", "").strip()
    lis = doc('td.td_line').items()
    object_list = list()
    for x in lis:
        if "开庭" in x.text():
            self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href,
                                   "get", headers=self.headers)
            htm = self.http.parse_html()
            doc = pq(htm)
            content = doc
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}", content("p").text()))
            item["title"] = x.text()
            t_way = self.task_id + str(time.time()) + '.txt'
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall(".{2,10}人民法院", content('span.detail_content').text()))
            item["court_t"] = "".join(
                re.findall("(在.{2,10}公开)", content('span.detail_content').text())
            ).replace("在", "").replace("公开", "").replace("依法", "")
            item["court_part"] = "".join(
                re.findall("(在.{2,10}公开)", content('span.detail_content').text())
            ).replace("在", "").replace("公开", "").replace("依法", "")
            item["site_name"] = self.site_name
            # int() replaces eval(): same numeric comparison without executing
            # scraped, untrusted text
            if int(item["release_date"].replace("-", "")) > 20180101:
                # the dump file is only written for rows that pass the cutoff
                file_out(t_way, str(htm))
                # map the item dict onto the ORM object
                b = BulletinCourt(**item)
                object_list.append(b)
    # return the object list and the total page count
    return object_list, total_page
def parse(self):
    """Crawl the Jilin high-court judicial-openness site: page 1 first, then
    pages 2..p_total-1 via the pager's 'next' href template. Each entry's
    detail page is dumped to a task file and stored as a BulletinCourt row."""
    log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get',
                           headers=self.http.headers)
    r = self.http.parse_html()
    log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
    doc = pq(r)
    skip = doc('div.turn_page').children('p').children('a.sp_next')
    # fix: the original .replace('&', '&') was a no-op (mangled escape);
    # the href is HTML-escaped, so '&amp;' must become '&'
    nurl = 'http://www.jlsfy.gov.cn' + skip.attr('href').replace('&amp;', '&')\
        .replace('pagecur=1', 'pagecur={pageno}')
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = p['det_url']
            log.info('开始抓取吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('解析吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total):
        try:
            # log texts fixed below: original dropped the '吉' ('林省…')
            log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
            self.http.http_session(nurl.format(pageno=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = p['det_url']
                    log.info('开始抓取吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('解析吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
    self.mysql_client.session_close()
    log.info('抓取吉林省高级人民法院司法公开网结束')
def parse(self):
    """Crawl the Hebei court site: page 1 first, then pages 2..p_total-1 via
    the pager's 'next' href template, stopping early once the newest entry on
    a page is older than today. Each entry's detail page is dumped to a task
    file and stored as a BulletinCourt row."""
    log.info('开始抓取河北法院网第{page}页信息'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get',
                           headers=self.http.headers)
    r = self.http.parse_html()
    log.info('解析河北法院网第{page}页信息'.format(page='1'))
    doc = pq(r)
    skip = doc('div.turn_page').children('p').children('a.sp_next')
    nurl = 'http://hbgy.hbsfgk.org' + skip.attr('href').replace(
        'pagecur=1', 'pagecur={pageno}')
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = p['det_url']
            log.info('开始抓取河北法院网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('解析河北法院网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('存储河北法院网第{page}页数据'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total):
        try:
            log.info('开始抓取河北法院网第{page}页信息'.format(page=str(total)))
            self.http.http_session(nurl.format(pageno=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            log.info('解析河北法院网第{page}页信息'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = p['det_url']
                    log.info('开始河北法院网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('解析河北法院网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('存储河北法院网第{page}页数据'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
        # stop paging once the newest entry date on this page is before today
        today = datetime.datetime.strptime(get_today_date(), "%Y-%m-%d")
        newest = datetime.datetime.strptime(self.get_n_t(r), "%Y-%m-%d")
        # direct comparison replaces the original `fg = ...; if fg == True:`
        if today > newest:
            break
    self.mysql_client.session_close()
    log.info('抓取河北法院网结束')
def parse(self):
    """Crawl the Chongqing court public-service site over a one-year window
    (today .. today+365d): page 1 first, then pages 2..p_total-1. Each
    entry's detail page is dumped to a task file and stored as a
    BulletinCourt row."""
    log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page='1'))
    ts = datetime.date.today()
    tm = datetime.date.today() + datetime.timedelta(days=365)
    self.http.http_session(self.url.format(end=str(tm), start=str(ts), page='1'),
                           'get', headers=self.http.headers)
    # strip the decorative '►' markers before parsing
    r = self.http.parse_html().replace('►', '')
    log.info('解析重庆法院公共服务网第{page}页信息'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[
                'det_url']
            log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    # fix: this log wrongly said '天津法院网' (copy-paste from another spider)
    log.info('存储重庆法院公共服务网第{page}页数据'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    # debug print(p_total) removed
    p_total = self.page_total(r)
    for total in range(2, p_total):
        try:
            log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page=str(total)))
            self.http.http_session(self.url.format(end=str(tm), start=str(ts),
                                                   page=str(total)),
                                   'get', headers=self.http.headers)
            r = self.http.parse_html().replace('►', '')
            log.info('解析重庆法院公共服务网第{page}页信息'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[
                        'det_url']
                    log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('存储重庆法院公共服务网第{page}页数据'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(
                m, self.taskid, self.site_name,
                self.url.format(end=str(tm), start=str(ts), page=str(total)))
    self.mysql_client.session_close()
    log.info('抓取重庆法院公共服务网结束')
def parse(self):
    """Crawl the Heilongjiang court site over a one-year window
    (today .. today+365d): page 1 first (gb2312-decoded), then pages
    2..p_total-1. Each entry's detail page is dumped to a task file and
    stored as a BulletinCourt row."""
    log.info('开始抓取黑龙江法院网第{page}页信息'.format(page='1'))
    ts = datetime.date.today()
    tm = datetime.date.today() + datetime.timedelta(days=365)
    self.http.http_session(self.url.format(page='1', start=str(ts), end=str(tm)),
                           'get', headers=self.http.headers)
    # the site serves gb2312-encoded pages
    self.http.set_charset('gb2312')
    r = self.http.parse_html()
    # debug print(r) removed
    log.info('解析抓取黑龙江法院网第{page}页信息'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
            log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
            # NOTE(review): unlike the page-2+ loop below, the first failed
            # item aborts the whole first page here — confirm intentional
            break
    log.info('存储黑龙江法院网第{page}页数据'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total):
        try:
            log.info('开始抓取黑龙江法院网第{page}页信息'.format(page=str(total)))
            self.http.http_session(self.url.format(page=str(total), start=str(ts),
                                                   end=str(tm)),
                                   'get', headers=self.http.headers)
            r = self.http.parse_html()
            log.info('解析黑龙江法院网第{page}页信息'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p[
                        'det_url']
                    log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('存储黑龙江法院网第{page}页数据'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(
                m, self.taskid, self.site_name,
                self.url.format(end=str(tm), start=str(ts), page=str(total)))
    self.mysql_client.session_close()
    log.info('抓取黑龙江法院网结束')