Code Example #1
File: spider.py  Project: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "sichuan"
        self.site_name = "四川法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""}

        url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['page'])))
        self.http.set_charset("unicode")
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['page'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['page'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['page'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

                # Currently in test mode: only the first two pages are crawled; remove the break before going live

                # break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['page'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # Parse the fetched JSON
        log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))

        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" in html.unescape(case["ggbt"]):
                item = dict()
                item["release_date"] = case["clsj"]
                formdata = {
                    "ggsdid": "{}".format(str(case['ggsdid'])),
                    "ssfy": "{}".format(str(case['fydm']))
                }
                ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
                self.http.http_session(ur,
                                       "post",
                                       data=formdata,
                                       headers=self.headers)
                json = self.http.parse_json()["data"]
                item["taskid"] = self.task_id
                item["release_date"] = html.unescape(json.get("CLSJ"))
                item["title"] = html.unescape(json.get("GGBT"))
                item["court_y"] = get_content(json.get("SSFYMC"))  # 法院
                content = html.unescape(json.get("GGNR"))
                t_way = self.task_id + str(time.time()) + '.txt'
                file_out(t_way, str(content))
                item["court_t"] = "".join(re.findall("法院.{1,10}庭",
                                                     content)).replace(
                                                         "法院", "")
                item["court_num"] = html.unescape(json.get("AH"))  # 案号
                item["trial_cause"] = html.unescape(
                    json.get("CBRXM").strip())  # 审判人员
                item['bulletin_way'] = t_way
                item["site_name"] = self.site_name
                b = BulletinCourt(**item)
                object_list.append(b)
        return object_list
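
The parse() above follows a fetch → parse → store loop over numbered pages. The snippet below is a minimal, self-contained sketch of that POST-and-paginate pattern using only the requests library; the project's HttpRequest and MySQL helpers are not reproduced, so the function and parameter names here are illustrative assumptions, not the project's API.

import requests

def fetch_json_pages(url, form, headers, total_pages):
    # Illustrative sketch only: POST the form for pages 1..total_pages and
    # collect the JSON payloads, mirroring the loop in Spider.parse() above.
    pages = []
    with requests.Session() as session:
        for page in range(1, total_pages + 1):
            form["page"] = str(page)
            resp = session.post(url, data=form, headers=headers, timeout=30)
            if resp.status_code == 200:
                pages.append(resp.json())
    return pages
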
Code Example #2
File: spiderr.py  Project: cnb2cd/Spider_app
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "qinghai"
        self.site_name = "青海法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers
        self.http.set_charset("gbk")

    def parse(self):

        form = {
            "p": "1",
            "LocationID": "0700000000",
            "sub": ""
        }

        url = "http://qhfy.chinacourt.org/fygg/index.php"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()
            for i in range(2, int(total_page)+1):
                try:
                    form["p"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
                        # Insert the object list into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取{},第{}页异常".format(self.site_name, (
                            form['p'])), self.task_id, url, self.site_name)
            #
                except Exception:
                    # Record the exception
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Currently in test mode: only the first two pages are crawled; remove the break before going live
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))


    def added_parse(self):
        pass

    def parse_html(self, html):
        # Parse the HTML

        doc = pq(html)
        # print(doc("td.td_pagebar").text())
        total_page = "".join(re.findall("共\s.*\s页", doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            if "开庭" in x.text():
                self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(re.findall("\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text()))
                item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                # item["start_court_t"] = "".join(re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title))
                item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # print(item)
                if eval(item["release_date"].replace("-", "")) > eval("20180101"):
                    file_out(t_way, str(htm))
                    # Map the item dict to an ORM object
                    b = BulletinCourt(**item)
                    object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
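
parse_html() above reads the total page count out of the pager cell's text ("共 N 页"). A standalone sketch of that extraction, runnable on a literal string (the sample text is illustrative only):

import re

pager_text = "首页 上一页 下一页 尾页 共 27 页"  # illustrative pager text
total_page = "".join(re.findall(r"共\s.*\s页", pager_text)).replace("共", "").replace("页", "").strip()
print(int(total_page))  # 27
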
Code Example #3
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "hunan"
        self.site_name = "湖南法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        page = 1
        url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取==============湖南法院网")
        log.info("开始抓取==============湖南法院网,第{}页".format(page))
        self.http.http_session(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============山西法院诉讼服务网,第{}页".format(page))
            # 将对象列表插入数据库
            self.mysql_client.session_insert_list(object_list)
            # 提交
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                page = i
                try:
                    log.info("开始抓取==============湖南法院网,第{}页".format(page))
                    url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format(
                        page)
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info(
                            "开始存储==============湖南法院网,第{}页".format(page))
                        # Insert the object list into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖南法院网,第{}页异常".format(page),
                                        self.task_id, url, self.site_name)
                except Exception:
                    # Record the exception
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Currently in test mode: only the first two pages are crawled; remove the break before going live
                break

        else:
            SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url,
                            self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取湖南法院网结束")

    def parse_html(self, html):
        # Parse the HTML

        doc = pq(html)
        page_lis = doc('a').items()
        for pag in page_lis:
            if pag.text() == "尾页":
                total_page = "".join(re.findall("(\d*.shtml)",
                                                pag.attr.href)).replace(
                                                    ".shtml", "")
        lis = doc('div.font14 li').items()
        # Create the list of ORM objects
        object_list = list()
        for x in lis:
            # Build the item dict
            item = dict()
            item["release_date"] = x('span.right').text()
            self.http.http_session("http://hunanfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build the output file path
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched HTML to the file
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.detail')
            item["taskid"] = self.task_id
            item["title"] = content('div.detail_bigtitle').text()
            item["court_y"] = "".join(
                re.findall("在.*法院",
                           content('div.detail_txt').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("刑.*庭",
                           content('div.detail_txt').text()))
            item["start_court_t"] = "".join(
                re.findall("本院定于\d{4}年.{1,5}日",
                           content('div.detail_txt').text())).replace(
                               "年", "-").replace("月",
                                                 "-").replace("日", "").replace(
                                                     "本院定于", "")
            item["court_num"] = "".join(
                re.findall("审理.*号",
                           content('div.detail_txt').text())).replace(
                               "审理", "")
            item["trial_cause"] = "".join(
                re.findall("合议庭成员.*\s",
                           content('div.detail_txt').text())).replace(
                               "合议庭成员:", "").replace("\n", "")
            item["court_part"] = "".join(
                re.findall("在.*法院",
                           content('div.detail_txt').text())).replace("在", "")
            item['site_name'] = self.site_name

            # Map the item dict to an ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
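
start_court_t above is normalized from a phrase of the form "本院定于YYYY年M月D日…" in the detail text. A self-contained sketch of that normalization (the sample sentence is illustrative only):

import re

text = "本院定于2019年3月15日在第二审判庭公开开庭审理"  # illustrative sentence
raw = "".join(re.findall(r"本院定于\d{4}年.{1,5}日", text))
start_court_t = raw.replace("年", "-").replace("月", "-").replace("日", "").replace("本院定于", "")
print(start_court_t)  # 2019-3-15
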
Code Example #4
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "guangdong"
        self.site_name = "广东法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {
            "action": "gotoggxxcx",
            "gglx": "ktgg",
            "flag": "first"
        }

        url = "http://www.gdcourts.gov.cn/web/search"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['flag'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['flag'])))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name)

        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))


    def added_parse(self):
        pass

    def parse_html(self, html):
        # Parse the HTML

        # Build the output file path
        t_way = self.task_id + str(time.time()) + '.txt'
        # Write the fetched HTML to the file
        file_out(t_way, str(html.encode("utf8")))

        doc = pq(html)
        lis = doc('div.doclist tr').items()
        object_list = list()
        x_lis = list()
        for x in lis:
            x_lis.append(x)
        text_lis = list()
        for i in x_lis[1:]:
            text_lis = list()
            for text in i('td').items():
                text_lis.append(text.text())
            item = dict()
            item["taskid"] = self.task_id
            item["bulletin_way"] = t_way
            item["court_num"] = text_lis[0]
            item["court_pur"] = text_lis[1]
            item["court_part"] = text_lis[2]
            item["start_court_t"] = text_lis[3]
            item["court_end_t"] = text_lis[4]
            item["court_status"] = text_lis[5]
            item["site_name"] = self.site_name
            # Map the item dict to an ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list
        return object_list
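
parse_html() above walks the rows of div.doclist and maps each <td> to an item field by position. A minimal offline sketch of that pyquery pattern on a literal HTML fragment; the fragment and cell values are made up for illustration, only the item keys follow the code above:

from pyquery import PyQuery as pq

sample = """
<div class="doclist"><table>
<tr><td>案号</td><td>案由</td><td>开庭地点</td><td>开庭时间</td><td>结束时间</td><td>状态</td></tr>
<tr><td>(2018)粤01刑初1号</td><td>盗窃罪</td><td>第三法庭</td><td>2018-06-01 09:30</td><td>2018-06-01 11:30</td><td>正常</td></tr>
</table></div>
"""  # illustrative fragment only

rows = list(pq(sample)('div.doclist tr').items())
for row in rows[1:]:  # skip the header row, as the spider does
    cells = [td.text() for td in row('td').items()]
    item = {"court_num": cells[0], "court_pur": cells[1], "court_part": cells[2],
            "start_court_t": cells[3], "court_end_t": cells[4], "court_status": cells[5]}
    print(item)
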
Code Example #5
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "shanxi"
        self.site_name = "山西法院诉讼服务网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        form = {"channelId": "307", "listsize": "238", "pagego": "1"}

        url = "http://www.shanxify.gov.cn/ktggPage.jspx"
        log.info("开始抓取==============山西法院诉讼服务网")
        log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(str(
            form['pagego'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============山西法院诉讼服务网,第{}页".format(
                str(form['pagego'])))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info(
                            "开始存储==============抓取山西法院诉讼服务网,第{}页".format(i))
                        # Insert the object list into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)
                except Exception:
                    # Record the exception
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Currently in test mode: only the first two pages are crawled; remove the break before going live
                break

        else:
            SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(str(form['pagego'])),
                            self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取山西法院诉讼服务网结束")

    def parse_html(self, html):
        # Parse the HTML

        doc = pq(html)
        total_page = int(doc('a.zt_02').text()[-3:])
        lis = doc('div.text ul li a').items()
        # Create the list of ORM objects
        object_list = list()
        for x in lis:
            # Build the item dict
            item = dict()
            self.http.http_session(x.attr.href, "post", headers=self.headers)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build the output file path
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched HTML to the file
            file_out(t_way, str(htm))
            content = doc('div.text')
            item["taskid"] = self.task_id
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall("(在.*院)",
                           content('h1').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            item["start_court_t"] = x.text()[:16]
            if u"刑事" in item["title"]:
                item["defendant"] = "".join(
                    re.findall("(审理.*)",
                               content('p').text().replace("\xa0\xa0",
                                                           ""))).replace(
                                                               "审理", "")
            else:
                item["plaintiff"] = "".join(
                    re.findall("(审理.*诉)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("审理", "").replace("诉", "")
                item["defendant"] = "".join(
                    re.findall("(诉.*等)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("诉", "").replace("等", "")
            item['site_name'] = self.site_name
            # Map the item dict to an ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
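
For civil cases, parse_html() above pulls the parties out of a sentence like "……审理原告×××诉被告×××等……" with two regexes. A standalone sketch of that split (the sample sentence is illustrative only):

import re

text = "依法审理原告张某诉被告李某等民间借贷纠纷一案"  # illustrative sentence
plaintiff = "".join(re.findall(r"审理.*诉", text)).replace("审理", "").replace("诉", "")
defendant = "".join(re.findall(r"诉.*等", text)).replace("诉", "").replace("等", "")
print(plaintiff, defendant)  # 原告张某 被告李某
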
Code Example #6
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "fujian"
        self.site_name = "福建省高级人民法院法院公告"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        url = "https://www.fjcourt.gov.cn/page/public/courtreport.html"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, 1))
        self.http.http_requst(url, "get", headers=self.headers, verify=False)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page, VIEWSTATE = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, 1))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()

            for i in range(2, int(total_page) + 1):
                form = {
                    "__VIEWSTATE": VIEWSTATE,
                    "__VIEWSTATEGENERATOR": "54969BDC",
                    "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
                }
                try:
                    form["__EVENTARGUMENT"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['__EVENTARGUMENT'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page, VIEWSTATE = self.parse_html(
                            html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['__EVENTARGUMENT'])))
                        # Insert the object list into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['__EVENTARGUMENT'])),
                            self.task_id, url, self.site_name)
            #
                except Exception:
                    # Record the exception
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Currently in test mode: only the first two pages are crawled; remove the break before going live
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, 1),
                            self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):

        doc = pq(html)
        total_page = 10
        for page in doc('a.pagination').items():
            if page.text() == ">>":
                total_page = int("".join(re.findall("\d{2,3}",
                                                    page.attr.href)))
        VIEWSTATE = doc("div.aspNetHidden input").attr.value
        lis = doc('ul.module-case-items li').items()
        object_list = list()
        for x in lis:
            self.http.http_session("https://www.fjcourt.gov.cn" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers,
                                   verify=False)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build the output file path
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched HTML to the file
            file_out(t_way, str(htm))
            content = doc('div.article-wrap')
            item = dict()
            item["taskid"] = self.task_id
            item["title"] = content('p.article-hd-title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = content('span.article-author').text()
            item["court_t"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["start_court_t"] = x('span.cir-time').text().replace(
                "[", "").replace("]", "")
            item["court_part"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["site_name"] = self.site_name
            pub_time = (item["start_court_t"].replace("-", ""))
            date = get_today_date()
            if eval(pub_time) > eval(date):
                # Map the item dict to an ORM object
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the object list, the total page count and the current __VIEWSTATE
        return object_list, total_page, VIEWSTATE
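
Paging on this site is an ASP.NET WebForms postback: the hidden __VIEWSTATE from the previous response is posted back together with the pager's __EVENTTARGET and __EVENTARGUMENT, and a fresh __VIEWSTATE must be read from every response. A hedged sketch of that round trip with requests and pyquery instead of the project's HttpRequest wrapper (the generator and target values are copied from the code above; the function name and everything else is an assumption):

import requests
from pyquery import PyQuery as pq

def fetch_aspnet_page(session, url, viewstate, page_no, headers=None):
    # Illustrative sketch: post back the WebForms state plus the pager argument.
    form = {
        "__VIEWSTATE": viewstate,
        "__VIEWSTATEGENERATOR": "54969BDC",
        "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
        "__EVENTARGUMENT": str(page_no),
    }
    resp = session.post(url, data=form, headers=headers, verify=False, timeout=30)
    resp.raise_for_status()
    doc = pq(resp.text)
    next_viewstate = doc("div.aspNetHidden input").attr.value  # state for the next page
    return doc, next_viewstate
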
Code Example #7
class Spider(MainSpider):

    site_name = '新疆法院诉讼服务网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://220.171.35.30/ktggSearchResult.jspx?fyid=&ktdd=&page={page}'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page='1'))
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            d_url = p['det_url']
            log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(page=str(total)), 'get', headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    d_url = p['det_url']
                    log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total),
                                                                        strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total),
                                                                     strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
        self.mysql_client.session_close()
        log.info('抓取新疆法院诉讼服务网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        doc = pq(r)
        trs = doc('table tr')
        p_list = list()
        for i in range(1, trs.size()):
            item = dict()
            tr = trs.eq(i)
            td1 = tr('td').eq(1)
            item['det_url'] = td1('a').attr('href')
            item['title'] = td1('a').attr('title')
            item['court_y'] = tr('td').eq(2).text()
            item['start_court_t'] = tr('td').eq(3).text()
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        title = doc('title').text()
        con = doc('div.con')
        c_title = con('div.title').text()
        court = con('origin').text()
        p = con('div.content').children("p")
        c_html = ''
        for var in p.items():
            c_html += var.text() + '\r\n'
        html = title + '\r\n' + c_title + '\r\n' + court + '\r\n' + c_html
        item['html'] = html

    def page_total(self, res):
        try:
            doc = pq(res)
            jump = doc('div.jump div.skip').children('a')
            len = jump.eq(jump.length - 1)
            k = int(len.attr('onclick').replace('turnPage(', '').replace(')', ''))
            return k
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
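
page_total() above derives the page count from the last pager link's onclick handler, e.g. onclick="turnPage(57)". A standalone sketch of that parse (the sample attribute value is illustrative only):

onclick = "turnPage(57)"  # illustrative onclick value of the last pager link
total_pages = int(onclick.replace("turnPage(", "").replace(")", ""))
print(total_pages)  # 57
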
Code Example #8
class Spider(MainSpider):

    site_name = '天津法院网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://tjfy.chinacourt.org/article/index/id/MzDIMTCwMDAwNCACAAA%3D/page/{page}.shtml'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取天津法院网')
        ct = 1
        while ct < 30:
            log.info('开始抓取天津法院网第{page}页信息'.format(page=str(ct)))
            self.http.http_session(self.url.format(page=str(ct)), 'get', headers=self.http.headers)
            try:
                r = self.http.parse_html()
                log.info('解析天津法院网第{page}页信息'.format(page=str(ct)))
                p_list = self.parse_list(r)
                ic = self.is_c(r)
                object_list = list()
                for i in p_list:
                    try:
                        log.info('开始抓取天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                       strip=str(p_list.index(i)+1)))
                        d_url = 'http://tjfy.chinacourt.org' + i['det_url']
                        self.http.http_session(d_url, 'get', headers=self.http.headers)
                        rl = self.http.parse_html()

                        log.info('解析天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                     strip=str(p_list.index(i))))
                        self.parse_info(rl, i)
                        log.info('写出天津法院网第{page},第{strip}条信息'.format(page=str(ct),
                                                                     strip=str(p_list.index(i))))
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, i['html'])
                        i['bulletin_way'] = t_way
                        i.pop('det_url')
                        i.pop('html')
                        b = BulletinCourt(**i)
                        object_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, self.url)
                log.info('存储天津法院网第{page}页数据'.format(page=str(ct)))
                self.mysql_client.session_insert_list(object_list)
                self.mysql_client.session_commit()
                if ic == 0:
                    break
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
            ct += 1
        self.mysql_client.session_close()
        log.info('抓取天津法院网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        doc = pq(r)
        main = doc('div#main')
        ul = main('ul li').items()
        p_list = list()
        for l in ul:
            item = dict()
            hr = l('a').attr('href')
            title = l('a').attr('title')
            time = l('span.right').text()
            item["taskid"] = '111111111'
            item['det_url'] = hr
            item['start_court_t'] = time
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
        rr = pq(rs)
        det = rr('div.detail')
        tit = det('div.title')
        title = tit('div.b_title').text()
        txt = tit('div.sth_a span').eq(0).text()
        time = txt.split(':')[2].strip()
        cont = det('div.text').text()
        html = title + '\r\n' + txt + '\r\n' + cont
        item['release_date'] = time
        item['html'] = html
        item['title'] = title

    def is_c(self, res):
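        # Returns 1 if the pager shows a page after the current one (the page
        # number linked just before "下一页" is greater than the current page),
        # otherwise 0; any parse failure is treated as "keep going" (1).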
        try:
            doc = pq(res)
            d = doc('#category .paginationControl').eq(0)
            c = int(d('.current').text())
            a = d('a')

            count = 0
            for var in a.items():
                count = count + 1
                s = var.text()
                if s == '下一页':
                    break

            t = a.eq(count - 2)
            ts = int(t.text())
            if ts <= c:
                return 0
            else:
                return 1
        except Exception:
            return 1
Code Example #9
class Spider(MainSpider):

    site_name = '内蒙古自治区高级人民法院司法公开网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.nmgfy.gov.cn/fygg/index.jhtml'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a')
        nurl = 'http://www.nmgfy.gov.cn' + skip.eq(skip.length - 1).attr('href').replace('&amp;', '&')\
            .replace('pagecur=1', 'pagecur={pageno}')
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = p['det_url']
                log.info('开始抓取内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        log.info('存储内蒙古自治区高级人民法院司法公开网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info(
                    '开始抓取内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info(
                    '解析内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = p['det_url']
                        log.info(
                            '开始抓取内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format(
                                page=str(total),
                                strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info(
                            '解析内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format(
                                page=str(total),
                                strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)
                log.info(
                    '存储内蒙古自治区高级人民法院司法公开网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)

        self.mysql_client.session_close()
        log.info('抓取内蒙古自治区高级人民法院司法公开网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        p_list = list()
        doc = pq(r)
        sec = doc('ul.sswy_news').children('li')
        for var in sec.items():
            item = dict()
            det_url = var('a').attr('href')
            title = var('a').attr('title')
            start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
                                      title).group()
            item['det_url'] = det_url
            item['title'] = title
            item['start_court_t'] = start_court_t
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
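        # Detail page layout: div.ywzw_con_inner holds a source line (the text
        # before " 来源:" is the release date), the court name in h3.h3_title,
        # and the notice body; the hearing date is read from a "yyyy年mm月dd"
        # pattern inside p.p_text.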
        doc = pq(rs)
        con = doc('div.ywzw_con_inner')
        p_source = con('p.p_source').text()
        title = con('h3.h3_title').text()
        release_date = p_source.split(' 来源:')[0].strip()
        p_notice = con('p.p_notice').text()
        p_text = con('p.p_text').text()
        start_court_t = re.search('\d{4}年\d{2}月\d{2}', p_text).group().replace(
            '年', '-').replace('月', '-')
        p_tcgg = con('p.tcgg').text()
        p_date = con('p.p_date').text()
        court_y = title
        html = p_source.replace(
            '\u3000', ' '
        ) + '\r\n' + title + '\r\n' + p_notice + '\r\n' + p_text + '\r\n' + p_tcgg + '\r\n' + p_date
        item['release_date'] = release_date
        item['html'] = html
        item['court_y'] = court_y
        item['start_court_t'] = start_court_t

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
Code Example #10
class Spider(MainSpider):

    site_name = '重庆法院公共服务网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.cqfygzfw.com/court/gg_listgg.shtml?gg.endDate={end}&gg.startDate={start}' \
                   '&gg.fydm=&gg.ggnr=&page={page}'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page='1'))
        ts = datetime.date.today()
        tm = datetime.date.today() + datetime.timedelta(days=365)
        self.http.http_session(self.url.format(end=str(tm),
                                               start=str(ts),
                                               page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html().replace('&#9658', '')
        log.info('解析重庆法院公共服务网第{page}页信息'.format(page='1'))
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[
                    'det_url']
                log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)

        log.info('存储重庆法院公共服务网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        print(p_total)
        for total in range(2, p_total):
            try:
                log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(end=str(tm),
                                                       start=str(ts),
                                                       page=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html().replace('&#9658', '')
                log.info('解析重庆法院公共服务网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[
                            'det_url']
                        log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)

                log.info('存储重庆法院公共服务网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()

            except Exception:
                m = traceback.format_exc()
                SpiderException(
                    m, self.taskid, self.site_name,
                    self.url.format(end=str(tm),
                                    start=str(ts),
                                    page=str(total)))
        self.mysql_client.session_close()
        log.info('抓取重庆法院公共服务网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
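        # Each result row holds court name, case number and hearing time; the
        # detail-page id is taken from the openKtgg('...') onclick handler of
        # the case-number cell.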
        doc = pq(r)
        tb = doc('div.r_wenben table.table_ys tbody')
        trs = tb('tr')
        info_list = list()
        for tr in trs.items():
            item = dict()
            tds = tr('td')
            cy = tds.eq(0).text()
            cn = tds.eq(1).text().strip()
            st = tds.eq(2).text()
            du = tds.eq(1).children('a').attr('onclick').replace(
                'openKtgg(\'', '').replace('\')', '').strip()
            item['court_y'] = cy
            item['court_num'] = cn
            item['start_court_t'] = st
            item['det_url'] = du
            info_list.append(item)
        return info_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        title = doc('div.tc_window_bt').text()
        case_num = doc('td.tc_td01').text()
        content = doc('table.table_ys2 tr').eq(1).children('td').text()
        html = title + '\r\n' + case_num + '\r\n' + content
        item['html'] = html
        item['title'] = title

    def page_total(self, res):
        try:
            str0 = int(
                re.search('共\d*条', res).group().replace('共',
                                                        '').replace('条', ''))
            connt = math.ceil(str0 / 15)
            return connt
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
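
page_total() above converts the record count ("共N条") into a page count at 15 records per page. A standalone sketch of that computation (the sample text is illustrative only):

import math
import re

page_text = "共123条"  # illustrative record-count text
records = int(re.search(r"共\d*条", page_text).group().replace("共", "").replace("条", ""))
total_pages = math.ceil(records / 15)
print(total_pages)  # 9
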
Code Example #11
File: spider.py  Project: cnb2cd/Spider_app
class Spider(MainSpider):

    site_name = '河北法院网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://hbgy.hbsfgk.org/ktgg/index.jhtml'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取河北法院网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析河北法院网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a.sp_next')
        nurl = 'http://hbgy.hbsfgk.org' + skip.attr('href').replace(
            'pagecur=1', 'pagecur={pageno}')
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = p['det_url']
                log.info('开始抓取河北法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析河北法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        log.info('存储河北法院网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取河北法院网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析河北法院网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = p['det_url']
                        log.info('开始抓取河北法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))

                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析河北法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))

                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)
                log.info('存储河北法院网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
            time0 = get_today_date()
            time1 = self.get_n_t(r)
            strftime0 = datetime.datetime.strptime(time1, "%Y-%m-%d")
            strftime1 = datetime.datetime.strptime(time0, "%Y-%m-%d")
            fg = strftime1 > strftime0
            if fg:
                break

        self.mysql_client.session_close()
        log.info('抓取河北法院网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        doc = pq(r)
        p_list = list()
        sec = doc('ul.sswy_news').children('li')
        for var in sec.items():
            item = dict()
            det_url = var('a').attr('href')
            title = var('a').attr('title')
            start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
                                      title).group()
            item['det_url'] = det_url
            item['title'] = title
            item['start_court_t'] = start_court_t
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        con = doc('div.ywzw_con_inner')
        p1 = con.children().eq(0).text()
        h3 = con.children().eq(1).text()
        p2 = con.children().eq(2).text()
        p3 = con.children().eq(3).text()
        p4 = con.children().eq(4).text()
        p5 = con.children().eq(5).text()
        html = p1 + '\r\n' + h3 + '\r\n' + p2 + '\r\n' + p3 + '\r\n' + p4 + '\r\n' + p5
        item['html'] = html
        item['court_y'] = h3
        item['release_date'] = re.search('\d{4}-\d{2}-\d{2}', p1).group()

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            if tpage > 500:
                return 500
            else:
                return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0

    def get_n_t(self, r):
        doc = pq(r)
        lst = doc('ul.sswy_news').children('li')
        li = lst.eq(lst.length - 1).children('a').attr('title')
        tm = re.search('\d{4}-\d{2}-\d{2}', li).group()
        return tm
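
The paging loop above stops early once today's date is later than the date on the last (oldest) item of the current page, as returned by get_n_t(). A standalone sketch of that cutoff comparison with datetime (the sample dates are illustrative only):

import datetime

today = "2018-09-30"           # illustrative stand-in for get_today_date()
oldest_on_page = "2018-09-12"  # illustrative stand-in for get_n_t(r)
stop = datetime.datetime.strptime(today, "%Y-%m-%d") > datetime.datetime.strptime(oldest_on_page, "%Y-%m-%d")
print(stop)  # True -> the spider breaks out of the paging loop
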
Code Example #12
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "gansu"
        self.site_name = "甘肃省高级人民法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {
            "channelId": "307",
            "listsize": "100",
            "pagecur": "0",
            "pagego": "add"
        }

        url = "http://gsgf.gssfgk.com/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    (form['pagecur'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagecur'])))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()
            form["listsize"] = total_page
            for i in range(1, int(total_page) + 1):
                try:
                    form["pagecur"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagecur'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagecur'])))
                        # 将对象列表插入数据库
                        self.mysql_client.session_insert_list(object_list)
                        # 提交
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagecur'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # 捕获异常
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])),
                self.task_id, url, self.site_name)
        # 关闭数据库链接
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        # 解析html

        doc = pq(html)
        page_list = doc('a.zt_02').items()
        total_page = 10
        for page in page_list:
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('div.text ul li a').items()
        object_list = list()
        for x in lis:
            item = dict()
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # 生成文件路径并写出页面内容
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.text')
            item["taskid"] = self.task_id
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall("(在.*法院)",
                           content('h1').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text()))
            item["plaintiff"] = "".join(
                re.findall("(审理.*诉)",
                           content("p").text())).replace("审理",
                                                         "").replace("诉", "")
            item["site_name"] = self.site_name
            date = get_today_date()
            if eval("".join(re.findall("\d{4}-\d{2}-\d{2}", x.text())).replace(
                    "-", "")) > eval(date):
                # 生成文件路径
                file_out(t_way, str(htm))

                # 将item字典映射成对象
                b = BulletinCourt(**item)
                object_list.append(b)
        # 返回对象列表和总页数
        return object_list, total_page
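
The filter near the end of parse_html() compares dates by eval()-ing the digit strings, which only works when both sides are clean digit runs. A sketch of the same check without eval, assuming get_today_date() returns a zero-padded "YYYYMMDD" string (as the original comparison implies); the anchor text below is hypothetical:

import re

def is_after_today(anchor_text, today_yyyymmdd):
    # anchor_text corresponds to x.text() in parse_html(); sample value is made up.
    m = re.search(r"\d{4}-\d{2}-\d{2}", anchor_text)
    if not m:
        return False
    return int(m.group().replace("-", "")) > int(today_yyyymmdd)

print(is_after_today("某某案 开庭时间 2020-03-05 09:30", "20200301"))  # True
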
コード例 #13
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "zhejiang"
        self.site_name = "浙江法院公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {
            "pageno": "1",
            "pagesize": "10",
            "cbfy": "全部",
            "dsr": "",
            "spz": "",
            "jarq1": "",
            "jarq2": ""
        }

        url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['pageno'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['pageno'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["pageno"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['pageno'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['pageno'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉

                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # 解析获取到的json
        log.info("开始解析{}第{}页".format(self.site_name, (form['pageno'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["list"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["court_y"] = get_content(case.get("FY"))  # 法院
            item["court_t"] = get_content(case.get("FT"))  # 法庭
            item["start_court_t"] = get_content(case.get("KTRQSTRING"))  # 开庭日期
            item["court_num"] = get_content(case.get("AH"))  # 案号
            item["court_case"] = get_content(case.get("AY"))  # 案由
            item["trial_cause"] = get_content(case.get("SPZ")).strip()  # 审判人员
            item["site_name"] = self.site_name  # 网站名称
            item['bulletin_way'] = t_way
            item["undertake_dep"] = get_content(case.get("CBBM"))
            item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "")
            item["defendant"] = get_content(case.get("BG")).replace("被告:", "")
            item["schedule_time"] = get_content(case.get("PQRQ"))
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # 获取总页数
        try:
            total_page = json_data["total"]
            return int(total_page) // 10
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
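
get_total_page() above floor-divides the reported total by the fixed pagesize of 10, so a trailing partial page is never fetched. If "total" is the record count, ceiling division keeps it; a small hedged sketch:

def total_pages(total_records, page_size=10):
    # Ceiling division without importing math: 101 records -> 11 pages.
    return -(-int(total_records) // page_size)

print(total_pages(101))  # 11
print(total_pages(100))  # 10
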
コード例 #14
0
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "sanxi"
        self.site_name = "陕西法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        page = 1
        url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, page))
        self.http.http_requst(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, page))
            # 将对象列表插入数据库
            self.mysql_client.session_insert_list(object_list)
            # 提交
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    page = i
                    url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
                        page)
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, page))
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, page))
                        # 将对象列表插入数据库
                        self.mysql_client.session_insert_list(object_list)
                        # 提交
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
                except Exception:
                    # 捕获异常
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
        # 关闭数据库链接
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        # 解析html

        doc = pq(html)
        page = doc('div.paginationControl a').eq(5).attr.href
        total_page = "".join(re.findall("\d{1,3}", page))
        lis = doc('span.left').items()
        object_list = list()
        for x in lis:
            self.http.http_session("http://sxfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()

            doc = pq(htm)
            content = doc('div.detail')
            # 生成文件路径并写出页面内容
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(content))
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           content('div.sth_a').text()))
            item["title"] = content('div.b_title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "陕西省高级人民法院"
            item["court_t"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["court_part"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["site_name"] = self.site_name
            # 将item字典映射成对象
            b = BulletinCourt(**item)
            object_list.append(b)
        # 返回对象列表和总页数
        return object_list, total_page
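
total_page above is rebuilt by joining every 1-3 digit run found in the pager href; if that href embeds the article id from the listing URL (which itself contains digits such as "8" and "2"), the joined value is wrong. A sketch that anchors on the trailing "/page/N.shtml" segment instead; the sample href is an assumption about the page layout:

import re

def last_page_from_href(href):
    # Use only the page number in the trailing /page/<n>.shtml segment.
    m = re.search(r"/page/(\d+)\.shtml", href)
    return int(m.group(1)) if m else 1

print(last_page_from_href(
    "/article/index/id/M8i2NDBINDAwNCACAAA/page/37.shtml"))  # 37
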
コード例 #15
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "hubei"
        self.site_name = "湖北省高级人民法院"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {"folderNo": "0401", "pageIndex": "1"}

        url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder"
        log.info("开始抓取==============湖北省高级人民法院")
        log.info("开始抓取==============湖北省高级人民法院,第{}页".format(
            str(form['pageIndex'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                str(form['pageIndex'])))
            # 将对象列表插入数据库
            self.mysql_client.session_insert_list(object_list)
            # 提交
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pageIndex"] = i
                    log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                            str(form['pageIndex'])))
                        # 将对象列表插入数据库
                        self.mysql_client.session_insert_list(object_list)
                        # 提交
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)

                except Exception:
                    # 捕获异常
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
        else:
            SpiderException(
                "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])),
                self.task_id, url, self.site_name)
        # 关闭数据库链接
        self.mysql_client.session_close()
        log.info("抓取湖北省高级人民法院结束")

    def added_parse(self):
        pass

    def parse_html(self, html):
        # 解析html

        # 生成文件路径并写出页面内容
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html))
        doc = pq(html)
        total_page = "".join(
            re.findall("共.*页\s上",
                       doc('span').text().replace("\n", "")))[1:3]
        lis = doc('table.newlisttable tr').items()
        object_list = list()
        for content in lis:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("(\(.*\))",
                           content('td').text()))[1:-1]
            item["title"] = content('a').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "湖北省高级人民法院" if content(
                'p').text()[:4] == "本院定于" else content('p').text()[:4]
            item["court_t"] = "".join(
                re.findall("(在.*判庭)",
                           content('p').text())).replace("在", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "")
            item["plaintiff"] = "".join(
                re.findall("(原告:.*;)",
                           content('p').text())).replace("原告:", "")
            item["defendant"] = "".join(
                re.findall("(被告:.*的)",
                           content('p').text())).replace("被告:",
                                                         "").replace("的", "")
            item["site_name"] = self.site_name
            # 将item字典映射成对象
            b = BulletinCourt(**item)
            object_list.append(b)
        # 返回对象列表和总页数
        return object_list, total_page
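
The total page count above is taken by slicing characters [1:3] out of the "共N页 上" fragment, which only holds for two-digit totals. A sketch that captures the number with a regex group instead; the pager text below is hypothetical:

import re

def total_pages_from_pager(pager_text):
    # pager_text corresponds to doc('span').text() with newlines stripped.
    m = re.search(r"共(\d+)页", pager_text.replace("\n", ""))
    return int(m.group(1)) if m else 0

print(total_pages_from_pager("共123页 上一页 下一页 末页"))  # 123
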
コード例 #16
0
class Spider(MainSpider):

    site_name = '黑龙江法院网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.hljcourt.gov.cn/ktgg/index.php?p={page}&st={start}&et={end}'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取黑龙江法院网第{page}页信息'.format(page='1'))
        ts = datetime.date.today()
        tm = datetime.date.today() + datetime.timedelta(days=365)
        self.http.http_session(self.url.format(page='1',
                                               start=str(ts),
                                               end=str(tm)),
                               'get',
                               headers=self.http.headers)
        self.http.set_charset('gb2312')
        r = self.http.parse_html()
        log.info('解析抓取黑龙江法院网第{page}页信息'.format(page='1'))
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
                log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
            break
        log.info('存储黑龙江法院网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取黑龙江法院网第{page}页信息'.format(page=str(total)))
                self.http.http_session(self.url.format(page=str(total),
                                                       start=str(ts),
                                                       end=str(tm)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析黑龙江法院网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p[
                            'det_url']
                        log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析黑龙江法院网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)

                log.info('存储黑龙江法院网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()

            except Exception:
                m = traceback.format_exc()
                SpiderException(
                    m, self.taskid, self.site_name,
                    self.url.format(end=str(tm),
                                    start=str(ts),
                                    page=str(total)))
        self.mysql_client.session_close()
        log.info('抓取黑龙江法院网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        info_list = list()
        doc = pq(r)
        tb = doc('table tbody').children('tr').children('td')
        k = int(tb.size() / 5)
        for i in range(0, k):
            item = dict()
            title = tb.eq(i * 5 + 1).text()
            court_num = tb.eq(i * 5 + 2).text()
            court_part = tb.eq(i * 5 + 3).text()
            start_court_t = tb.eq(i * 5 + 4).text()
            det_url = tb.eq(i * 5 +
                            1).children('div').children('a').attr('href')
            item['title'] = title
            item['court_num'] = court_num
            item['court_part'] = court_part
            item['start_court_t'] = start_court_t
            item['det_url'] = det_url
            info_list.append(item)
        return info_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        ct = doc('div.ggnr')
        h2 = ct('h2').text()
        h3 = ct('h3').text()
        p = ct('p').text()
        t1 = ct('div.text-01').text()
        t2 = ct('div.text-02').text()
        html = h2 + '\r\n' + h3 + '\r\n' + p + '\r\n' + t1 + '\r\n' + t2
        item['html'] = html
        item['court_y'] = h2

    def page_total(self, res):
        try:
            k = int(
                re.search('共\d*页', res).group().replace('共',
                                                        '').replace('页', ''))
            return k
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
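
parse_list() above walks a flat collection of <td> cells five at a time, reading the title, case number, court room and hearing time from offsets i*5+1 .. i*5+4 (the cell at i*5 is not stored). A small sketch of the same chunking on a plain list; the cell values are hypothetical:

# Flat cell texts in the order the spider sees them (samples are made up).
cells = [
    "1", "某某开庭公告 2020-05-12 09:00", "(2020)黑01民初1号", "第三法庭", "2020-05-12 09:00",
    "2", "某某开庭公告 2020-05-13 14:30", "(2020)黑01民初2号", "第五法庭", "2020-05-13 14:30",
]

rows = [cells[i:i + 5] for i in range(0, len(cells), 5)]
for _seq, title, court_num, court_part, start_court_t in rows:
    print(court_num, court_part, start_court_t)
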
コード例 #17
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "jiangxi"
        self.site_name = "江西庭审公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        date = get_today_date()
        form = {
            'isGeneral': 'Y',
            'belongOrgId': '',
            'liveStatus': '001',
            'page.pageSize': '20',
            'page.pageNo': '1',
            'gopenCourtDate': date + ' 00:00:00',
            'page.orderBy': 'openCourtDate',
            'page.order': 'asc',
            'caseType': '',
            'searchWord': ''
        }

        url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action"
        log.info("开始抓取==============江西庭审公开网")
        log.info("开始抓取==============江西庭审公开网,第{}页".format(
            str(form['page.pageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储==============江西庭审公开网,第{}页".format(
                str(form['page.pageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page.pageNo"] = i
                    log.info("开始抓取==============江西庭审公开网,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储==============江西庭审公开网,第{}页".format(
                            str(form['page.pageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json江西庭审公开网,第{}页异常".format(
                                str(form['page.pageNo'])), self.task_id, url,
                            self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json江西庭审公开网,第{}页异常".format(str(form['page.pageNo'])),
                self.task_id, url, self.site_name)

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # 解析获取到的json
        log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["message"]["result"]
        for case in case_list:

            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = case.get("lastBroadcastTimeString")  # 发布日期
            item["title"] = get_content(case.get("caseName"))  # 标题
            item["court_y"] = get_content(case.get("belongOrgName"))  # 法院
            item["court_t"] = get_content(case.get("openCourtAddr"))  # 法庭
            item["start_court_t"] = get_content(
                case.get("openCourtDateString"))  # 开庭日期
            item["court_num"] = get_content(case.get("caseNo"))  # 案号
            item["case_type"] = get_content(case.get("caseTypeString"))  # 案件类型
            item["court_case"] = get_content(
                case.get("causePlacedOnFile"))  # 案由
            item["trial_cause"] = get_content(
                case.get("underJustice")).strip()  # 审判人员

            try:
                dex = case["litigants"].index("被告:")
                item["plaintiff"] = case["litigants"][:dex].replace(
                    "原告:", "")[:-1]  # 原告
                item["defendant"] = case["litigants"][dex:].replace("被告:",
                                                                    "")  # 被告
            except Exception:
                item["plaintiff"] = ""
                item["defendant"] = case.get("litigants")

            item["site_name"] = self.site_name  # 网站名称
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # 获取总页数
        try:
            total_page = json_data["message"]["totalPages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
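
parse_list() above splits the litigants string on the "被告:" marker and falls back to treating the whole value as the defendant when the marker is absent. A compact sketch of that split; the input strings are hypothetical:

def split_litigants(litigants):
    # Returns (plaintiff, defendant), mirroring the try/except fallback above.
    try:
        dex = litigants.index("被告:")
        plaintiff = litigants[:dex].replace("原告:", "")[:-1]  # drop trailing separator
        defendant = litigants[dex:].replace("被告:", "")
    except ValueError:
        plaintiff, defendant = "", litigants
    return plaintiff, defendant

print(split_litigants("原告:张某;被告:李某"))  # ('张某', '李某')
print(split_litigants("上诉人:王某"))           # ('', '上诉人:王某')
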
コード例 #18
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "guizhou"
        self.site_name = "贵州法院公众服务平台"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""}

        url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx"
        log.info("开始抓取==============贵州法院公众服务平台")
        log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                str(form['page'])))
            # 将对象列表插入数据库
            self.mysql_client.session_insert_list(object_list)
            # 提交
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(
                        str(form['page'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                            str(form['page'])))
                        # 将对象列表插入数据库
                        self.mysql_client.session_insert_list(object_list)
                        # 提交
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # 捕获异常
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
        else:
            SpiderException(
                "抓取贵州法院公众服务平台,第{}页异常".format(str(form['pageIndex'])),
                self.task_id, url, self.site_name)
        # 关闭数据库链接
        self.mysql_client.session_close()
        log.info("抓取贵州法院公众服务平台结束")

    def added_parse(self):
        pass

    def parse_html(self, html):
        # 解析html

        doc = pq(html)
        # 默认总页数为1,找到"末页"链接后再更新
        total_page = "1"
        for page in doc('a').items():
            if page.text() == "末页":
                total_page = "".join(re.findall("\d{1,3}", page.attr.onclick))

        lis = doc('table.tabData a').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # 生成文件路径并写出页面内容
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.print-box')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                           content.text())).replace("发表日期:", "")
            item["title"] = x.attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "").replace("时", ":")
            item["court_num"] = "".join(
                re.findall("(审理.*案件)",
                           content('p').text())).replace("审理",
                                                         "").replace("案件", "")
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name
            # 将item字典映射成对象
            b = BulletinCourt(**item)
            object_list.append(b)
        # 返回对象列表和总页数
        #     break
        return object_list, total_page
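
start_court_t above normalizes "2020年05月12日 09时30" into "2020-05-12 09:30" through chained replace() calls. An equivalent sketch that parses the matched fragment with datetime.strptime and reformats it, assuming the page always uses this exact pattern; the paragraph text is hypothetical:

import datetime
import re

def normalize_hearing_time(text):
    # text is the announcement paragraph, e.g. content('p').text().
    m = re.search(r"\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2}", text)
    if not m:
        return ""
    dt = datetime.datetime.strptime(m.group(), "%Y年%m月%d日 %H时%M")
    return dt.strftime("%Y-%m-%d %H:%M")

print(normalize_hearing_time("本院定于2020年05月12日 09时30在第三法庭依法公开开庭审理"))
# 2020-05-12 09:30
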
コード例 #19
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    def __init__(self):
        self.task_id = "yunan"
        self.site_name = "云南法院司法信息网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):

        form = {"channelId": "858", "listsize": "673", "pagego": "1"}

        url = "http://www.ynfy.gov.cn/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    (form['pagego'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagego'])))
            # 将对象列表插入数据库
            self.mysql_client.session_insert_list(object_list)
            # 提交
            self.mysql_client.session_commit()
            form["listsize"] = total_page
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagego'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagego'])))
                        # 将对象列表插入数据库
                        self.mysql_client.session_insert_list(object_list)
                        # 提交
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagego'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # 捕获异常
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagego'])),
                self.task_id, url, self.site_name)
        # 关闭数据库链接
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        # 解析html

        doc = pq(html)
        total_page = 10
        for page in doc('div.turn_page a.zt_02').items():
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('ul.sswy_news li').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # 生成文件路径并写出页面内容
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.ywzw_con_inner')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}",
                           content('p.p_source ').text()))
            item["title"] = x('a').attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3.h3_title').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           x('a').attr.title))
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name

            # 将item字典映射成对象
            b = BulletinCourt(**item)
            object_list.append(b)
        # 返回对象列表和总页数
        #     break
        return object_list, total_page
コード例 #20
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):

    site_name = '吉林省高级人民法院司法公开网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.jlsfy.gov.cn/ktgg/index.jhtml'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'),
                               'get',
                               headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a.sp_next')
        nurl = 'http://www.jlsfy.gov.cn' + skip.attr('href').replace('&amp;', '&')\
            .replace('pagecur=1', 'pagecur={pageno}')
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = p['det_url']
                log.info('开始抓取吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)),
                                       'get',
                                       headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = p['det_url']
                        log.info('开始抓取吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url,
                                               'get',
                                               headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析吉林省高级人民法院司法公开网第{page},第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)
                log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)

        self.mysql_client.session_close()
        log.info('抓取吉林省高级人民法院司法公开网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        p_list = list()
        doc = pq(r)
        sec = doc('ul.organList').children('li')
        for var in sec.items():
            item = dict()
            det_url = var('a').attr('href')
            title = var('a').attr('title')
            start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
                                      title).group()
            release_date = var('span').text()
            item['det_url'] = det_url
            item['title'] = title
            item['start_court_t'] = start_court_t
            item['release_date'] = release_date
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        con = doc('div.ggnr')
        h2 = con('h2').text()
        h3 = con('h3').text()
        p = con('p').text()
        t1 = con('div.text-01').text()
        t2 = con('div.text-02').text()
        html = h2 + '\r\n' + h3 + '\r\n' + p + '\r\n' + t1 + '\r\n' + t2
        item['html'] = html
        item['court_y'] = h2

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
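
The page-2..N URL above is derived from the a.sp_next link on page 1: the href is HTML-unescaped and its pagecur parameter turned into a format placeholder. A sketch of that templating step; the href value is hypothetical:

# Hypothetical href taken from the a.sp_next element on page 1.
href = "/ktgg/index.jhtml?channelId=123&amp;pagecur=1"

nurl = "http://www.jlsfy.gov.cn" + href.replace("&amp;", "&").replace(
    "pagecur=1", "pagecur={pageno}")

print(nurl.format(pageno=3))
# http://www.jlsfy.gov.cn/ktgg/index.jhtml?channelId=123&pagecur=3
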
コード例 #21
0
class Spider(MainSpider):

    def __init__(self):
        self.task_id = "hainan"
        self.site_name = "天涯法律网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        today_date = get_today_date()
        next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:]

        form = {
            "currentPageNo": "1",
            "pageSize": "10",
            "startDate": today_date,
            "endDate": next_year_today_date,
            "caseNo": "",
            "litigant": "",
            "judge": "",
            "caseDesc": "",
            "siteId": "f7afc746-8577-4cd4-a410-884027df5bab"
        }

        url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["currentPageNo"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['currentPageNo'])
                                                                 ), self.task_id, url, self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉
                break
            self.mysql_client.session_close()
        else:
            SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['page.pageNo'])
                                                     ), self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))


    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        # 解析获取到的json
        log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo'])))
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = get_content(case.get("createDate"))
            item["court_y"] = get_content(case.get("belongOrgName"))  # 法院
            item["court_t"] = get_content(case.get("trialCourt"))  # 法庭
            item["start_court_t"] = get_content(case.get("courtTime"))  # 开庭日期
            item["court_num"] = get_content(case.get("caseNo"))  # 案号
            item["court_case"] = get_content(case.get("caseDesc"))  # 案由
            item["trial_cause"] = get_content(case.get("judge")).strip()  # 审判人员
            item["site_name"] = self.site_name  # 网站名称
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        # 获取总页数
        try:
            total_page = json_data["pages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
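
The one-year search window above is built by bumping the year digits of today's "YYYY-MM-DD" string. A sketch of that arithmetic (assuming get_today_date() returns that format), with a note on the leap-day edge case:

def one_year_window(today):
    # today is "YYYY-MM-DD"; "2020-02-29" maps to "2021-02-29", which is not a
    # real calendar date but is passed to the API as-is by the original code.
    return today, str(int(today[0:4]) + 1) + today[4:]

print(one_year_window("2020-03-01"))  # ('2020-03-01', '2021-03-01')
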