コード例 #1
0
class Spider(MainSpider):
    """Spider for 山西法院诉讼服务网 (Shanxi court litigation service site).

    Posts to the hearing-announcement list endpoint, walks the paginated
    results and stores each announcement as a BulletinCourt row.
    """

    def __init__(self):
        self.task_id = "shanxi"
        self.site_name = "山西法院诉讼服务网"
        MainSpider.__init__(self, task_id=self.task_id)
        # Shared HTTP helper bound to this task -- presumably keeps a
        # requests session; TODO confirm against HttpRequest.
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then every remaining page, persisting the items."""
        # channelId/listsize look like fixed site parameters; pagego is the
        # 1-based page number -- assumption, verify against the site.
        form = {"channelId": "307", "listsize": "238", "pagego": "1"}

        url = "http://www.shanxify.gov.cn/ktggPage.jspx"
        log.info("开始抓取==============山西法院诉讼服务网")
        log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(str(
            form['pagego'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============山西法院诉讼服务网,第{}页".format(
                str(form['pagego'])))
            # Insert the parsed objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the transaction
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info(
                            "开始存储==============抓取山西法院诉讼服务网,第{}页".format(i))
                        # Insert the parsed objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the transaction
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback but keep the spider alive
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled; remove
                # this break before going to production.
                break

        else:
            SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(str(form['pagego'])),
                            self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取山西法院诉讼服务网结束")

    def parse_html(self, html):
        """Parse a list page.

        Fetches each linked detail page, dumps its raw HTML to a file and
        scrapes the bulletin fields via regexes on the page text.
        Returns (list of BulletinCourt, total page count).
        """

        doc = pq(html)
        # Page count is sliced from the tail of the pager link text --
        # assumes at most a 3-digit page total; TODO confirm.
        total_page = int(doc('a.zt_02').text()[-3:])
        lis = doc('div.text ul li a').items()
        # Accumulator for the mapped ORM objects
        object_list = list()
        for x in lis:
            # One item dict per announcement
            item = dict()
            self.http.http_session(x.attr.href, "post", headers=self.headers)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build a unique file path for the raw page dump
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the fetched HTML to the file
            file_out(t_way, str(htm))
            content = doc('div.text')
            item["taskid"] = self.task_id
            # Fixed-offset slice of the <h2> text -- assumes a stable
            # "发布日期:YYYY-MM-DD" style prefix; verify against the site.
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            # Court name: text between 在 and 院 in the headline
            item["court_y"] = "".join(
                re.findall("(在.*院)",
                           content('h1').text())).replace("在", "")
            # Courtroom: text between 院 and 庭, minus the word 开庭
            item["court_t"] = "".join(
                re.findall("(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            # First 16 chars of the link text -- presumably the hearing
            # datetime "YYYY-MM-DD HH:MM"; TODO confirm.
            item["start_court_t"] = x.text()[:16]
            if u"刑事" in item["title"]:
                # Criminal case: only a defendant is extracted
                item["defendant"] = "".join(
                    re.findall("(审理.*)",
                               content('p').text().replace("\xa0\xa0",
                                                           ""))).replace(
                                                               "审理", "")
            else:
                # Civil case: plaintiff between 审理 and 诉, defendant
                # between 诉 and 等
                item["plaintiff"] = "".join(
                    re.findall("(审理.*诉)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("审理", "").replace("诉", "")
                item["defendant"] = "".join(
                    re.findall("(诉.*等)",
                               content('p').text().replace(
                                   "\xa0\xa0",
                                   ""))).replace("诉", "").replace("等", "")
            item['site_name'] = self.site_name
            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
            # Return the object list and the total page count
        return object_list, total_page
コード例 #2
0
class Spider(MainSpider):
    """Spider for 陕西法院网 (Shaanxi court site).

    Walks the paginated article index and stores each announcement as a
    BulletinCourt row.
    """

    def __init__(self):
        self.task_id = "sanxi"
        self.site_name = "陕西法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then every remaining page, persisting the items."""
        page = 1
        # Page number is embedded in the URL path rather than a form field
        url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, page))
        self.http.http_requst(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, page))
            # Insert the parsed objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the transaction
            self.mysql_client.session_commit()
            #
            for i in range(2, int(total_page) + 1):
                try:
                    page = i
                    url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
                        page)
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, page))
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, page))
                        # Insert the parsed objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the transaction
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
            #
                except Exception:
                    # Record the full traceback but keep the spider alive
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled; remove
                # this break before going to production.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page.

        Fetches each linked detail page, dumps the detail block to a file
        and scrapes the bulletin fields.
        Returns (list of BulletinCourt, total page count as str).
        """

        doc = pq(html)
        # Total page count is read from the 6th pager link's href --
        # assumes it points at the last page; TODO confirm.
        page = doc('div.paginationControl a').eq(5).attr.href
        total_page = "".join(re.findall("\d{1,3}", page))
        lis = doc('span.left').items()
        object_list = list()
        for x in lis:
            self.http.http_session("http://sxfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()

            doc = pq(htm)
            content = doc('div.detail')
            # Build a unique file path for the raw page dump
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the detail block to the file
            file_out(t_way, str(content))
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           content('div.sth_a').text()))
            item["title"] = content('div.b_title').text()
            item["bulletin_way"] = t_way
            # All articles on this index belong to the provincial high court
            item["court_y"] = "陕西省高级人民法院"
            # Venue text between 在 and 公开 -- stored in both court_t and
            # court_part below (same extraction)
            item["court_t"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["court_part"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
コード例 #3
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 贵州法院公众服务平台 (Guizhou court public-service platform).

    Posts a search form to the hearing-announcement endpoint, walks the
    paginated result list and stores each announcement as a BulletinCourt
    row.
    """

    def __init__(self):
        self.task_id = "guizhou"
        self.site_name = "贵州法院公众服务平台"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then every remaining page, persisting the items."""
        # fyid/kssj/jssj are court-id and date filters left empty to match
        # everything; page is the 1-based page number.
        form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""}

        url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx"
        log.info("开始抓取==============贵州法院公众服务平台")
        log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                str(form['page'])))
            # Insert the parsed objects into the database, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(
                        str(form['page'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                            str(form['page'])))
                        # Insert the parsed objects into the database,
                        # then commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback but keep the spider alive.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled; remove
                # this break before going to production.
                break
        else:
            # BUG FIX: the form has no 'pageIndex' key -- reading it here
            # raised KeyError and masked the real SpiderException. Use the
            # actual 'page' key.
            SpiderException(
                "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                self.task_id, url, self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取贵州法院公众服务平台结束")

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page.

        Fetches each linked detail page, dumps it to a file and scrapes
        the bulletin fields. Returns (list of BulletinCourt, total page
        count as str).
        """
        doc = pq(html)
        # ROBUSTNESS: default to one page so total_page can never be
        # unbound when the "末页" (last page) link is absent.
        total_page = "1"
        for page in doc('a').items():
            if page.text() == "末页":
                total_page = "".join(re.findall(r"\d{1,3}", page.attr.onclick))

        lis = doc('table.tabData a').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Build a unique file path and dump the raw detail page.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.print-box')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                           content.text())).replace("发表日期:", "")
            item["title"] = x.attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3').text()
            # Venue text between 在 and 依法; stored in both court_t and
            # court_part (same extraction as in the original).
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            # Hearing start: normalize 年/月/日/时 to "-"/":" separators.
            item["start_court_t"] = "".join(
                re.findall(r"(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "").replace("时", ":")
            item["court_num"] = "".join(
                re.findall("(审理.*案件)",
                           content('p').text())).replace("审理",
                                                         "").replace("案件", "")
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
コード例 #4
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 四川法院司法公开网 (Sichuan judicial disclosure site).

    Talks to a JSON API: fetches the announcement list page by page, then
    a detail endpoint per announcement, and stores BulletinCourt rows.
    """

    def __init__(self):
        self.task_id = "sichuan"
        self.site_name = "四川法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl every page of the JSON list endpoint and persist items."""
        # fydm=51 appears to be the Sichuan court code and limit the page
        # size -- TODO confirm against the API.
        form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""}

        url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['page'])))
        self.http.set_charset("unicode")
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['page'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['page'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['page'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    # Record the full traceback but keep crawling the
                    # remaining pages.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)

            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['page'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_list(self, json_data, form):
        """Parse one page of list JSON into BulletinCourt objects.

        Only entries whose title contains "开庭公告" (hearing announcement)
        are kept; a detail request is made for each to fill in the fields.
        """
        log.info("开始解析{}第{}页".format(self.site_name, (form['page'])))

        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            if "开庭公告" in html.unescape(case["ggbt"]):
                item = dict()
                item["release_date"] = case["clsj"]
                formdata = {
                    "ggsdid": "{}".format(str(case['ggsdid'])),
                    "ssfy": "{}".format(str(case['fydm']))
                }
                ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do"
                self.http.http_session(ur,
                                       "post",
                                       data=formdata,
                                       headers=self.headers)
                # IDIOM FIX: renamed the local from `json` so it no longer
                # shadows the stdlib json module name inside this method.
                detail = self.http.parse_json()["data"]
                item["taskid"] = self.task_id
                item["release_date"] = html.unescape(detail.get("CLSJ"))
                item["title"] = html.unescape(detail.get("GGBT"))
                item["court_y"] = get_content(detail.get("SSFYMC"))  # court
                content = html.unescape(detail.get("GGNR"))
                # Dump the announcement body to a uniquely named file.
                t_way = self.task_id + str(time.time()) + '.txt'
                file_out(t_way, str(content))
                item["court_t"] = "".join(re.findall(r"法院.{1,10}庭",
                                                     content)).replace(
                                                         "法院", "")
                item["court_num"] = html.unescape(detail.get("AH"))  # case no.
                item["trial_cause"] = html.unescape(
                    detail.get("CBRXM").strip())  # presiding personnel
                item['bulletin_way'] = t_way
                item["site_name"] = self.site_name
                b = BulletinCourt(**item)
                object_list.append(b)
        return object_list
コード例 #5
0
ファイル: spiderr.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 青海法院网 (Qinghai court site).

    Posts to the court-announcement index, walks the paginated list and
    stores hearing ("开庭") announcements published after 2018-01-01.
    """

    def __init__(self):
        self.task_id = "qinghai"
        self.site_name = "青海法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers
        # The site serves GBK-encoded pages.
        self.http.set_charset("gbk")

    def parse(self):
        """Crawl page 1, then every remaining page, persisting the items."""
        # p is the 1-based page number; LocationID presumably selects the
        # court branch -- TODO confirm.
        form = {
            "p": "1",
            "LocationID": "0700000000",
            "sub": ""
        }

        url = "http://qhfy.chinacourt.org/fygg/index.php"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
            # Insert the parsed objects into the database, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page)+1):
                try:
                    form["p"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取{},第{}页异常".format(self.site_name, (
                            form['p'])), self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback but keep the spider alive.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled; remove
                # this break before going to production.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page.

        Fetches each hearing-related detail page and scrapes the bulletin
        fields; only items dated after 2018-01-01 are kept (and only those
        get their raw HTML dumped to disk, matching the original flow).
        Returns (list of BulletinCourt, total page count as str).
        """
        doc = pq(html)
        # Total page count from the "共 N 页" text in the pager bar.
        total_page = "".join(re.findall(r"共\s.*\s页", doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            if "开庭" in x.text():
                self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(re.findall(r"\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text()))
                item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # BUG FIX: the original compared dates via eval() on scraped
                # text -- a code-injection risk that also crashed on an empty
                # date. Use a plain int comparison, guarded against a
                # missing release_date.
                if item["release_date"] and int(item["release_date"].replace("-", "")) > 20180101:
                    file_out(t_way, str(htm))
                    # Map the item dict onto the ORM object.
                    b = BulletinCourt(**item)
                    object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
コード例 #6
0
class Spider(MainSpider):
    """Spider for 湖南法院网 (Hunan court site).

    Walks the paginated article index and stores each hearing announcement
    as a BulletinCourt row.
    """

    def __init__(self):
        self.task_id = "hunan"
        self.site_name = "湖南法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then every remaining page, persisting the items."""
        page = 1
        # Page number is embedded in the URL path.
        url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取==============湖南法院网")
        log.info("开始抓取==============湖南法院网,第{}页".format(page))
        self.http.http_session(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            # BUG FIX: the storage log messages wrongly named the Shanxi
            # site (copy-paste); they now name this spider's own site.
            log.info("开始存储==============湖南法院网,第{}页".format(page))
            # Insert the parsed objects into the database, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                page = i
                try:
                    log.info("开始抓取==============湖南法院网,第{}页".format(page))
                    url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format(
                        page)
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info(
                            "开始存储==============湖南法院网,第{}页".format(page))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖南法院网,第{}页异常".format(page),
                                        self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback but keep the spider alive.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled; remove
                # this break before going to production.
                break

        else:
            SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url,
                            self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取湖南法院网结束")

    def parse_html(self, html):
        """Parse a list page.

        Fetches each linked detail page, dumps its raw HTML to a file and
        scrapes the bulletin fields. Returns (list of BulletinCourt, total
        page count as str).
        """
        doc = pq(html)
        # Read the total page count from the "尾页" (last page) link.
        page_lis = doc('a').items()
        for pag in page_lis:
            if pag.text() == "尾页":
                total_page = "".join(re.findall(r"(\d*.shtml)",
                                                pag.attr.href)).replace(
                                                    ".shtml", "")
        lis = doc('div.font14 li').items()
        # Accumulator for the mapped ORM objects.
        object_list = list()
        for x in lis:
            item = dict()
            item["release_date"] = x('span.right').text()
            self.http.http_session("http://hunanfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build a unique file path and dump the raw detail page.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.detail')
            item["taskid"] = self.task_id
            item["title"] = content('div.detail_bigtitle').text()
            item["court_y"] = "".join(
                re.findall("在.*法院",
                           content('div.detail_txt').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("刑.*庭",
                           content('div.detail_txt').text()))
            # Hearing start: normalize 年/月/日 to "-" separators.
            item["start_court_t"] = "".join(
                re.findall(r"本院定于\d{4}年.{1,5}日",
                           content('div.detail_txt').text())).replace(
                               "年", "-").replace("月",
                                                 "-").replace("日", "").replace(
                                                     "本院定于", "")
            item["court_num"] = "".join(
                re.findall("审理.*号",
                           content('div.detail_txt').text())).replace(
                               "审理", "")
            item["trial_cause"] = "".join(
                re.findall(r"合议庭成员.*\s",
                           content('div.detail_txt').text())).replace(
                               "合议庭成员:", "").replace("\n", "")
            item["court_part"] = "".join(
                re.findall("在.*法院",
                           content('div.detail_txt').text())).replace("在", "")
            item['site_name'] = self.site_name

            # Map the item dict onto the ORM object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count.
        return object_list, total_page
コード例 #7
0
class Spider(MainSpider):
    """Spider for 广东法院网 (Guangdong court site).

    Posts one search request for hearing announcements and stores every
    row of the result table; this endpoint is fetched as a single page.
    """

    def __init__(self):
        self.task_id = "guangdong"
        self.site_name = "广东法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the announcement table and persist the parsed rows."""
        # gglx=ktgg selects hearing announcements; flag=first appears to
        # request the first result page -- TODO confirm.
        form = {
            "action": "gotoggxxcx",
            "gglx": "ktgg",
            "flag": "first"
        }

        url = "http://www.gdcourts.gov.cn/web/search"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['flag'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['flag'])))
            # Insert the parsed objects into the database, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
        else:
            # BUG FIX: the form has no 'pagecur' key -- reading it here
            # raised KeyError and masked the real SpiderException. Use the
            # actual 'flag' key.
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['flag'])), self.task_id, url, self.site_name)

        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse the result table; return a list of BulletinCourt objects."""
        # Dump the raw page to a uniquely named file.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html.encode("utf8")))

        doc = pq(html)
        rows = list(doc('div.doclist tr').items())
        object_list = list()
        # Skip the header row, then read the six data cells of each row.
        for row in rows[1:]:
            cells = [cell.text() for cell in row('td').items()]
            item = dict()
            item["taskid"] = self.task_id
            item["bulletin_way"] = t_way
            item["court_num"] = cells[0]
            item["court_pur"] = cells[1]
            item["court_part"] = cells[2]
            item["start_court_t"] = cells[3]
            item["court_end_t"] = cells[4]
            item["court_status"] = cells[5]
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list.
        return object_list
コード例 #8
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for the Jiangxi open-trial portal (江西庭审公开网), a JSON API."""

    def __init__(self):
        self.task_id = "jiangxi"
        self.site_name = "江西庭审公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the paginated hearing list from today onward and store each page."""
        date = get_today_date()
        form = {
            'isGeneral': 'Y',
            'belongOrgId': '',
            'liveStatus': '001',
            'page.pageSize': '20',
            'page.pageNo': '1',
            'gopenCourtDate': date + ' 00:00:00',
            'page.orderBy': 'openCourtDate',
            'page.order': 'asc',
            'caseType': '',
            'searchWord': ''
        }

        url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action"
        log.info("开始抓取==============江西庭审公开网")
        log.info("开始抓取==============江西庭审公开网,第{}页".format(
            str(form['page.pageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储==============江西庭审公开网,第{}页".format(
                str(form['page.pageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["page.pageNo"] = i
                    log.info("开始抓取==============江西庭审公开网,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储==============江西庭审公开网,第{}页".format(
                            str(form['page.pageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json江西庭审公开网,第{}页异常".format(
                                str(form['page.pageNo'])), self.task_id, url,
                            self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before production deployment.
                break
        else:
            SpiderException(
                "抓取json江西庭审公开网,第{}页异常".format(str(form['page.pageNo'])),
                self.task_id, url, self.site_name)
        # Always release the DB session — previously it was closed only on
        # the success branch, leaking it when the first request failed.
        self.mysql_client.session_close()

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_list(self, json_data, form):
        """Map one page of API JSON to a list of BulletinCourt objects."""
        log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo'])))
        # Snapshot the raw JSON so each stored record can be traced back.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["message"]["result"]
        for case in case_list:

            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = case.get("lastBroadcastTimeString")  # publish date
            item["title"] = get_content(case.get("caseName"))  # title
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("openCourtAddr"))  # courtroom
            item["start_court_t"] = get_content(
                case.get("openCourtDateString"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["case_type"] = get_content(case.get("caseTypeString"))  # case type
            item["court_case"] = get_content(
                case.get("causePlacedOnFile"))  # cause of action
            item["trial_cause"] = get_content(
                case.get("underJustice")).strip()  # presiding judges

            try:
                # "litigants" is formatted "原告:...被告:..."; split on the
                # defendant marker.
                dex = case["litigants"].index("被告:")
                item["plaintiff"] = case["litigants"][:dex].replace(
                    "原告:", "")[:-1]
                item["defendant"] = case["litigants"][dex:].replace("被告:",
                                                                    "")
            except (KeyError, ValueError, AttributeError, TypeError):
                # Marker absent, or litigants missing/None: keep the raw
                # value. (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                item["plaintiff"] = ""
                item["defendant"] = case.get("litigants")

            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the page count reported by the API, or 0 on any error."""
        try:
            total_page = json_data["message"]["totalPages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
コード例 #9
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 云南法院司法信息网 (Yunnan court judicial information site)."""

    def __init__(self):
        self.task_id = "yunan"
        self.site_name = "云南法院司法信息网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the paginated hearing announcements and persist each page."""

        form = {"channelId": "858", "listsize": "673", "pagego": "1"}

        url = "http://www.ynfy.gov.cn/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    (form['pagego'])))
        # NOTE(review): 'http_requst' (sic) — other spiders in this file call
        # 'http_session'; confirm this method exists on HttpRequest.
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagego'])))
            # Insert the object list into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit
            self.mysql_client.session_commit()
            form["listsize"] = total_page
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagego'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagego'])))
                        # Insert the object list into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagego'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the failure without aborting the page loop
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before going live
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagego'])),
                self.task_id, url, self.site_name)
        # Close the database connection
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_html(self, html):
        """Parse one listing page, following each announcement's detail link.

        Returns (object_list, total_page).
        """

        doc = pq(html)
        # Page count defaults to 10; pager links may show a larger number.
        total_page = 10
        for page in doc('div.turn_page a.zt_02').items():
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('ul.sswy_news li').items()
        object_list = list()
        for x in lis:
            # Fetch the announcement detail page
            self.http.http_session(x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build the snapshot file path
            t_way = self.task_id + str(time.time()) + '.txt'
            # Write the raw detail page to the snapshot file
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.ywzw_con_inner')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}",
                           content('p.p_source ').text()))
            item["title"] = x('a').attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3.h3_title').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           x('a').attr.title))
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name

            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the object list and the total page count
        return object_list, total_page
コード例 #10
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 湖北省高级人民法院 (Hubei Higher People's Court) hearing notices."""

    def __init__(self):
        self.task_id = "hubei"
        self.site_name = "湖北省高级人民法院"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the paginated notice list and persist every page."""
        form = {"folderNo": "0401", "pageIndex": "1"}

        url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder"
        log.info("开始抓取==============湖北省高级人民法院")
        log.info("开始抓取==============湖北省高级人民法院,第{}页".format(
            str(form['pageIndex'])))
        # NOTE(review): 'http_requst' (sic) — other spiders in this file call
        # 'http_session'; confirm this method exists on HttpRequest.
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                str(form['pageIndex'])))
            # Insert the parsed objects, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pageIndex"] = i
                    log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                            str(form['pageIndex'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)

                except Exception:
                    # Record the failure without aborting the page loop.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before production deployment.
                break
        else:
            SpiderException(
                "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])),
                self.task_id, url, self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取湖北省高级人民法院结束")

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_html(self, html):
        """Parse one listing page.

        Returns (object_list, total_page) — the BulletinCourt objects on the
        page and the total page count as a digit string.
        """
        # Snapshot the raw page so each stored record can be traced back.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html))
        doc = pq(html)
        # Pull the page count digits out of text like "共24页 上一页".
        # The previous '共.*页\s上' match sliced [1:3], which broke on
        # 1-digit counts ("5页" -> int() ValueError in parse) and truncated
        # 3-digit counts ("243" -> "24").
        total_page = "".join(
            re.findall(r"共(\d+)页",
                       doc('span').text().replace("\n", "")))
        lis = doc('table.newlisttable tr').items()
        object_list = list()
        for content in lis:
            item = dict()
            item["taskid"] = self.task_id
            # The release date appears in parentheses inside the first cell.
            item["release_date"] = "".join(
                re.findall("(\(.*\))",
                           content('td').text()))[1:-1]
            item["title"] = content('a').text()
            item["bulletin_way"] = t_way
            # "本院定于..." means the notice was issued by this (Hubei) court.
            item["court_y"] = "湖北省高级人民法院" if content(
                'p').text()[:4] == "本院定于" else content('p').text()[:4]
            item["court_t"] = "".join(
                re.findall("(在.*判庭)",
                           content('p').text())).replace("在", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "")
            item["plaintiff"] = "".join(
                re.findall("(原告:.*;)",
                           content('p').text())).replace("原告:", "")
            item["defendant"] = "".join(
                re.findall("(被告:.*的)",
                           content('p').text())).replace("被告:",
                                                         "").replace("的", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the objects and the total page count.
        return object_list, total_page
コード例 #11
0
class Spider(MainSpider):
    """Spider for 甘肃省高级人民法院司法公开网 (Gansu judicial openness site)."""

    def __init__(self):
        self.task_id = "gansu"
        self.site_name = "甘肃省高级人民法院司法公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the hearing-announcement pages and persist future hearings."""
        form = {
            "channelId": "307",
            "listsize": "100",
            "pagecur": "0",
            "pagego": "add"
        }

        url = "http://gsgf.gssfgk.com/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    (form['pagecur'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagecur'])))
            # Insert the parsed objects, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            form["listsize"] = total_page
            for i in range(1, int(total_page) + 1):
                try:
                    form["pagecur"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagecur'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagecur'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagecur'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the failure without aborting the page loop.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before production deployment.
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])),
                self.task_id, url, self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_html(self, html):
        """Parse one listing page, following each announcement's detail link.

        Returns (object_list, total_page). Only hearings dated after today
        are kept.
        """
        doc = pq(html)
        page_list = doc('a.zt_02').items()
        # Page count defaults to 10; pager links may show a larger number.
        total_page = 10
        for page in page_list:
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('div.text ul li a').items()
        object_list = list()
        for x in lis:
            item = dict()
            # Fetch the announcement detail page.
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Snapshot the raw detail page for traceability.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.text')
            item["taskid"] = self.task_id
            item["release_date"] = content('h2').text()[3:13]
            item["title"] = content('h1').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "".join(
                re.findall("(在.*法院)",
                           content('h1').text())).replace("在", "")
            item["court_t"] = "".join(
                re.findall("(院.*庭)",
                           content('h1').text())).replace("院", "").replace(
                               "开庭", "")
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text()))
            item["plaintiff"] = "".join(
                re.findall("(审理.*诉)",
                           content("p").text())).replace("审理",
                                                         "").replace("诉", "")
            item["site_name"] = self.site_name
            date = get_today_date()
            # Compare dates numerically as YYYYMMDD. int() replaces the old
            # eval() call — scraped text must never be evaluated as code.
            page_date = "".join(re.findall("\d{4}-\d{2}-\d{2}",
                                           x.text())).replace("-", "")
            if int(page_date) > int(date):
                # NOTE(review): the detail page was already written to t_way
                # above; this second file_out looks redundant — confirm
                # whether file_out overwrites or appends.
                file_out(t_way, str(htm))

                # Map the item dict onto the ORM object.
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the objects and the total page count.
        return object_list, total_page
コード例 #12
0
ファイル: spider.py プロジェクト: cnb2cd/Spider_app
class Spider(MainSpider):
    """Spider for 浙江法院公开网 (Zhejiang court openness site), a JSON API."""

    def __init__(self):
        self.task_id = "zhejiang"
        self.site_name = "浙江法院公开网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Fetch the paginated hearing search results and persist each page."""
        form = {
            "pageno": "1",
            "pagesize": "10",
            "cbfy": "全部",
            "dsr": "",
            "spz": "",
            "jarq1": "",
            "jarq2": ""
        }

        url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    str(form['pageno'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, str(form['pageno'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["pageno"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, str(form['pageno'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取json{},第{}页异常".format(self.site_name,
                                                     str(form['pageno'])),
                            self.task_id, url, self.site_name)

                except Exception:
                    # Record the failure without aborting the page loop.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before production deployment.
                break
            self.mysql_client.session_close()
        else:
            SpiderException(
                "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])),
                self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_list(self, json_data, form):
        """Map one page of API JSON to a list of BulletinCourt objects."""
        log.info("开始解析{}第{}页".format(self.site_name, (form['pageno'])))
        # Snapshot the raw JSON so each stored record can be traced back.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["list"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["court_y"] = get_content(case.get("FY"))  # court
            item["court_t"] = get_content(case.get("FT"))  # courtroom
            item["start_court_t"] = get_content(case.get("KTRQSTRING"))  # hearing date
            item["court_num"] = get_content(case.get("AH"))  # case number
            item["court_case"] = get_content(case.get("AY"))  # cause of action
            item["trial_cause"] = get_content(case.get("SPZ")).strip()  # judges
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            item["undertake_dep"] = get_content(case.get("CBBM"))
            item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "")
            item["defendant"] = get_content(case.get("BG")).replace("被告:", "")
            item["schedule_time"] = get_content(case.get("PQRQ"))
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the number of result pages (page size 10), or 0 on error."""
        try:
            total_page = json_data["total"]
            # Round up: plain floor division dropped the final partial page
            # (e.g. 95 records is 10 pages, not 9).
            return (int(total_page) + 9) // 10
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0
コード例 #13
0
class Spider(MainSpider):
    """Spider for 福建省高级人民法院法院公告 (Fujian HPC court announcements)."""

    def __init__(self):
        self.task_id = "fujian"
        self.site_name = "福建省高级人民法院法院公告"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the ASP.NET-paginated report list and persist each page."""
        url = "https://www.fjcourt.gov.cn/page/public/courtreport.html"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, 1))
        # NOTE(review): 'http_requst' (sic) — other spiders in this file call
        # 'http_session'; confirm this method exists on HttpRequest.
        self.http.http_requst(url, "get", headers=self.headers, verify=False)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page, VIEWSTATE = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, 1))
            # Insert the parsed objects, then commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()

            for i in range(2, int(total_page) + 1):
                # ASP.NET postback: replay __VIEWSTATE and request page i.
                form = {
                    "__VIEWSTATE": VIEWSTATE,
                    "__VIEWSTATEGENERATOR": "54969BDC",
                    "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
                }
                try:
                    form["__EVENTARGUMENT"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['__EVENTARGUMENT'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page, VIEWSTATE = self.parse_html(
                            html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['__EVENTARGUMENT'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['__EVENTARGUMENT'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the failure without aborting the page loop.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first pages are crawled; remove this
                # break before production deployment.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, 1),
                            self.task_id, url, self.site_name)
        # Close the database connection.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """No incremental parsing for this site."""
        pass

    def parse_html(self, html):
        """Parse one listing page, following each announcement's detail link.

        Returns (object_list, total_page, VIEWSTATE), where VIEWSTATE is the
        ASP.NET state token required for the next pager postback.
        """
        doc = pq(html)
        total_page = 10
        # The ">>" pager link's href carries the last page number.
        for page in doc('a.pagination').items():
            if page.text() == ">>":
                total_page = int("".join(re.findall("\d{2,3}",
                                                    page.attr.href)))
        VIEWSTATE = doc("div.aspNetHidden input").attr.value
        lis = doc('ul.module-case-items li').items()
        object_list = list()
        for x in lis:
            self.http.http_session("https://www.fjcourt.gov.cn" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers,
                                   verify=False)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Snapshot the raw detail page for traceability.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            content = doc('div.article-wrap')
            item = dict()
            item["taskid"] = self.task_id
            item["title"] = content('p.article-hd-title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = content('span.article-author').text()
            item["court_t"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["start_court_t"] = x('span.cir-time').text().replace(
                "[", "").replace("]", "")
            item["court_part"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["site_name"] = self.site_name
            pub_time = (item["start_court_t"].replace("-", ""))
            date = get_today_date()
            # Keep only future hearings. Compare numerically with int();
            # the old eval() executed scraped text as Python code.
            if int(pub_time) > int(date):
                # Map the item dict onto the ORM object.
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the objects, the page count, and the viewstate token.
        return object_list, total_page, VIEWSTATE
コード例 #14
0
class Spider(MainSpider):
    """Spider for 天涯法律网 (Hainan High Court portal).

    Posts a date-window query to the court's JSON notice endpoint,
    walks the paginated result set, maps each notice onto a
    ``BulletinCourt`` row and persists the rows through the shared
    MySQL session inherited from ``MainSpider``.
    """

    def __init__(self):
        self.task_id = "hainan"
        self.site_name = "天涯法律网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the notice list page by page and store the results.

        Fetches page 1 to discover the total page count, then iterates
        the remaining pages. Each page's rows are inserted and committed
        before the next request.
        """
        today_date = get_today_date()
        # Query window: from today to the same calendar date next year.
        next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:]

        form = {
            "currentPageNo": "1",
            "pageSize": "10",
            "startDate": today_date,
            "endDate": next_year_today_date,
            "caseNo": "",
            "litigant": "",
            "judge": "",
            "caseDesc": "",
            "siteId": "f7afc746-8577-4cd4-a410-884027df5bab"
        }

        url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
        self.http.http_session(url, "post", data=form, headers=self.headers)

        if self.http.res_code() == 200:
            json_data = self.http.parse_json()
            object_list = self.parse_list(json_data, form)
            log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            total_page = self.get_total_page(json_data)
            for i in range(2, total_page + 1):
                try:
                    form["currentPageNo"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, i))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        json_data = self.http.parse_json()
                        object_list = self.parse_list(json_data, form)
                        log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo'])))
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        # NOTE(review): the exception object is constructed but
                        # never raised — this matches the error-reporting style
                        # used by the other spiders in this file.
                        SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['currentPageNo'])
                                                                 ), self.task_id, url, self.site_name)

                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going live.
                break
            self.mysql_client.session_close()
        else:
            # BUG FIX: the original read form['page.pageNo'], a key that does
            # not exist in this spider's form (the key is 'currentPageNo'),
            # so a failed first request raised KeyError here instead of
            # recording the SpiderException.
            SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['currentPageNo'])
                                                     ), self.task_id, url, self.site_name)
        log.info("抓取{}结束".format(self.site_name))


    def added_parse(self):
        # No incremental-crawl step for this site.
        pass

    def parse_list(self, json_data, form):
        """Map one page of the notice JSON onto BulletinCourt objects.

        The raw JSON is also dumped to a timestamped .txt file whose
        path is stored on each row as ``bulletin_way``.

        :param json_data: decoded JSON payload; rows live under "data".
        :param form: the request form, read only for the page-number log.
        :return: list of BulletinCourt instances.
        """
        log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo'])))
        # Archive the raw payload; the file path is persisted with each row.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(json_data))
        object_list = list()
        case_list = json_data["data"]
        for case in case_list:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = get_content(case.get("createDate"))
            item["court_y"] = get_content(case.get("belongOrgName"))  # court
            item["court_t"] = get_content(case.get("trialCourt"))  # courtroom
            item["start_court_t"] = get_content(case.get("courtTime"))  # hearing date
            item["court_num"] = get_content(case.get("caseNo"))  # case number
            item["court_case"] = get_content(case.get("caseDesc"))  # cause of action
            item["trial_cause"] = get_content(case.get("judge")).strip()  # judges
            item["site_name"] = self.site_name  # site name
            item['bulletin_way'] = t_way
            b = BulletinCourt(**item)
            object_list.append(b)
        return object_list

    def get_total_page(self, json_data):
        """Return the total page count from the JSON, or 0 on failure.

        A malformed payload is reported via SpiderException rather than
        propagated, so the caller's loop simply does not run.
        """
        try:
            total_page = json_data["pages"]
            return int(total_page)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.task_id, self.site_name, json_data)
            return 0