# Code example #1 -- file: spider.py (project: cnb2cd/Spider_app)
class Spider(MainSpider):
    """Spider for 贵州法院公众服务平台 (Guizhou court public-service
    platform) court-session announcements.
    """

    def __init__(self):
        self.task_id = "guizhou"
        self.site_name = "贵州法院公众服务平台"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the announcement search results page by page.

        Page 1 is requested first to learn the total page count; the
        remaining pages are fetched on the same HTTP session.  Parsed
        rows are inserted and committed after every page.
        """
        form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""}

        url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx"
        log.info("开始抓取==============贵州法院公众服务平台")
        log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                str(form['page'])))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(
                        str(form['page'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                            str(form['page'])))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            # BUG FIX: the form dict has no 'pageIndex' key, so the original
            # message raised KeyError here; use the actual 'page' key.
            SpiderException(
                "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取贵州法院公众服务平台结束")

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a result page; return (object_list, total_page).

        Follows every announcement link on the page, saves the raw detail
        HTML to a text file, and maps the extracted fields into
        BulletinCourt objects.
        """
        doc = pq(html)
        # BUG FIX: total_page was unbound when no "末页" (last page) link
        # exists (e.g. a single-page result); default to one page.
        total_page = "1"
        for page in doc('a').items():
            if page.text() == "末页":
                # The "last page" link's onclick carries the page count.
                total_page = "".join(re.findall(r"\d{1,3}", page.attr.onclick))

        lis = doc('table.tabData a').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Build a unique file path and dump the raw detail page
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.print-box')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                           content.text())).replace("发表日期:", "")
            item["title"] = x.attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "").replace("时", ":")
            item["court_num"] = "".join(
                re.findall("(审理.*案件)",
                           content('p').text())).replace("审理",
                                                         "").replace("案件", "")
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the mapped objects and the total page count
        return object_list, total_page
# Code example #2
class Spider(MainSpider):
    """Spider for 陕西法院网 (Shaanxi court website) court-session
    announcements.
    """

    def __init__(self):
        self.task_id = "sanxi"
        self.site_name = "陕西法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the announcement list page by page and store parsed rows.

        Page 1 is fetched first to discover the total page count; the
        remaining pages are fetched with the same HTTP session.
        """
        page = 1
        url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, page))
        self.http.http_requst(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, page))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()
            # Crawl the remaining pages (2..total_page)
            for i in range(2, int(total_page) + 1):
                try:
                    page = i
                    url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
                        page)
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, page))
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, page))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a list page; return (object_list, total_page).

        Follows each announcement link, maps the extracted fields to
        BulletinCourt objects, and dumps each detail page to a text file.
        """
        doc = pq(html)
        # The 6th pagination link is taken to be the "last page" link;
        # its href carries the final page number.
        page = doc('div.paginationControl a').eq(5).attr.href
        # NOTE(review): this joins every digit run found in the href --
        # assumes the page number is the only digits present; verify
        # against the live markup (the article id also appears in hrefs).
        total_page = "".join(re.findall("\d{1,3}", page))
        lis = doc('span.left').items()
        object_list = list()
        for x in lis:
            self.http.http_session("http://sxfy.chinacourt.org" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()

            doc = pq(htm)
            content = doc('div.detail')
            # Build a unique file path and dump the raw detail content
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(content))
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           content('div.sth_a').text()))
            item["title"] = content('div.b_title').text()
            item["bulletin_way"] = t_way
            # This spider only covers the provincial high court's own feed
            item["court_y"] = "陕西省高级人民法院"
            item["court_t"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["court_part"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                               "公开", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the mapped objects and the total page count
        return object_list, total_page
# Code example #3 -- file: spiderr.py (project: cnb2cd/Spider_app)
class Spider(MainSpider):
    """Spider for 青海法院网 (Qinghai court website) court-session
    announcements.
    """

    def __init__(self):
        self.task_id = "qinghai"
        self.site_name = "青海法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers
        # The site serves GBK-encoded pages
        self.http.set_charset("gbk")

    def parse(self):
        """Crawl the announcement list page by page and store parsed rows.

        Page 1 is fetched first to learn the total page count; remaining
        pages reuse the same HTTP session.
        """
        form = {
            "p": "1",
            "LocationID": "0700000000",
            "sub": ""
        }

        url = "http://qhfy.chinacourt.org/fygg/index.php"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()
            for i in range(2, int(total_page)+1):
                try:
                    form["p"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
                    self.http.http_session(url, "post", data=form, headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(self.site_name, (form['p'])))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取{},第{}页异常".format(self.site_name, (
                            form['p'])), self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a list page; return (object_list, total_page).

        Only rows whose text mentions "开庭" (court session) are followed,
        and only announcements released after 2018-01-01 are kept.
        """
        doc = pq(html)
        # Page count appears in the pager bar as "共 N 页"
        total_page = "".join(re.findall("共\s.*\s页", doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            if "开庭" in x.text():
                self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(re.findall("\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text()))
                item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                          ).replace("在", "").replace("公开", "").replace("依法", "")
                item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text())
                                             ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # SECURITY/BUG FIX: the original compared eval(...) results;
                # eval() on scraped text can execute arbitrary code and
                # crashed on an empty match.  Compare plain integers instead
                # and skip records without a parsable release date.
                if int(item["release_date"].replace("-", "") or 0) > 20180101:
                    file_out(t_way, str(htm))
                    # Map the item dict onto the ORM object
                    b = BulletinCourt(**item)
                    object_list.append(b)
        # Return the mapped objects and the total page count
        return object_list, total_page
# Code example #4 -- file: spider.py (project: cnb2cd/Spider_app)
class Spider(MainSpider):
    """Spider for 云南法院司法信息网 (Yunnan court judicial information
    network) court-session announcements.
    """

    def __init__(self):
        self.task_id = "yunan"
        self.site_name = "云南法院司法信息网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the announcement pages and store parsed rows.

        Page 1 is fetched first to discover the total page count, then the
        remaining pages are fetched with the same HTTP session.
        """

        form = {"channelId": "858", "listsize": "673", "pagego": "1"}

        url = "http://www.ynfy.gov.cn/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                    (form['pagego'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagego'])))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()
            # NOTE(review): presumably the server expects the discovered
            # page count in 'listsize' -- confirm against the site's form.
            form["listsize"] = total_page
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagego'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagego'])))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagego'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagego'])),
                self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a list page; return (object_list, total_page).

        Follows each announcement link, dumps the raw detail HTML to a
        text file, and maps the extracted fields to BulletinCourt objects.
        """

        doc = pq(html)
        # Default when the pager shows fewer page links; otherwise take
        # the largest page number visible in the pager.
        total_page = 10
        for page in doc('div.turn_page a.zt_02').items():
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('ul.sswy_news li').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x('a').attr.href,
                                   "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Build a unique file path and dump the raw detail page
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.ywzw_con_inner')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}",
                           content('p.p_source ').text()))
            item["title"] = x('a').attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3.h3_title').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            # The link's title attribute carries the session start time
            item["start_court_t"] = "".join(
                re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           x('a').attr.title))
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在",
                                                         "").replace("依法", "")
            item["site_name"] = self.site_name

            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the mapped objects and the total page count
        return object_list, total_page
# Code example #5 -- file: spider.py (project: cnb2cd/Spider_app)
class Spider(MainSpider):
    """Spider for 湖北省高级人民法院 (Hubei Higher People's Court)
    court-session announcements.
    """

    def __init__(self):
        self.task_id = "hubei"
        self.site_name = "湖北省高级人民法院"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the announcement folder page by page and store parsed rows."""

        form = {"folderNo": "0401", "pageIndex": "1"}

        url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder"
        log.info("开始抓取==============湖北省高级人民法院")
        log.info("开始抓取==============湖北省高级人民法院,第{}页".format(
            str(form['pageIndex'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                str(form['pageIndex'])))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pageIndex"] = i
                    log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                            str(form['pageIndex'])))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)

                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            SpiderException(
                "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])),
                self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取湖北省高级人民法院结束")

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a list page; return (object_list, total_page).

        Unlike the other spiders, the whole list page (not each detail
        page) is dumped to one text file, and all fields are scraped from
        the list rows themselves.
        """

        # Build a unique file path and dump the raw list page
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html))
        doc = pq(html)
        # NOTE(review): slicing [1:3] assumes the page count inside "共..页"
        # is exactly two digits -- breaks for 1- or 3-digit counts; confirm.
        total_page = "".join(
            re.findall("共.*页\s上",
                       doc('span').text().replace("\n", "")))[1:3]
        lis = doc('table.newlisttable tr').items()
        object_list = list()
        for content in lis:
            item = dict()
            item["taskid"] = self.task_id
            # Release date appears in parentheses within the row cells
            item["release_date"] = "".join(
                re.findall("(\(.*\))",
                           content('td').text()))[1:-1]
            item["title"] = content('a').text()
            item["bulletin_way"] = t_way
            # Rows beginning "本院定于" are the high court's own announcements
            item["court_y"] = "湖北省高级人民法院" if content(
                'p').text()[:4] == "本院定于" else content('p').text()[:4]
            item["court_t"] = "".join(
                re.findall("(在.*判庭)",
                           content('p').text())).replace("在", "")
            item["start_court_t"] = "".join(
                re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                           content('p').text())).replace("年", "-").replace(
                               "月", "-").replace("日", "")
            item["plaintiff"] = "".join(
                re.findall("(原告:.*;)",
                           content('p').text())).replace("原告:", "")
            item["defendant"] = "".join(
                re.findall("(被告:.*的)",
                           content('p').text())).replace("被告:",
                                                         "").replace("的", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object
            b = BulletinCourt(**item)
            object_list.append(b)
        # Return the mapped objects and the total page count
        return object_list, total_page
# Code example #6
class Spider(MainSpider):
    """Spider for 福建省高级人民法院 (Fujian Higher People's Court) court
    announcements.
    """

    def __init__(self):
        self.task_id = "fujian"
        self.site_name = "福建省高级人民法院法院公告"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl the announcement pages via ASP.NET postbacks.

        Page 1 is a plain GET; later pages are requested by posting the
        __VIEWSTATE captured from the previous response together with the
        pager event target and page-number argument.
        """
        url = "https://www.fjcourt.gov.cn/page/public/courtreport.html"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, 1))
        # verify=False: the site's TLS certificate is not trusted
        self.http.http_requst(url, "get", headers=self.headers, verify=False)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page, VIEWSTATE = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, 1))
            # Insert the mapped objects into the database
            self.mysql_client.session_insert_list(object_list)
            # Commit the session
            self.mysql_client.session_commit()

            for i in range(2, int(total_page) + 1):
                # ASP.NET postback form for the pager control
                form = {
                    "__VIEWSTATE": VIEWSTATE,
                    "__VIEWSTATEGENERATOR": "54969BDC",
                    "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
                }
                try:
                    form["__EVENTARGUMENT"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['__EVENTARGUMENT'])))
                    self.http.http_session(url,
                                           "post",
                                           data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page, VIEWSTATE = self.parse_html(
                            html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['__EVENTARGUMENT'])))
                        # Insert the mapped objects into the database
                        self.mysql_client.session_insert_list(object_list)
                        # Commit the session
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['__EVENTARGUMENT'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the full traceback in the spider exception report
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # NOTE: test mode -- only the first two pages are crawled;
                # remove this break before going live
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, 1),
                            self.task_id, url, self.site_name)
        # Close the database session
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        """Incremental crawl hook -- not implemented for this site."""
        pass

    def parse_html(self, html):
        """Parse a list page; return (object_list, total_page, VIEWSTATE).

        VIEWSTATE is the hidden ASP.NET state needed to post back for the
        next page.  Only announcements whose session date lies after
        today's date are kept.
        """
        doc = pq(html)
        # Default when the ">>" (last page) link is missing
        total_page = 10
        for page in doc('a.pagination').items():
            if page.text() == ">>":
                # The last-page link's href carries the final page number
                total_page = int("".join(re.findall(r"\d{2,3}",
                                                    page.attr.href)))
        VIEWSTATE = doc("div.aspNetHidden input").attr.value
        lis = doc('ul.module-case-items li').items()
        object_list = list()
        for x in lis:
            self.http.http_session("https://www.fjcourt.gov.cn" +
                                   x('a').attr.href,
                                   "get",
                                   headers=self.headers,
                                   verify=False)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Build a unique file path and dump the raw detail page
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            content = doc('div.article-wrap')
            item = dict()
            item["taskid"] = self.task_id
            item["title"] = content('p.article-hd-title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = content('span.article-author').text()
            item["court_t"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["start_court_t"] = x('span.cir-time').text().replace(
                "[", "").replace("]", "")
            item["court_part"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                               "在", "").replace("公开", "")
            item["site_name"] = self.site_name
            pub_time = item["start_court_t"].replace("-", "")
            date = get_today_date()
            # SECURITY/BUG FIX: the original compared eval(pub_time) with
            # eval(date); eval() on scraped text can execute arbitrary code
            # and crashed on an empty string.  Compare plain integers and
            # skip records without a parsable session date.
            if pub_time.isdigit() and int(pub_time) > int(date):
                # Map the item dict onto the ORM object
                b = BulletinCourt(**item)
                object_list.append(b)
        # Return the mapped objects, total page count and fresh VIEWSTATE
        return object_list, total_page, VIEWSTATE