Example #1
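All of the spiders below share the same scaffolding. A minimal sketch of the imports and helpers they rely on (the module paths for spiderUtil and GovAllItem are assumptions; their interfaces are inferred purely from how the snippets call them):

import re
import sys
import time
import random

import scrapy
from lxml import etree  # only needed by the spiders that parse embedded XML islands

# Hypothetical project-local imports, reconstructed from usage:
#   spiderUtil.header_util()      -> default request headers
#   spiderUtil.log_level(n, url)  -> field-failure logging (6=title, 7=content, 8=time, 9=author)
#   spiderUtil.get_time(), get_first_hour(), get_first_twohour(), get_first_threehour()
#   GovAllItem                    -> scrapy.Item with the fields assigned below
from ..util import spiderUtil
from ..items import GovAllItem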
class moNews(scrapy.Spider):
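    """Crawls the Henan provincial government news feed (henan.gov.cn/ywdt/hnyw)."""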
    name = "henanSpider"
    start_url = "http://www.henan.gov.cn/ywdt/hnyw/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.henan.gov.cn/ywdt/hnyw/",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "//div[@class='main']//li/a/@href").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 3))
            print(detail_url)
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//*[@id='title']//text()""").extract()
                title = "".join(titles).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                authors = response.xpath(
                    """//*[@id='source']//text()""").extract()
                author = "".join(authors).strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            source = "http://www.henan.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//*[@class='content']//text()""").extract()
                content = ""
                for i in range(0, len(content_detail)):
                    content = content + content_detail[i]
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            # Scrapy Response objects expose .status, not .status_code
            spiderUtil.log_level(response.status, response.url)
Example #2
class xizangNews(scrapy.Spider):
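    """Crawls the news sections of the Tibet (Xizang) government portal (xizang.gov.cn)."""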
    name = "xizangSpider"
    start_url = [
        "http://www.xizang.gov.cn/xwzx/zwyw/",
        "http://www.xizang.gov.cn/xwzx/jjjs/",
        "http://www.xizang.gov.cn/xwzx/qnyw/",
        "http://www.xizang.gov.cn/xwzx/dsyw/",
        "http://www.xizang.gov.cn/xwzx/xqxw/",
        "http://www.xizang.gov.cn/xwzx/xwrp/",
        "http://www.xizang.gov.cn/xwzx/dwjl/",
        "http://www.xizang.gov.cn/xwzx/shfz/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(4,5))
            yield scrapy.Request(url=url, callback=self.parse_item_page_home)

    def parse_item_page_home(self, response):
        # text = response.text
        # max_page = text.split("createPageHTML(")[2].split(",")[0]
        # for page in range(1, int(max_page) + 1):
        # for page in range(1, 2):
        url = response.url + "index.html"
        # time.sleep(random.uniform(4,5))
        yield scrapy.Request(url=url, callback=self.parse_item_page_list)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='zx-wdsyw-con']/ul/li/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("./"):
                news_url = response.url.split("/index")[0] + news_url[1:]
            # time.sleep(random.uniform(4,5))
            yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("//div[@class='xz-xl-tit']/h3/text()"
                                       ).extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//div[@class='xz-xl-info']/p/span/text()").extract()
                if len(author) == 2:
                    author = author[1]
                else:
                    author = "西藏自治区人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@class='xz-xl-article']"
                                             )[0].xpath("string(.)").extract()
                content = "".join(content_arr).split(
                    "//显示下载附件")[0][:-106].strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.xizang.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #3
class moNews(scrapy.Spider):
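    """Crawls Shanxi provincial news (shanxi.gov.cn/yw/sxyw)."""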
    name = "shanxiSpider"
    start_url = "http://www.shanxi.gov.cn/yw/sxyw/index.shtml"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.shanxi.gov.cn/yw/sxyw/index.shtml",
            callback=self.parse_item_page_list,
            headers=self.header)
        # # full historical data
        # for i in range(1,1292):
        #     url = "http://www.shanxi.gov.cn/yw/sxyw/index_"+str(i)+".shtml"
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "/html/body/div/div/div/div/div/ul/li/a/@href").extract()
        for detail_url in detail_urls:
            detail_url = "http://www.shanxi.gov.cn/yw/sxyw/" + detail_url.replace(
                "./", "")
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    "//div[starts-with(@class,'detail-article-title')]//text()"
                ).extract()
                title = "".join(content_title).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                content_author = response.xpath(
                    """/html/body/div/div/div/div/ul/li/span[2]/text()"""
                ).extract()
                author = "".join(content_author).strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """/html/body/div/div/div/div/ul/li/span[1]/text()"""
                ).extract()
                public_time = str(content_time[0].strip()) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.shanxi.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #4
class heilongjiangNews(scrapy.Spider):
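    """Crawls Jiangsu provincial news (js.gov.cn); the list page embeds its links in a text/xml script block."""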
    name = "jiangsuSpider"
    start_url = "http://www.js.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.js.gov.cn/col/col60096/index.html",
            callback=self.parse_item_page_list,
            headers=self.header)
        # full crawl of historical data
        # for i in range(2, 69):
        #     url = "http://www.js.gov.cn/col/col60096/index.html?uid=212860&pageNum=" + str(i)
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        url_list = etree.HTML(s).xpath("""//a/@href""")
        for url in url_list:
            url = "http://www.js.gov.cn" + url
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//div[@class='sp_title']/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                authors = response.xpath(
                    """//div[@class='sp_time']/font[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
                if author == "":
                    author = "江苏人民政府网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_times = response.xpath(
                    """//div[@class='sp_time']/font[1]/text()""").extract()[0]
                public_time = public_times.replace("发布日期:", "") + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']//text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.js.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #5
class moNews(scrapy.Spider):
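    """Crawls Hubei provincial news (hubei.gov.cn/zwgk/hbyw)."""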
    name = "hubeiSpider"
    start_url = "http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "//div[@class='container']//li/a/@href").extract()
        for detail_url in detail_urls:
            if "./" in detail_url:
                time.sleep(random.uniform(1, 3))
                deurl = "http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/" + str(
                    detail_url).replace("./", "")
                yield scrapy.Request(url=deurl,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//*[@class='text-center']/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = str(
                    response.xpath(
                        """//*[@class='list-unstyled list-inline']/li[2]/span/text()"""
                    ).extract()[0]).replace("来源:", "")
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = str(
                    response.xpath(
                        """//*[@class='list-unstyled list-inline']/li[1]/span/text()"""
                    ).extract()[0]).replace("发布时间:", "").strip() + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.hubei.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//*[@class='TRS_Editor']//text()""").extract()
                content = ""
                for i in range(0, len(content_detail)):
                    content = content + content_detail[i]
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #6
class qhNews(scrapy.Spider):
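    """Crawls the Qinghai provincial news sections (qh.gov.cn/zwgk/xwdt)."""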
    name = "qhSpider"
    start_url = [
        "http://www.qh.gov.cn/zwgk/xwdt/qhyw/",
        "http://www.qh.gov.cn/zwgk/xwdt/bmdt/",
        "http://www.qh.gov.cn/zwgk/xwdt/dqdt/",
        "http://www.qh.gov.cn/zwgk/xwdt/jqgz/",
        "http://www.qh.gov.cn/zwgk/xwdt/tzgg/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='box11 tabs topline']/div/ul/p[@class='item']/a/@href"
        ).extract()
        for news_url in news_list:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)
        # if "下一页" in response.xpath("//div[@class='pages']/a/text()").extract():
        #     next_list = response.xpath("//div[@class='pages']/a/@href").extract()
        #     next_list=next_list[len(next_list)-1]
        #     yield scrapy.Request(url=next_list, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title_arr = response.xpath(
                    "//h1[@class='blue tc']/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//div[@class='abstract tc']/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "青海省人民政府"
                else:
                    author = author.split("来源:")[1].split("发布时间")[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='details_content']/p//text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.qh.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #7
class moNews(scrapy.Spider):
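    """Crawls Liaoning provincial news (ln.gov.cn/zfxx/jrln)."""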
    name = "liaoningSpider"
    start_url = "http://www.ln.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.ln.gov.cn/zfxx/jrln/wzxx2018/index.html",
            callback=self.parse_item_page_list,
            headers=self.header)
        # for i in range(1,24):
        #     url = "http://www.ln.gov.cn/zfxx/jrln/wzxx2018/index_"+str(i)
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "//ul[@class='list_rul']/li/a/@href").extract()
        for detail_url in detail_urls:
            detail_url = "http://www.ln.gov.cn/zfxx/jrln/wzxx2018/" + detail_url.replace(
                "./", "")
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            # print(text)
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//td[@align="center"]/text()""").extract()
                title = "".join(titles).replace("来源:", "").strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                content_author = response.xpath(
                    """//table[@class="time"]//td[@align="left"]/text()"""
                ).extract()
                authors = content_author[0].split("  信息来源:")
                author = str(authors[1])
            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//table[@class="time"]//td[@align="left"]/text()"""
                ).extract()
                public_times = str(content_time[0]).split("  信息来源:")
                public_time = (public_times[0].replace("发布时间:", "").replace(
                    "年", "-").replace("月", "-").replace("日", "") +
                    " 00:00:00").strip()

            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.ln.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']/div/p/text()""").extract()
                content = ""
                for i in range(0, len(content_detail)):
                    content = content + content_detail[i]
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #8
class moNews(scrapy.Spider):
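    """Crawls Shandong provincial news (shandong.gov.cn); the list page embeds its links in a text/xml script block."""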
    name = "shandongSpider"
    start_url = "http://www.shandong.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.shandong.gov.cn/col/col3199/index.html", callback=self.parse_item_page_list, headers=self.header)
        # full crawl of historical data
        # for i in range(1,460):
        #     url = "http://www.shandong.gov.cn/col/col3199/index.html?uid=5836&pageNum="+str(i)
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        detail_urls = etree.HTML(s).xpath("//a/@href")
        for key in detail_urls:
            yield scrapy.Request(url=key, callback=self.parse, headers=self.header, dont_filter=True)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("""//div[@class='xq-tit']/span/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = str(response.xpath("""//div[@class='R-tit']/span[2]/text()""").extract()[0]).replace("来源:","")

            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.shangdong.gov.cn/"

            try:
                content_detail = response.xpath("""//div[@class='article']//text()""").extract()
                content = "".join(content_detail)
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item

            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #9
class shanghaiNews(scrapy.Spider):
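    """Crawls the Shanghai municipal news sections (shanghai.gov.cn)."""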
    name = "shanghaiSpider"
    start_url = ["http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw4411/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw18454/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw15343/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw31406/index.html"]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='uli14 pageList']/li/a/@href").extract()
        for news_url in news_list:
            news_url="http://www.shanghai.gov.cn"+news_url
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)
        # next_page = response.xpath("//li/a[@class='action']/@href").extract()
        # if next_page!=[]:
        #     next_url="http://www.shanghai.gov.cn"+next_page[0]
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=next_url, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("//div[@id='ivs_title']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath("//small[@class='PBtime']/text()").extract()[0].split("来源:")[1]
                if author == "":
                    author = "上海市人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2}日\s\s\s\d{1,2}\s:\s\d{1,2})", response.text).group(0).replace("年","-").replace("月","-").replace("日","").replace("   "," ").replace(" : ",":")+":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@id='ivs_content']")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.shanghai.gov.cn/"

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #10
class bjNews(scrapy.Spider):
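    """Crawls the Beijing municipal news sections (beijing.gov.cn/ywdt)."""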
    name = "bjSpider"
    start_url = [
        "http://www.beijing.gov.cn/ywdt/zybwdt/",
        "http://www.beijing.gov.cn/ywdt/yaowen/",
        "http://www.beijing.gov.cn/ywdt/gqrd/",
        "http://www.beijing.gov.cn/ywdt/gzdt/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)
        # max_page = response.text.split("pageCount = ")[1][:4].split(";")[0]
        # for page in range(1, int(max_page)):
        #     news_list_url = response.url+"default_" + str(page)+".htm"
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//li[@class='col-md']/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("http"):
                # time.sleep(random.uniform(3, 5))
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("../../"):
                news_url = "http://www.beijing.gov.cn/" + news_url.replace(
                    "../../", "")
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("../"):
                news_url = "/".join(
                    response.url.split("/")[:-2]) + news_url.replace(
                        "../", "/")
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("./"):
                news_url = "/".join(
                    response.url.split("/")[:-1]) + news_url.replace(
                        "./", "/")
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                # title = response.xpath("//head/title/text()").extract()[0].split("-")[0]
                title_arr = response.xpath(
                    "//div[@class='header']/p/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//p[@class='fl']/span/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "北京市人民政府"
                else:
                    author = author.split("来源:")[1]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.beijing.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #11
class moNews(scrapy.Spider):
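    """Crawls the Macao SAR government news archive (gov.mo), paging through the current year and month."""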
    name = "moSpider"
    start_url = "https://www.gov.mo/zh-hant/%s/%s/page/%s/?post_type=news_post"
    header = spiderUtil.header_util()

    def start_requests(self):
        # date_set = set()
        # begin_date = datetime.datetime.strptime("2017-09-01", "%Y-%m-%d")
        # end_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        # while begin_date <= end_date:
        #     date_str = begin_date.strftime("%Y-%m")
        #     date_set.add(date_str)
        #     begin_date += datetime.timedelta(days=1)
        # date_list = list(date_set)
        # date_list.sort()
        #
        # for date in date_list:
        #     today = date.split("-")
        today = spiderUtil.get_time().split("-")
        year = today[0]
        month = today[1]
        for page in range(1, 3):
            url = self.start_url % (year, month, page)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)

    def parse_item_page_list(self, response):
        # news_url_list = response.xpath("//h2/a/@href").extract()
        news_url_list = response.xpath(
            "//div[@class='card-head news--item-head style-primary']/a/@href"
        ).extract()
        for news_url in news_url_list:
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//head/title/text()").extract()[0].split("–")[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//dl/dd")[0].xpath(
                    'string(.)').extract()[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            source = "https://www.gov.mo/"

            try:
                content_arr = response.xpath("//article/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #12
class heilongjiangNews(scrapy.Spider):
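    """Crawls Hebei provincial news (hebei.gov.cn)."""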
    name = "hebeiSpider"
    start_url = ["http://www.hebei.gov.cn/"]
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.hebei.gov.cn/hebei/11937442/10761139/index.html", callback=self.parse_item_page_list,headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath("//div[2]/div[2]/div/ul/li/a/@href").extract()
        for detail_url in detail_urls:
            if not detail_url.startswith("http"):
                url = "http://www.hebei.gov.cn" + detail_url
                yield scrapy.Request(url=url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("""//h2[@class="cont_title"]/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author="河北省人民政府网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_timess = response.xpath("""//li[@class="xl_shijian"]//text()""").extract()[0]
                public_times = public_timess.replace("年", "-").replace("月", "-").replace("日", "").strip()
                public_time = public_times + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@id='zoom']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.hebei.gov.cn/"

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #13
class moNews(scrapy.Spider):
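    """Crawls Anhui provincial news (ah.gov.cn)."""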
    name = "anhuiSpider"
    start_url = "http://www.ah.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.ah.gov.cn/UserData/SortHtml/1/549213957.html",
            callback=self.parse_item_page_list,
            headers=self.header)
        # for i in range(2,524):
        #     url = "http://www.ah.gov.cn/tmp/Nav_nav.shtml?SS_ID=7&tm=29357.31&Page="+str(i)
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        url_list = response.xpath("//div[@class='navjz']//a/@href").extract()
        for url in url_list:
            if "http://www.ah.gov.cn" in url:
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=self.header,
                                     dont_filter=True)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                titles = response.xpath(
                    """//div[@class='wztit']//text()""").extract()
                title = "".join(titles).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[3]""").extract()
                author = "".join(author_arr).replace("来源:", "").strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_date = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[1]""").extract()[0]
                content_time = response.xpath(
                    """//div[@class='wzbjxx']/p/text()[2]""").extract()[0]
                public_time = str(content_date) + " " + str(
                    content_time) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.ah.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='wzcon']//text()""").extract()
                content = "".join(content_detail).strip()
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #14
class nxNews(scrapy.Spider):
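    """Crawls the Tianjin municipal news sections (tj.gov.cn/xw)."""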
    name = "tjSpider"
    start_url = [
        "http://www.tj.gov.cn/xw/xwfbh/", "http://www.tj.gov.cn/xw/tztg/",
        "http://www.tj.gov.cn/xw/bum/", "http://www.tj.gov.cn/xw/qx1/",
        "http://www.tj.gov.cn/xw/bdyw/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(2, 3))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)

    # def parse_item_page_home(self, response):
    #     yield scrapy.Request(url=response.url, callback=self.parse_item_page_list, headers=self.header, dont_filter=True)
    # max_page = response.text.split("countPage = ")[1][:3].split("/")[0]
    # for page in range(2, int(max_page)):
    #     news_list_url = response.url+"index_" + str(page)+".html"
    #     time.sleep(1)
    #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//div/div/ul/li/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("./"):
                news_url = response.url.split("/index")[0] + news_url[2:]
                time.sleep(1)
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("http"):
                # time.sleep(random.uniform(1, 3))
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath("//div[@class='title']/text()").extract(
                )[0].strip() + response.xpath(
                    "//div[@class='t_title']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath(
                    "//span[@class='ly']/text()").extract()[0].split("来源:")[1]
                if author == "":
                    author = "天津市人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\s\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("  ", " ") + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.tj.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #15
class moNews(scrapy.Spider):
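    """Crawls Fujian provincial news (fujian.gov.cn/xw/fjyw)."""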
    name = "fujianSpider"
    start_url = "http://www.fujian.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.fujian.gov.cn/xw/fjyw/",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        url_list = response.xpath("//ul[@class='list-gl']//a/@href").extract()
        for urls in url_list:
            url = "http://www.fujian.gov.cn/xw/fjyw/" + str(urls).replace(
                "./", "")
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    """//div[@class='xl-nr clearflx']//h3/text()""").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    """//div[@class='xl-nr clearflx']//h5/span/text()"""
                ).extract()
                author = "".join(author_arr).replace("[",
                                                     "").replace("]",
                                                                 "").strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//div[@class='xl-nr clearflx']//h5/text()""").extract()
                public_time = "".join(content_time).replace("字号:", "").replace(
                    "|", "").strip()
                public_time = str(public_time) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.fujian.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #16
class gdNews(scrapy.Spider):
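    """Crawls the Guangdong provincial news sections (gd.gov.cn/gdywdt)."""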
    name = "gdSpider"
    start_url = ["http://www.gd.gov.cn/gdywdt/gdyw/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/jcbs/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/gcls/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/dcxd/index.html",
                 "http://www.gd.gov.cn/gdywdt/bmdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/dsdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/zfjg/index.html",
                 "http://www.gd.gov.cn/gdywdt/tzdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/ydylygd/index.html"]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url, callback=self.parse_item_page_home, headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_item_page_list, headers=self.header, dont_filter=True)
        # max_page = response.xpath("//a[@class='last']/@href").extract()[0].split("index_")[1].split(".")[0]
        # for page in range(2, int(max_page)+1):
        #     news_list_url = response.url.replace("index","index_" + str(page))
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//span[@class='til']/a/@href").extract()
        for news_url in news_list:
            # time.sleep(random.uniform(3, 5))
            if news_url.startswith("http://www.gd.gov.cn"):
                yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath("//h3[@class='zw-title']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//span[@class='ly']/text()").extract()[0].split("来源  :")[1].strip()
                if author == "":
                    author = "广东省人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@class='zw']/p")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.gd.gov.cn/"

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #17
class moNews(scrapy.Spider):
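    """Crawls Jilin provincial news (jl.gov.cn/zw/yw/jlyw)."""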
    name = "jilinSpider"
    start_url = "http://www.jl.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.jl.gov.cn/zw/yw/jlyw/index.html",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            """//li[@class="item"]//a/@href""").extract()
        for detail_url in detail_urls:
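            # listing pages link relatively ("./...html"); rebuild the
            # absolute URL against the section root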
            if "./" in detail_url:
                detail_url = "http://www.jl.gov.cn/zw/yw/jlyw/" + detail_url.replace(
                    "./", "")
                time.sleep(random.uniform(1, 3))
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//div[@id="dbt"]//text()""").extract()
                title = "".join(content_title).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = "吉林省人民政府"

            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//div[@class="c_xx"]//text()""").extract()
                public_time = str(content_time[0]).split("   ")[1]
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jl.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@class='TRS_Editor']//text()""").extract()
                content = "".join(content_detail).strip()
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #18
0
class cqNews(scrapy.Spider):
    name = "cqSpider"
    start_url = [
        "http://www.cq.gov.cn/zqfz/whly", "http://www.cq.gov.cn/zqfz/sthj",
        "http://www.cq.gov.cn/zqfz/shfz", "http://www.cq.gov.cn/zqfz/gmjj",
        "http://www.cq.gov.cn/zwxx/jrcq", "http://www.cq.gov.cn/zwxx/zwdt"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # listing pages paginate with a "_N" suffix; widen the range
            # (up to page 473 in the original comment) for a full history crawl
            for page in range(1, 2):
                # time.sleep(random.uniform(2, 3))
                yield scrapy.Request(url=url + "_" + str(page),
                                     callback=self.parse_item_page_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_page_home(self, response):
        # unused in the current flow: start_requests paginates straight to
        # parse_item_page_list; the commented-out code below walks every page
        pass
        # max_page = response.xpath("//span[@class='total']/text()").extract()[0].split("/")[1][1:-1]
        # for page in range(1, int(max_page) + 1):
        #     url = response.url + "_" + str(page)
        #     time.sleep(random.uniform(2, 3))
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header,dont_filter=True)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='list']/li/a/@href").extract()
        for news_url in news_list:
            news_url = "http://www.cq.gov.cn" + news_url
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    "//h2[@class='title']/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//span[@class='fl']/span/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "重庆市人民政府"
                else:
                    author = author.split("来源:")[1]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='conTxt']")[0].xpath('string(.)').extract()
                content = "".join(content_arr).split("终审 :")[0].strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.cq.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #19
0
class hnNews(scrapy.Spider):
    name = "hnSpider"
    start_url = [
        "http://www.hainan.gov.cn/hainan/tingju/list3.shtml",
        "http://www.hainan.gov.cn/hainan/sxian/list3.shtml",
        "http://www.hainan.gov.cn/hainan/5309/list3.shtml",
        "http://www.hainan.gov.cn/hainan/mtkhn/list3.shtml",
        "http://www.hainan.gov.cn/hainan/ldhd/sj_list3.shtml"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='list-right_title fon_1']/a/@href").extract()
        for news_url in news_list:
            if not news_url.startswith("http"):
                news_url = "http://www.hainan.gov.cn" + news_url
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
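                # the Hainan CMS wraps the headline in a custom <ucaptitle>
                # tag (and the body in <ucapcontent> further down)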
                title = response.xpath(
                    "//div[@class='title_cen mar-t2 text']/ucaptitle/text()"
                ).extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//span[@id='ly']/text()").extract()
                if author == []:
                    author = "海南省人民政府"
                else:
                    author = author[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']/div[@id='font']/ucapcontent/p/text()"
                ).extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.hainan.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #20
0
class moNews(scrapy.Spider):
    name = "zhejiangSpider"
    start_url = "http://www.zj.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.zj.gov.cn/col/col1554467/index.html",
            callback=self.parse_item_page_list,
            headers=self.header)

    def parse_item_page_list(self, response):
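        # the column page embeds its article links in an XML island inside a
        # <script type='text/xml'> tag, so re-parse that blob with etree.HTML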
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        url_list = etree.HTML(s).xpath("""//a/@href""")
        for url in url_list:
            url = "http://www.zj.gov.cn" + url
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//td[@align="center"]/text()""").extract()
                title = "".join(content_title)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                authors = response.xpath(
                    """//ul[@class="list"]/li[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//ul[@class="list"]/li[1]/text()""").extract()
                public_time = str(content_time[0]).replace("发布日期:", "").strip()
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jl.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@id="zoom"]//text()""").extract()
                content = ""
                for i in range(0, len(content_detail)):
                    content = content + content_detail[i]
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #21
0
class sxNews(scrapy.Spider):
    name = "sxSpider"
    start_url = ["http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10001",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10003",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10002",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=17469"]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='xwlist-ul']/li/a/@href").extract()
        for news_url in news_list:
            news_url="http://www.shaanxi.gov.cn"+news_url
            yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)
        # next_list = response.xpath("//form/span/a/@href").extract()
        # next_list=response.url.split("?")[0]+next_list[-2]
        # yield scrapy.Request(url=next_list, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath("//h1[@class='news_h1']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//span[@id='info_source']/text()").extract()[0].strip()
                if author == "":
                    author = "陕西省人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@id='info-cont']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.shanxi.gov.cn/"

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #22
0
class nxNews(scrapy.Spider):
    name = "nxSpider"
    start_url = [
        "http://www.nx.gov.cn/zwxx_11337/wztt/",
        "http://www.nx.gov.cn/zwxx_11337/zwdt/",
        "http://www.nx.gov.cn/zwxx_11337/sxdt/",
        "http://www.nx.gov.cn/zwxx_11337/hygq/",
        "http://www.nx.gov.cn/ztsj/zt/tpgj_1542/",
        "http://www.nx.gov.cn/ztsj/zt/hjbhdc/",
        "http://www.nx.gov.cn/zwxx_11337/zcjd/",
        "http://www.nx.gov.cn/zwgk/tzgg/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        # text = response.text
        # max_page = text.split("createPageHTML(")[2].split(",")[0]
        # for page in range(1, int(max_page)+1):
        # for page in range(1, 2):
        news_list_url = response.url + "index.html"
        # time.sleep(random.uniform(3, 5))
        yield scrapy.Request(url=news_list_url,
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//ul[@class='commonList_dot']/li/a/@href").extract()
        for news_url in news_list:
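            # links come back relative: "./" resolves against the current
            # section, "../" against the zwxx_11337 root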
            if news_url.startswith("./"):
                news_url = response.url.split("index")[0] + news_url[2:]
            elif news_url.startswith("../"):
                news_url = "http://www.nx.gov.cn/zwxx_11337" + news_url[2:]
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title_arr = response.xpath(
                    "//div[@id='info_title']/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//span[@id='info_source']/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "宁夏回族自治区人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                # spiderUtil.log_level(8, response.url)
                pass

            try:
                content_arr = response.xpath(
                    "//div[@class='view TRS_UEDITOR trs_paper_default trs_word trs_key4format']/p//text()"
                ).extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.nx.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #23
0
class nmgNews(scrapy.Spider):
    name = "nmgSpider"
    start_url = [
        "http://www.nmg.gov.cn/col/col442/index.html",
        "http://www.nmg.gov.cn/col/col443/index.html",
        "http://www.nmg.gov.cn/col/col1141/index.html",
        "http://www.nmg.gov.cn/col/col1972/index.html",
        "http://www.nmg.gov.cn/col/col1973/index.html",
        "http://www.nmg.gov.cn/col/col365/index.html",
        "http://www.nmg.gov.cn/col/col151/index.html",
        "http://www.nmg.gov.cn/col/col152/index.html",
        "http://www.nmg.gov.cn/col/col360/index.html",
        "http://www.nmg.gov.cn/col/col1253/index.html",
        "http://www.nmg.gov.cn/col/col359/index.html",
        "http://www.nmg.gov.cn/col/col389/index.html"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # for page in range(1, 329):
            for page in range(1, 2):
                news_list = url + "?uid=777&pageNum=" + str(page)
                # time.sleep(random.uniform(1, 3))
                yield scrapy.Request(url=news_list,
                                     callback=self.parse_item_page_list,
                                     headers=self.header)

    def parse_item_page_list(self, response):
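        # same XML-island trick as the Zhejiang column pages: the article
        # links live in a <script type='text/xml'> blob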
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        url_list = etree.HTML(s).xpath("//a/@href")
        for url in url_list:
            url = "http://www.nmg.gov.cn/" + url
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@class='main-fl-tit']/text()").extract()
                title = "".join("".join(title).split())
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//div[@class='main-fl-bjxx']/div/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "内蒙古自治区人民政府网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@id='zoom']")[0].xpath(
                    'string(.)').extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.nmg.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #24
0
class moNews(scrapy.Spider):
    name = "jiangxiSpider"
    start_url = "http://www.jiangxi.gov.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://www.jiangxi.gov.cn/col/col393/index.html",
            callback=self.parse_item_page_list,
            headers=self.header)
        # full crawl of the historical data
        # for i in range(2,917):
        #     url = "http://www.jiangxi.gov.cn/col/col393/index.html?uid=45663&pageNum="+str(i)
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
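        # article links are again embedded in a <script type='text/xml'> blob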
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        detail_urls = etree.HTML(s).xpath("//a/@href")
        for detail_url in detail_urls:
            detail_url = "http://www.jiangxi.gov.cn" + detail_url
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_title = response.xpath(
                    """//div[@class='artile_zw']/div/p/text()""").extract()
                title = "".join(content_title).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath(
                    """//div[@id='zoom']//font/text()[2]""").extract()[0]

            except:
                spiderUtil.log_level(9, response.url)

            try:
                content_time = response.xpath(
                    """//div[@class='sp_time screen']/font[1]/text()"""
                ).extract()[0]
                public_time = str(content_time).replace("发布时间:", "")
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.jiangxi.gov.cn/"

            try:
                content_detail = response.xpath(
                    """//div[@id='zoom']/p/text()""").extract()
                content = "".join(content_detail)
            except:
                spiderUtil.log_level(7, response.url)

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #25
0
class gxNews(scrapy.Spider):
    name = "gxSpider"
    start_url = [
        "http://www.gxzf.gov.cn/zwhd/index.shtml",
        "http://www.gxzf.gov.cn/sytt/index.shtml",
        "http://www.gxzf.gov.cn/zcjd/index.shtml",
        "http://www.gxzf.gov.cn/gggs/index.shtml",
        "http://www.gxzf.gov.cn/zwdc/index.shtml",
        "http://www.gxzf.gov.cn/dflz/yw/index.shtml"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)
        # max_page = response.xpath("//div[@class='more-page']/a/@href").extract()[-1].split("-")[1][:-6]
        # for page in range(2, int(max_page)+1):
        #     news_list_url = response.url.replace("index","index-" + str(page))
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//ul[@class='more-list']/li/a/@href").extract()
        for news_url in news_list:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title_arr = response.xpath(
                    "//div[@class='article']/h1/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//div[@class='article-inf-left']/text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "广西壮族自治区人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='article-con']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.gxzf.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # 数据打入piplines处理
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #26
0
class ynNews(scrapy.Spider):
    name = "ynSpider"
    start_url = [
        "http://www.yn.gov.cn/ywdt/bmdt/", "http://www.yn.gov.cn/ywdt/ynyw/",
        "http://www.yn.gov.cn/ywdt/zsdt/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)
            # for i in range(1,100):
            # for i in range(1,2):
            #     url =url.split("index")[0]+("index_"+str(i))+".html"
            #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//dl[@class='thlist']/dt/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("./"):
                news_url = response.url.split("index")[0] + news_url[2:]
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath(
                    "//div[@class='articl']/h3//text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
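                # the byline runs "来源:XXX 2019-..-..", so cutting at the
                # first "2" (the start of the year) isolates the source name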
                author = response.xpath(
                    "//div[@class='datetime']/text()").extract()[0].split(
                        "来源:")[1].split("2")[0].strip()
                if author == "":
                    author = "云南省人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.yn.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #27
0
class scNews(scrapy.Spider):
    name = "scSpider"
    start_url = ["http://www.sc.gov.cn/10462/10705/10709/xwfbt_list.shtml",
                 "http://www.sc.gov.cn/10462/10705/10708/xwfbt_list.shtml",
                 "http://www.sc.gov.cn/10462/10705/10707/xwfbt_list.shtml",
                 "http://www.sc.gov.cn/10462/10705/10706/xwfbt_one.shtml"]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)
            # if "list" in url:
            #     for page in range(1, 6):
            #         list_url = url.split(".shtml")[0]+"_"+str(page)+".shtml"
            #         yield scrapy.Request(url=list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//td/a/@href").extract()
        for url in news_list:
            if not url.startswith("http"):
                news_url = "http://www.sc.gov.cn"+url
                yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title_arr = response.xpath("//h2/ucaptitle/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath("//ul[@id='articleattribute']/li/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "四川省人民政府"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                # dates on sc.gov.cn look like "2019年11月12日 10时30分";
                # normalize them to "2019-11-12 10:30:00"
                raw_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}时\d{1,2}分)",
                    response.text).group(0)
                public_time = (raw_time.replace("年", "-").replace("月", "-")
                               .replace("日", "").replace("时", ":")
                               .replace("分", "") + ":00")
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath("//div[@id='cmsArticleContent']")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.sc.gov.cn/"

            try:
                if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(
                        spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #28
0
class gzNews(scrapy.Spider):
    name = "gzSpider"
    start_url = [
        "http://www.guizhou.gov.cn/xwdt/rmyd/",
        "http://www.guizhou.gov.cn/xwdt/jrgz/",
        "http://www.guizhou.gov.cn/xwdt/gzyw/",
        "http://www.guizhou.gov.cn/xwdt/qgyw/",
        "http://www.guizhou.gov.cn/xwdt/mtkgz/",
        "http://www.guizhou.gov.cn/xwdt/djfb/",
        "http://www.guizhou.gov.cn/xwdt/tzgg/",
        "http://www.guizhou.gov.cn/xwdt/szf/ldjh/",
        "http://www.guizhou.gov.cn/xwdt/szf/ldhd/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/bm/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/gy/index.html",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/zy/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/lps/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/as/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/bj/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/tr/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qdn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qxn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/gaxq/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)
        # max_page = response.xpath("//div[@class='page']/script/text()").extract()[0].split("HTML(")[1].split(",")[0]
        # for page in range(1, int(max_page)):
        #     news_list_url = response.url + "index_" + str(page) + ".html"
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='right-list-box']/ul/li/a/@href").extract()
        for news_url in news_list:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                title = response.xpath("//h1/text()").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath(
                    "//head/meta[@name='ContentSource']/@content").extract()
                if author == []:
                    author = "贵州省人民政府"
                else:
                    author = author[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='view TRS_UEDITOR trs_paper_default trs_web']/p/text()"
                ).extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.guizhou.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Beispiel #29
0
class heilongjiangNews(scrapy.Spider):
    name = "heilongjiangSpider"
    start_url = ["http://www.hlj.gov.cn/"]
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.hlj.gov.cn/zwfb/zxfb/index.shtml",
                             callback=self.parse_item_page_list,
                             headers=self.header)
        yield scrapy.Request(url="http://www.hlj.gov.cn/szf/lddt/cxhd/",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "//div[@class='li-left hei']//span/a/@href").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//div[@class="tm2"]/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                authors = response.xpath(
                    """//div[@class="tm3"]/span[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
                if author == "":
                    author = "黑龙江人民政府网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                public_times = response.xpath(
                    """//div[@class="tm3"]/span[1]/text()""").extract()
                public_time = "".join(public_times).replace("时间:", "").strip()
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='nr5']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.hlj.gov.cn/"

            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
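All of the spiders above depend on two shared pieces this listing never shows: the GovAllItem item class and the spiderUtil helper module. Below is a minimal sketch of what they would need to provide for the code to run. The GovAllItem fields are taken directly from the spiders; every helper body (header_util, get_time, the get_first_*hour prefixes, log_level) is an assumption inferred from the call sites, not the project's actual implementation.

# Hypothetical reconstruction -- inferred from how the spiders use these
# names; the real project may differ.
import datetime

import scrapy


class GovAllItem(scrapy.Item):
    # the eight fields every spider above populates
    source = scrapy.Field()
    content = scrapy.Field()
    public_time = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    crawl_time = scrapy.Field()
    html_size = scrapy.Field()


def header_util():
    # a plain request header; the real module may rotate User-Agents
    return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def get_time():
    # current timestamp in the "YYYY-MM-DD HH:MM:SS" shape the items use
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _hour_prefix(hours_back):
    # "YYYY-MM-DD HH" for N hours ago; public_time.startswith(...) against
    # this prefix keeps only articles stamped within that hour
    t = datetime.datetime.now() - datetime.timedelta(hours=hours_back)
    return t.strftime("%Y-%m-%d %H")


def get_first_hour():
    return _hour_prefix(1)


def get_first_twohour():
    return _hour_prefix(2)


def get_first_threehour():
    return _hour_prefix(3)


def log_level(code, url):
    # codes seen above: 6 title, 7 content, 8 publish time, 9 author,
    # plus raw HTTP status codes on non-200 responses
    print("field error %s at %s" % (code, url))

Under these assumptions the freshness filter is plain string prefixing: an article stamped "2019-11-12 10:30:00" passes while any of the three hour prefixes matches, e.g. get_first_hour() returning "2019-11-12 10".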