Example 1
class eNews(scrapy.Spider):
    name = "ePaperSpider"
    start_url = "http://epaper.ssrb.com.cn/html/%s-%s/%s/node_1.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2000-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//a[@id='pageLink']/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("node")[0] + paper
            yield scrapy.Request(url=paper_url,
                                 callback=self.parse_item_list,
                                 headers=self.header,
                                 dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//div/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("node")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='ozoom']/founder-content/text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://epaper.ssrb.com.cn/"

            try:
                author = "石狮日报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//td[@class='font01']/founder-title/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
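All twelve spiders in this section rely on the same imports plus two project pieces the snippets never define: a spiderUtil helper module and a PaperAllItem item class. Below is a minimal sketch of what they assume. The item fields are read straight off the item["..."] assignments above; header_util, log_level, and get_time are reconstructed from how they are called and are assumptions, not the original implementation.

# Shared scaffolding assumed by all twelve examples in this section.
import datetime
import re
import sys
import time

import scrapy


class PaperAllItem(scrapy.Item):
    # Field names mirror the item["..."] assignments in every spider.
    source = scrapy.Field()
    content = scrapy.Field()
    public_time = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    crawl_time = scrapy.Field()
    html_size = scrapy.Field()


class spiderUtil:
    """Stand-in for the project's helper module, inferred from call sites only."""

    @staticmethod
    def header_util():
        # Plausible default request headers; the real values are unknown.
        return {"User-Agent": "Mozilla/5.0 (compatible; PaperSpider/1.0)"}

    @staticmethod
    def log_level(code, url):
        # Codes 6-9 flag a missing title/content/publish-time/author;
        # anything else is the HTTP status of a failed page.
        print("[%s] %s" % (code, url))

    @staticmethod
    def get_time():
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())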
Example 2
class ddNews(scrapy.Spider):
    name = "ddPaperSpider"
    start_url = "http://szb.ddswcm.com/Html/%s-%s-%s/Qpaper.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-10-10", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div[@class='verpaper']/ul/li/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("Qpaper"):
                paper_url = response.url.split("Qpaper")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='vertitle']/ul/li/a/@href").extract()
        for news in news_list:
            # if news.startswith("content"):
            news_url = response.url.split("Qpaper")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='content']/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://szb.ddswcm.com/"

            try:
                author = response.xpath(
                    "//div[@class='property']//text()").extract()[0].split(
                        "作者:")[1].split("浏览次数")[0].strip()
                if author == "":
                    author = "当代商报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//div[@class='papertitle']//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
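A note on the time.sleep(1) calls in start_requests: Scrapy consumes start_requests from its single reactor thread, so sleeping there stalls the entire crawl rather than spacing out downloads. The idiomatic throttle is Scrapy's delay settings, for example via custom_settings on the spider (the values here are illustrative):

class ddNews(scrapy.Spider):
    name = "ddPaperSpider"
    # Replaces the per-URL time.sleep(1): Scrapy paces downloads itself.
    custom_settings = {
        "DOWNLOAD_DELAY": 1,               # seconds between requests
        "RANDOMIZE_DOWNLOAD_DELAY": True,  # jitter between 0.5x and 1.5x delay
    }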
Example 3
class zwNews(scrapy.Spider):
    name = "zwPaperSpider"
    start_url = "http://124.224.204.62:8081/szb/pc/%s%s/%s/l1.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2009-02-03", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home, headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//li[@class='posRelative']/a/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("l1")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list, headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//li[@class='clearfix']/a/@href").extract()
        for news in news_list:
            yield scrapy.Request(url=news, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", response.text).group(0) + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//founder-content//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://124.224.204.62:8081/szb/pc/"

            try:
                author_arr = response.xpath("//h3[@class='title-author']/text()").extract()
                if author_arr == []:
                    author = "中卫日报"
                else:
                    author = author_arr[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//h2[@id='Title']/text()").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
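Each dated spider rebuilds the same begin-to-today loop in start_requests. A small generator removes that boilerplate; a sketch using the imports above (the helper name iter_dates is ours, not the project's):

def iter_dates(begin, fmt="%Y-%m-%d"):
    """Yield (year, month, day) strings for every date from begin through today."""
    day = datetime.datetime.strptime(begin, fmt).date()
    while day <= datetime.date.today():
        yield day.strftime("%Y"), day.strftime("%m"), day.strftime("%d")
        day += datetime.timedelta(days=1)

# Usage inside start_requests:
#     for year, month, day in iter_dates("2009-02-03"):
#         yield scrapy.Request(url=self.start_url % (year, month, day),
#                              callback=self.parse_item_home,
#                              headers=self.header)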
Example 4
class yaNews(scrapy.Spider):
    name = "yaPaperSpider"
    start_url = "http://paper.yanews.cn/yarb/%s%s%s/html/index.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-12-13", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home, headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//a[@class='bmml_con_div_name']/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("index")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list, headers=self.header,
                                 dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//a[@class='bmdh_con_a']/@href").extract()
        for news in news_list:
            if news.startswith("index"):
                news_url = response.url.split("index")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)
            if news.startswith("page"):
                news_url = response.url.split("page")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})", response.text).group(0).replace("年", "-").replace(
                    "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@id='zoom']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://paper.yanews.cn/"

            try:
                author = "延安日报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//div[@class='bmnr_con_biaoti']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
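All of the URL stitching in these spiders (response.url.split("index")[0] + paper) can be done with Scrapy's built-in response.urljoin, which resolves a relative href against the current page the way a browser would. A sketch of yaNews's parse_item_home rewritten that way; it would also collapse the index/page branch in parse_item_list, since both prefixes are just relative filenames:

    def parse_item_home(self, response):
        for paper in response.xpath("//a[@class='bmml_con_div_name']/@href").extract():
            # urljoin handles plain filenames as well as ./ and ../ segments.
            yield scrapy.Request(url=response.urljoin(paper),
                                 callback=self.parse_item_list,
                                 headers=self.header,
                                 dont_filter=True)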
Example 5
class qhNews(scrapy.Spider):
    name = "qhPaperSpider"
    start_url = "http://www.qhdb.com.cn/Newspaper/PageNavigate.aspx?nid=%s"
    header = spiderUtil.header_util()

    def start_requests(self):
        for nid in range(15, 2553):
            url = self.start_url % nid
            time.sleep(1)
            # Full crawl (all historical issues)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)
            # Incremental crawl (today only)
            # yield scrapy.Request(url=url, callback=self.parse_item_today, headers=self.header)

    # Incremental mode: follow the link to today's issue page
    def parse_item_today(self, response):
        today_list = response.xpath("//tr/td/a/@href").extract()[-1]
        today_url = response.url.split("PageNavigate")[0] + today_list
        yield scrapy.Request(url=today_url,
                             callback=self.parse_item_home,
                             headers=self.header,
                             dont_filter=True)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div/span[@class='float']/a/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("PageNavigate")[0] + paper
            yield scrapy.Request(url=paper_url,
                                 callback=self.parse_item_list,
                                 headers=self.header,
                                 dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='p_l_bottom']/div/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("PageNavigate")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(
                    r"(\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("/", "-")
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='article_content']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.qhdb.com.cn/"

            try:
                author_arr = response.xpath(
                    "//span[@id='sZuoZhe']/text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "期货日报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//div[@class='article_title']//text()"
                                       ).extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
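qhNews switches between the full crawl and the incremental one by commenting requests in and out. A Scrapy spider argument (scrapy crawl qhPaperSpider -a mode=today) gives the same toggle without editing the source; a sketch under that assumption (the mode name is ours):

    def start_requests(self):
        # Passed from the command line: scrapy crawl qhPaperSpider -a mode=today
        incremental = getattr(self, "mode", "all") == "today"
        for nid in range(15, 2553):
            url = self.start_url % nid
            callback = self.parse_item_today if incremental else self.parse_item_home
            yield scrapy.Request(url=url, callback=callback, headers=self.header)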
Example 6
class tsNews(scrapy.Spider):
    name = "tsPaperSpider"
    start_url = "http://dzb.tsrb.com.cn/tswb/content/%s%s%s/Page%sHO.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2010-06-21", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            for page in range(1, 17):
                page = "%02d" % page
                url = self.start_url % (year, month, day, page)
                time.sleep(1)
                yield scrapy.Request(url=url,
                                     callback=self.parse_item_list,
                                     headers=self.header)

    def parse_item_list(self, response):
        paper_list = response.xpath("//tr/td/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("Artice"):
                paper_url = response.url.split("Page")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse,
                                     headers=self.header,
                                     dont_filter=True)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//span[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.tsrb.com.cn/"

            try:
                author = "天水晚报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//div[@class='detailtitle']//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 7
class qlNews(scrapy.Spider):
    name = "qlPaperSpider"
    start_url = "http://epaper.qlwb.com.cn/qlwb/content/%s%s%s/PageArticleIndexLB.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2019-04-12", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_list,
                                 headers=self.header)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='linkto']/ul/li/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("PageArticleIndex")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://epaper.qlwb.com.cn/"

            try:
                author = "齐鲁晚报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//h2//text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 8
class jjNews(scrapy.Spider):
    name = "jjPaperSpider"
    start_url = "http://epaper.21jingji.com/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_list,
                             headers=self.header)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='main']/ul/li/a/@href").extract()
        for news in news_list:
            yield scrapy.Request(url=news,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='txtContent']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://epaper.21jingji.com/"

            try:
                author_arr = response.xpath(
                    "//div[@class='newsInfo']//text()").extract()
                author = "".join(author_arr).split(" ")[1].strip()
                if author == "":
                    author = "21世纪经济报道数字报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr1 = response.xpath(
                    "//div[@class='titleHead']/h1//text()").extract()
                title = "".join(title_arr1).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 9
class ckNews(scrapy.Spider):
    name = "ckPaperSpider"
    start_url = "http://dz.jjckb.cn/www/pages/webpage2009/html/%s-%s/%s/node_2.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2018-01-02", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        for page in range(2, 10):
            paper = "node_" + str(page) + ".htm"
            if paper.startswith("node"):
                paper_url = response.url.split("node")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//a[@class='hei12']/@href").extract()
        for news in news_list:
            if news.startswith("content"):
                news_url = response.url.split("node")[0] + news
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//founder-content/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://dz.jjckb.cn/www/pages/webpage2009/"

            try:
                author_arr = response.xpath(
                    "//td[@class='black12']//text()").extract()
                author = "".join(author_arr).split("来源:")[1].strip()
                if author == "":
                    author = "经济参考报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//tr/td[@class='hei16b']//text()").extract()
                title_arr1 = response.xpath(
                    "//tr/td[@class='hui12']//text()").extract()
                title = "".join(title_arr).strip() + "".join(
                    title_arr1).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 10
class hzNews(scrapy.Spider):
    name = "hzPaperSpider"
    start_url = "https://hzdaily.hangzhou.com.cn/hzrb/%s/%s/%s/page_list_%s%s%s.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2017-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day, year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//li/a[@target='_parent']/@href").extract()
        for paper in paper_list:
            if paper.startswith("page"):
                paper = paper.replace("page_detail", "article_list")
                paper_url = response.url.split("page")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//ul[@class='page-list']/li/a/@href").extract()
        for news in news_list:
            if news.startswith("article_detail"):
                news_url = response.url.split("article_list")[0] + news
                yield scrapy.Request(url=news_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='content']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "https://hzdaily.hangzhou.com.cn/hzrb/"

            try:
                author = "杭州日报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr1 = response.xpath(
                    "//div[@class='head']/h1//text()").extract()
                title_arr2 = response.xpath(
                    "//div[@class='head']/h2//text()").extract()
                title_arr3 = response.xpath(
                    "//div[@class='head']/h3//text()").extract()
                title = "".join(title_arr1).strip() + "".join(
                    title_arr2).strip() + "".join(title_arr3).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 11
class rdNews(scrapy.Spider):
    name = "rdPaperSpider"
    start_url = "http://media.workercn.cn/sites/media/jlgrb/%s_%s/%s/GR0100.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-01-05", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//table/tr/td/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("GR"):
                paper_url = response.url.split("GR")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//ul/li/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("GR")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='ozoom']/div/span/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://media.workercn.cn/"

            try:
                author_arr = response.xpath(
                    "//div[@class='lai']/span//text()").extract()
                author = "".join(author_arr).strip().split("(")[0]
                if author == "":
                    author = "劳动新闻"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//h1//text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example 12
class bdNews(scrapy.Spider):
    name = "bdPaperSpider"
    start_url = "http://bddsb.bandao.cn/pc/bddsb/%s%s%s/PageA01BC.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2018-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            data_info = date.split("-")
            year = data_info[0]
            month = data_info[1]
            day = data_info[2]
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div[@class='banmianlist_box']/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("Page"):
                paper_url = response.url.split("Page")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header,
                                     dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='bd_newslist']/a/@href").extract()
        for news in news_list:
            # if news.startswith("content"):
            news_url = response.url.split("Page")[0] + news
            yield scrapy.Request(url=news_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2})",
                                        response.text).group(0).replace(
                                            "年", "-").replace(
                                                "月", "-") + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//span[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://bddsb.bandao.cn/pc/bddsb/"

            try:
                author = "半岛都市报"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//div[@class='neirong']/h3//text()").extract()
                title_arr1 = response.xpath(
                    "//div[@class='neirong']/h2//text()").extract()
                title = "".join(title_arr).strip() + "".join(
                    title_arr1).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                #  and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
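Finally, one bug pattern runs through every parse method above: if, say, the public_time regex finds no match, its except branch only logs, the variable is never bound, and the final try block dies with a NameError that the bare "except: pass" silently swallows, so the article is dropped without a trace. Initializing the fields up front, and catching Exception rather than everything (so Ctrl-C still interrupts), keeps the logging while making the drop explicit. A sketch of the reworked skeleton:

    def parse(self, response):
        if response.status != 200:
            spiderUtil.log_level(response.status, response.url)
            return
        html_size = sys.getsizeof(response.text)
        public_time = title = author = None
        content = ""
        try:
            public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                    response.text).group(0) + " 00:00:00"
        except Exception:
            spiderUtil.log_level(8, response.url)
        # ... extract content, title, and author the same way ...
        if public_time and title and author and len(content) > 50:
            item = PaperAllItem()
            item["public_time"] = public_time
            # ... remaining field assignments as in the examples above ...
            yield item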