Example #1
class bjNews(scrapy.Spider):
    name = "chinanetNewsSpider"
    start_url = "http://www.china.com.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://news.china.com.cn/node_7247300.htm",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """/html/body/div/div/ul/li/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                # content_time = response.xpath("""//*[@id="pubtime_baidu"]/text()""").extract()
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)

            except:
                spiderUtil.log_level(8, response.url)

            try:
                contents = response.xpath(
                    """//*[@id="articleBody"]/p/text()""").extract()
                content = "".join(contents)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.china.com.cn/"

            try:
                author_arr = response.xpath(
                    """//*[@id="source_baidu"]//text()""").extract()
                author = "".join(author_arr)
                if author == '':
                    author = "中国网"
                else:
                    author = author.split("来源:")[1].strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "/html/body/div/h1/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
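All of the examples in this section assume the same top-level imports plus a shared spiderUtil helper module that the snippets do not show. A minimal sketch of what they rely on follows; the helper bodies are illustrative guesses inferred from the call sites, not the original implementation:

import json
import random
import re
import sys
import time
import urllib.request
from datetime import datetime, timedelta

import scrapy
from lxml import etree  # used by the ifeng example


class spiderUtil:
    @staticmethod
    def header_util():
        # a plausible default; the real helper may rotate User-Agents
        return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    @staticmethod
    def get_time():
        # current timestamp, "YYYY-MM-DD HH:MM:SS"
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    @staticmethod
    def get_first_hour():
        # "YYYY-MM-DD HH" prefix used to keep only items published in the
        # current hour (inferred from the public_time.startswith() checks)
        return time.strftime("%Y-%m-%d %H", time.localtime())

    @staticmethod
    def get_yesterday_date():
        # "YYYY-MM-DD" for the previous day (used by the wenming example)
        return (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

    @staticmethod
    def log_level(code, url):
        # field-level error codes used by the spiders: 6 = title, 7 = content,
        # 8 = publish time, 9 = author; HTTP status codes are passed through
        print("parse error %s at %s" % (code, url))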
Example #2
class chinaNews(scrapy.Spider):
    name = "chinaSpider"
    start_url = "http://www.chinanews.com/scroll-news/news1.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_list_news,
                             headers=self.header)

    def parse_item_list_news(self, response):
        detail_urls = response.xpath(
            """//*[@id="content_right"]/div/ul/li/div/a/@href""").extract()
        for detail_url in detail_urls:
            if len(detail_url) > 40:
                # time.sleep(3)
                url = "http:" + str(detail_url)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("年", "-").replace(
                        "月", "-").replace("日", "") + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath(
                    "//div[@class='left_zw']/p//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath(
                    "//div/div/div/div/h1//text()").extract()
                title = "".join(title_arr)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//div[@class='left-t']//text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "中国新闻网"
            except:
                spiderUtil.log_level(9, response.url)

            source = "http://www.chinanews.com/"

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
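Every spider in this section yields a NewsAllItem carrying the same eight fields. Assuming the conventional scrapy.Item declaration, the items module backing these examples would look roughly like this:

import scrapy


class NewsAllItem(scrapy.Item):
    source = scrapy.Field()       # homepage of the crawled site
    content = scrapy.Field()      # joined article paragraphs
    public_time = scrapy.Field()  # publish time, "YYYY-MM-DD HH:MM:SS"
    url = scrapy.Field()          # article URL
    title = scrapy.Field()
    author = scrapy.Field()
    crawl_time = scrapy.Field()   # spiderUtil.get_time() at crawl
    html_size = scrapy.Field()    # sys.getsizeof() of the raw page text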
Example #3
class bjNews(scrapy.Spider):
    name = "bjnewsSpider"
    start_url = "http://www.bjnews.com.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse_item_home,headers=self.header)

    def parse_item_home(self, response):
        list_page_arr1 = response.xpath("//div[@class='nav']/a/@href").extract()
        list_page_arr2 = response.xpath("//div[@class='menu_drop_list']/a/@href").extract()
        for list_page in list_page_arr1:
            if not list_page.startswith("http") and list_page != "wevideo" and list_page != "video":
                yield scrapy.Request(url=response.url + list_page[1:], callback=self.parse_item_page_list)
        for list_page in list_page_arr2:
            yield scrapy.Request(url=response.url + list_page[1:], callback=self.parse_item_page_list)

    def parse_item_page_list(self, response):
        for page in range(1, 3):
            yield scrapy.Request(url=response.url + "?page=" + str(page), callback=self.parse_item_news_list)

    def parse_item_news_list(self, response):
        news_url_arr1 = response.xpath("//ul[@id='news_ul']/li/a/@href").extract()
        news_url_arr2 = response.xpath("//ul[@id='news_ul']/li/div/a/@href").extract()
        news_url_arr1.extend(news_url_arr2)
        for news_url in news_url_arr1:
            if news_url.startswith("http"):
                time.sleep(random.uniform(1, 2))
                yield scrapy.Request(url=news_url, callback=self.parse)
            else:
                news_url = "http://www.bjnews.com.cn" + news_url
                time.sleep(random.uniform(1, 2))
                yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("//div[@class='content']/p/text()").extract()
                content = "".join(content_arr)
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.bjnews.com.cn/"

            try:
                author = response.xpath("//span[@class='author']/text()").extract()[0].strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//div[@class='title']/h1/text()").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):

                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #4
class bjNews(scrapy.Spider):
    name = "baiduNewsSpider"
    start_url = "http://news.baidu.com.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        # one request per Baidu News channel
        channels = ["guonei", "guoji", "mil", "finance", "ent", "sports",
                    "internet", "tech", "game", "lady", "auto", "house"]
        for channel in channels:
            yield scrapy.Request(url="https://news.baidu.com/" + channel,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath("""//a/@href""").extract()
        for detail_url in detail_urls:
            if detail_url.startswith("http://baijiahao.baidu.com"):
                time.sleep(random.uniform(1, 2))
                yield scrapy.Request(url=detail_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath("""//*[@id="article"]/div/div/div/span/text()""").extract()
                public_time = str(time.strftime('%Y', time.localtime(time.time()))) +"-"+ str(content_time[0]) + " " + str(content_time[1]) + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath("""//*[@id="article"]/div/p/span/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://news.baidu.com/"

            try:
                author_arr = response.xpath("""//*[@id="article"]/div/div/p/text()""").extract()
                author = "".join(author_arr)
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath("""//*[@id="article"]/div/h2/text()""").extract()
                title = "".join(title_arr)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #5
class wenmingNews(scrapy.Spider):
    name = "wenmingNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.wenming.cn/a/yw/",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """/html/body/div/div/ul/li/div/a/@href""").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/div/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.wenming.cn/"

            try:
                author_arr = response.xpath(
                    "//div[@class='box01']/div[@class='fl']/a//text()"
                ).extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "文明网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//div[@id='title_tex']//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_yesterday_date()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #6
class dazhongNews(scrapy.Spider):
    name = "dazhongSpider"
    start_url = [
        "http://www.dzwww.com/xinwen/guoneixinwen/",
        "http://www.dzwww.com/xinwen/guojixinwen/",
        "http://www.dzwww.com/xinwen/shehuixinwen/"
    ]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_list_news,
                                 headers=self.header)

    def parse_item_list_news(self, response):
        detail_urls = response.xpath("""//h3/a/@href""").extract()
        for detail_url in detail_urls:
            if detail_url.startswith("./"):
                url = response.url + detail_url.replace("./", "")
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath(
                    """//div/div/div/div/div/p/text()""").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath("//div/div/h2/text()").extract()
                title = "".join(title_arr)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    """//*[@id="xl-headline"]/div/div/text()""").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "大众网"
                else:
                    author = author.split("来源: ")[1].split("作者:")[0].strip()
            except:
                spiderUtil.log_level(9, response.url)

            source = "http://www.dzwww.com/"

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
Example #7
class huanqiuNews(scrapy.Spider):
    name = "huanqiuSpider"
    start_url = ["http://world.huanqiu.com/article/?agt=15438",
                 "http://china.huanqiu.com/article/?agt=15438"]

    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_list_news, headers=self.header)

    def parse_item_list_news(self, response):
        detail_urls = response.xpath("""/html/body/div/div/div/ul/li/h5/em/a/@href""").extract()
        for detail_url in detail_urls:
            if "article" in detail_url:
                yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", response.text).group(0) + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath("""/html/body/div/div/div/div/div/p/text()""").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath("/html/body/div/div/div/div/h1/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath("""/html/body/div/div/div/div/div/span/a/text()""").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "环球网"
            except:
                spiderUtil.log_level(9, response.url)

            source = "http://www.huanqiu.com/"

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
Example #8
class fenghuangNews(scrapy.Spider):
    name = "fenghuangNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        url = "http://news.ifeng.com/"
        yield scrapy.Request(url=url, callback=self.parsepage, headers=self.header)

    def parsepage(self,response):
        newslist = response.xpath("//h2/a/@href").extract()
        for url in newslist:
            newsurl = "http://" + url
            yield scrapy.Request(newsurl, callback=self.parsebody)

    def parsebody(self,response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            all_arr = response.xpath("""//script//text()""").extract()
            data = "".join(all_arr).split("allData = ")[1].split("var adData")[0].strip()[:-1]
            data = json.loads(data)
            doc = data['docData']
            try:
                public_time = doc['newsTime']

            except:
                spiderUtil.log_level(8, response.url)

            try:
                content = doc['contentData']['contentList'][0]['data']
                content = "".join(etree.HTML(content).xpath("//p//text()")).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://news.ifeng.com/"

            try:
                author = doc['source']
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = doc['title']
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #9
class eastmoneyNews(scrapy.Spider):
    name = "eastmoneySpider"
    start_url = "http://finance.eastmoney.com/yaowen.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse_item_list_news,headers=self.header)

    def parse_item_list_news(self, response):
        detail_urls = response.xpath("""//p[@class='title']/a/@href""").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url, callback=self.parse,headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group(0).replace("年", "-").replace("月", "-").replace("日", "") + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath("//div[@id='ContentBody']/p//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath("//h1//text()").extract()
                title = "".join(title_arr)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath("//div[@class='source data-source']//text()").extract()
                author = "".join(author_arr).replace("来源:","").strip()
                if author == "":
                    author = "东方财富网"
            except:
                spiderUtil.log_level(9, response.url)

            source = "http://finance.eastmoney.com/"

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
Example #10
class wangyiNews(scrapy.Spider):
    name = "wangyiNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://money.163.com/special/002526BH/rank.html",
            callback=self.parsepage,
            headers=self.header)

    def parsepage(self, response):
        newslist = response.xpath("//div/table/tr/td/a")
        for news in newslist:
            url = news.xpath("./@href").extract()[0]
            title = news.xpath("./text()").extract()[0].strip()
            yield scrapy.Request(url,
                                 callback=self.parsebody,
                                 meta={"title": title})

    def parsebody(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='endText']/p/text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            try:
                author_arr = response.xpath(
                    "//a[@id='ne_article_source']//text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "网易新闻"
            except:
                spiderUtil.log_level(9, response.url)

            source = "https://news.163.com/"

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = response.meta["title"]
                    item["author"] = author
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #11
class ceNews(scrapy.Spider):
    name = "ceSpider"
    start_url = "http://intl.ce.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse_item_home, headers=self.header)

    def parse_item_home(self, response):
        kinds_arr = response.xpath("//div[@class='ceallnava']/ul/li/a/@href").extract()
        for kinds in kinds_arr:
            if kinds != "http://intl.ce.cn/specials/":
                yield scrapy.Request(url=kinds, callback=self.parse_item_kinds)

    def parse_item_kinds(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_item_page_list, dont_filter=True)

    def parse_item_page_list(self, response):
        news_url_arr = response.xpath("//span[@class='f1']/a/@href").extract()
        for news_url in news_url_arr:
            if news_url.startswith("http") and "more" not in news_url:
                yield scrapy.Request(url=news_url, callback=self.parse)
            else:
                head = response.url.split("/")
                if news_url.startswith("../../"):
                    news_url = "/".join(head[:3]) + "/" + news_url.replace("../../", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)
                elif news_url.startswith("../"):
                    news_url = "/".join(head[:4]) + "/" + news_url.replace("../", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)
                elif news_url.startswith("./"):
                    news_url = response.url.split("index")[0] + news_url.replace("./", "")
                    yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath("//div[@id='articleText']//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                public_time = (re.search(r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group(
                    0) + ":00").replace(
                    "年", "-").replace("月", "-").replace("日", "")
            except:
                spiderUtil.log_level(8, response.url)

            source = "http://www.ce.cn/"
            try:
                title = response.xpath("//head/title/text()").extract()[0].split("_")[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author = response.xpath("//head/meta[@name='author']/@content").extract()[0]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                if public_time.startswith(spiderUtil.get_first_hour()) and content != "":
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
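The hand-rolled handling of "../../", "../", and "./" prefixes in parse_item_page_list above can be replaced with urllib.parse.urljoin, which resolves all of those forms against the page URL in one call. A minimal sketch:

from urllib.parse import urljoin

def resolve_detail_url(page_url, href):
    # urljoin handles "./", "../", "../../", and absolute hrefs alike, e.g.
    # resolve_detail_url("http://intl.ce.cn/qqss/index.shtml", "../gjzh/a.shtml")
    # returns "http://intl.ce.cn/gjzh/a.shtml"
    return urljoin(page_url, href)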
Example #12
class bjNews(scrapy.Spider):
    name = "ckxxNewsSpider"
    start_url = "http://www.cankaoxiaoxi.com/"
    header = spiderUtil.header_util()

    def start_requests(self):
        # one request per cankaoxiaoxi.com channel subdomain
        subdomains = ["china", "world", "mil", "finance", "culture", "science"]
        for sub in subdomains:
            yield scrapy.Request(url="http://" + sub + ".cankaoxiaoxi.com/",
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """//*[@id="allList"]/div/div/div/div/p/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath(
                    """//*[@id="pubtime_baidu"]/text()""").extract()
                public_time = str(content_time[0])

            except:
                spiderUtil.log_level(8, response.url)

            try:
                contents = response.xpath(
                    """//*[@id="allList"]/div/div/div/p/text()""").extract()
                content = "".join(contents)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.cankaoxiaoxi.com/"

            try:
                author = str(
                    response.xpath("""//*[@id="source_baidu"]/text()""").
                    extract()[0].strip()).replace("来源:", "")

            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("//div/div/h1/text()").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    # if content != "" :
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #13
class sohuNews(scrapy.Spider):
    name = "xinhuaNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={}&cnt={}&tp=1&orderby=1'
        num = 30000
        pgnum = 1
        while num > 0:
            cnt = (num - 1) % 200 + 1
            # format into a fresh variable so the URL template is not
            # overwritten on the first pass
            page_url = url.format(pgnum, cnt)
            pgnum += 1
            num -= cnt
            yield scrapy.Request(url=page_url, callback=self.parse_item_home, headers=self.header)

    def parse_item_home(self, response):
        data_str = response.text
        data_str = data_str[1:-1]
        data_str = eval(data_str, type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        data_str = data_str['data']['list']
        for r in data_str:
            try:
                public_time = datetime.strptime(r['PubTime'], '%Y-%m-%d %H:%M:%S')
            except:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['Author'])
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['Title'])
            except:
                spiderUtil.log_level(6, response.url)
            r_url = r['LinkUrl']
            yield scrapy.Request(url=r_url, callback=self.parse, headers=self.header,
                                 meta={"public_time": public_time, "title": title, "author": author})

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_arr = response.xpath("""//div[contains(@id,'detail')]//p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.xinhuanet.com/"

            try:
                if content != "" and str(response.meta["public_time"]).startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = response.meta["public_time"]
                    item["url"] = response.url
                    item["title"] = response.meta["title"]
                    item["author"] = response.meta["author"]
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
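The list endpoint here answers with a JSONP-like payload: the JSON body arrives wrapped in parentheses, and the spider unwraps it with eval() plus a dict subclass that resolves any unknown name (true, false, null) to its own string. That works, but it executes server-supplied text. Assuming the padding encloses standard JSON, which json.loads already accepts, a safer sketch is:

import json
import re

def parse_jsonp(payload):
    # strip a wrapper like "callback({...})" or "({...});" and parse the body;
    # json.loads natively handles true/false/null, the usual reason eval() gets used
    match = re.search(r"\((.*)\)\s*;?\s*$", payload, re.S)
    if match is None:
        raise ValueError("not a JSONP payload")
    return json.loads(match.group(1))

The same eval()-based unwrapping trick reappears in the xuexiqiangguo example (#15) below.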
Example #14
class cyolOld(scrapy.Spider):
    name = "cyolSpider"
    ids = {
        767, 548, 777, 885, 895, 886, 785, 797, 803, 804, 811, 812, 814, 818,
        826, 834, 837, 842, 550, 849
    }
    header = spiderUtil.header_util()

    def start_requests(self):
        tmp = "https://zqbapp.cyol.com/zqzxapi/api.php?s=/Web/getNewsListCache/version/3.0.8/tid/%s/page/%s"
        for id in self.ids:
            for page in range(1, 2):
                url = tmp % (id, page)
                yield scrapy.Request(url=url,
                                     callback=self.parse_item_news_list,
                                     headers=self.header)

    def parse_item_news_list(self, response):
        p = re.compile(r'[(](.*)[)]', re.S)
        r = re.findall(p, response.body.decode('utf-8'))[0]
        json_loads = json.loads(r)
        data = json_loads["data"]
        for i in data:
            newsurl = i['newsurl']
            yield scrapy.Request(url=newsurl, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath(
                    "//div[@class='section-main']/p/text()").extract()
                content = "".join(content_arr)
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                public_time = re.search(r"(\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                                        response.text).group(0) + ":00"
                public_time = response.url.split(
                    "/")[-3][:-2] + "-" + public_time
            except:
                # spiderUtil.log_level(8, response.url)
                pass

            try:
                title_arr = response.xpath("//head/title//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath(
                    "//span[@id='copyfrom']//text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "中青在线"
            except:
                spiderUtil.log_level(9, response.url)

            source = "http://www.cyol.com/"

            try:
                if content != "" and len(
                        content) >= 100 and public_time.startswith(
                            spiderUtil.get_first_hour()):

                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item off to the pipelines for processing
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
Example #15
class sohuNews(scrapy.Spider):
    name = "xuexiqiangguoNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        home_url = 'https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26044379'
        yield scrapy.Request(url=home_url,
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        data_str = response.text
        data_str = data_str[1:-1]
        data_str = eval(
            data_str,
            type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        for r in data_str:
            try:
                public_time = str(r['publishTime'])
            except:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['source'])
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['title'])
            except:
                spiderUtil.log_level(6, response.url)
            try:
                # the article link is assumed to live under r['url']
                r_url1 = str(r['url']).split("id=")[-1]
                r_url = "https://boot-source.xuexi.cn/data/app/" + r_url1 + ".js?callback=callback"
                yield scrapy.Request(url=r_url,
                                     callback=self.parse,
                                     headers=self.header,
                                     meta={
                                         "public_time": public_time,
                                         "title": title,
                                         "author": author
                                     })
            except:
                pass

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            data_str = response.text
            data_str = data_str[9:-1]
            data_str = eval(
                data_str,
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
            data_str = json.dumps(data_str)
            data_str = json.loads(data_str)
            try:
                content = str(data_str['normalized_content'])
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.xuexi.cn/"

            try:
                if content != "" and str(
                        response.meta["public_time"]).startswith(
                            spiderUtil.get_first_hour()):
                    # if content != "" :
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = response.meta["public_time"]
                    item["url"] = response.url
                    item["title"] = response.meta["title"]
                    item["author"] = response.meta["author"]
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #16
class qqNews(scrapy.Spider):
    name = "qqNewsSpider"
    taglist = [
        'ent', 'sports', 'finance', 'tech', 'news', 'sports_nba', 'fashion'
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for tag in self.taglist:
            url = 'https://pacaio.match.qq.com/openapi/json?key=' + tag + ':' + spiderUtil.get_time(
            )[:-9].replace("-", "") + '&num=50'
            yield scrapy.Request(url=url,
                                 callback=self.parsepage,
                                 headers=self.header)

    def parsepage(self, response):
        newsjson = json.loads(response.text)
        newslist = newsjson['data']
        for news in newslist:
            url = news['url']
            public_time = news['publish_time']
            author = news['source']
            title = news['title']
            meta = {
                'title': title,
                'public_time': public_time,
                'author': author
            }
            yield scrapy.Request(url, callback=self.parsebody, meta=meta)

    def parsebody(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_arr = response.xpath(
                    "//div[@class='content-article']/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "https://news.qq.com/"

            try:
                if content != "" and str(
                        response.meta["public_time"]).startswith(
                            spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = response.meta["public_time"]
                    item["url"] = response.url
                    item["title"] = response.meta["title"]
                    item["author"] = response.meta["author"]
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #17
class bjNews(scrapy.Spider):
    name = "71cnNewsSpider"
    start_url = "http://www.71.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        # one request per 71.cn column page
        columns = ["bjyw", "expcolumn/economy", "expcolumn/politics",
                   "expcolumn/culture", "expcolumn/community",
                   "expcolumn/ecology", "expcolumn/dangjian",
                   "expcolumn/law", "expcolumn/keji", "expcolumn/jiaoyu",
                   "expcolumn/nationaldefense", "expcolumn/international",
                   "expcolumn/history", "impremarks"]
        for column in columns:
            yield scrapy.Request(url="http://www.71.cn/acastudies/%s/" % column,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """/html/body/div/div/div/div/ul/li/div/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath(
                    """//*[@id="main"]/div/div/div/div/div/span[1]/text()"""
                ).extract()
                public_time = str(str(content_time[0]) + ":00")

            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    """//*[@id="article-content"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.71.cn/"

            try:
                author_arr = response.xpath(
                    """//*[@id="main"]/div/div/div/div/div/span[2]/text()"""
                ).extract()
                author = "".join(author_arr)
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath(
                    """//*[@id="main"]/div/div/div/div/h1/text()""").extract(
                    )[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    # if content != "" :
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(content, public_time, title, author)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Example #18
class cctvNews(scrapy.Spider):
    name = "cctvNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url="http://news.cctv.com/?spm=C96370.PsikHJQ1ICOX.Eu7sfGTzJUS0.1",
            callback=self.parse_item_home,
            headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """//a[starts-with(@href,'http://news.cctv.com/20')]/@href"""
        ).extract()
        for detail_url in detail_urls:
            if len(detail_url) < 70:
                time.sleep(random.uniform(1, 2))
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("年", "-").replace(
                        "月", "-").replace("日", "") + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    """/html/body/div/div/div/p/text()""").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.southcn.com/"

            try:
                author_arr = response.xpath(
                    """/html/body/div/div/div/div/span/i/a/text()""").extract(
                    )
                author = "".join(author_arr).strip()
                if author == "":
                    author = "央视网"
                else:
                    author = author.replace("来源:", "")
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    """/html/body/div/div/div/h1/text()""").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
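Several of these spiders (#2, #9, #11, and #18 above) share the same replace() chain for normalizing a Chinese "YYYY年MM月DD日 HH:MM" timestamp. Factored into a helper, the shared logic looks like this:

import re

def normalize_cn_time(text):
    # find a "2019年7月1日 10:30"-style timestamp and return
    # "2019-7-1 10:30:00", or None when no timestamp is present
    m = re.search(r"\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2}", text)
    if m is None:
        return None
    return m.group(0).replace("年", "-").replace("月", "-").replace("日", "") + ":00"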
Example #19
class cniiNews(scrapy.Spider):
    name = "cniiSpider"
    start_url = ["http://www.cnii.com.cn/node_33989.htm",
                 "http://www.cnii.com.cn/node_34000.htm",
                 "http://www.cnii.com.cn/telecom/node_34020.htm",
                 "http://www.cnii.com.cn/city/node_34051.htm"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_list_news, headers=self.header)
            # for page in range(2, 4):
            #     yield scrapy.Request(url=url.replace(".htm", "_" + str(page) + ".htm"),
            #                          callback=self.parse_item_list_news)

    def parse_item_list_news(self, response):
        url_arr = response.xpath("//ul[@class='list2']/li/a/@href").extract()
        for url in url_arr:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=response.url.split("node")[0] + url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0)
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath("//div[@class='conzw']//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath("//head/title//text()").extract()
                title = "".join(title_arr).split("_")[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                author_arr = response.xpath("//div[@class='conzz']//text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "中国信息产业网"

            except:
                spiderUtil.log_level(9, response.url)

            source = "http://www.cnii.com.cn/"

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
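
All of these spiders lean on a shared spiderUtil module that the examples never show. The following is a minimal sketch of what it would need to provide, reconstructed from how it is called above; the implementations are assumptions, not the original module. In particular, get_first_hour() is assumed to return the current hour as a "YYYY-MM-DD HH" prefix, since every spider feeds it to public_time.startswith(...):

# spiderUtil.py -- hypothetical stand-in for the project's shared helpers.
import datetime
import logging

def header_util():
    # A plain browser User-Agent; the real module may rotate several.
    return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/74.0.3729.157 Safari/537.36"}

def get_time():
    # Crawl timestamp in the same "YYYY-MM-DD HH:MM:SS" shape as public_time.
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def get_first_hour():
    # Hour prefix for the freshness filter, e.g. "2019-07-01 12".
    return datetime.datetime.now().strftime("%Y-%m-%d %H")

def log_level(code, url):
    # Codes seen in the spiders: 6 = title, 7 = content, 8 = publish time,
    # 9 = author; anything else is an HTTP status code.
    logging.warning("field error %s at %s", code, url)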
Exemple #20
class sohuNews(scrapy.Spider):
    name = "sohuNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        url_list = [
            15, 10, 9, 8, 17, 18, 19, 23, 24, 25, 26, 27, 28, 29, 30, 34, 38,
            39, 40, 41, 42, 43, 44, 45, 46, 47
        ]
        for i in url_list:
            url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=' + str(
                i) + '&page=1&size=80'
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        # The feed endpoint returns a JSON array of articles; parse the Scrapy
        # response directly instead of re-fetching the same URL with urllib
        # and round-tripping the payload through eval().
        for r in json.loads(response.text):
            try:
                public_time_rt = datetime.fromtimestamp(r['publicTime'] //
                                                        1000)
                public_time = datetime.strftime(public_time_rt,
                                                '%Y-%m-%d %H:%M:%S')
            except:
                spiderUtil.log_level(8, response.url)
            try:
                author = str(r['authorName'])
            except:
                spiderUtil.log_level(9, response.url)
            try:
                title = str(r['title'])
            except:
                spiderUtil.log_level(6, response.url)
            url = 'http://www.sohu.com/a/' + str(r['id']) + '_' + str(
                r['authorId'])

            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 headers=self.header,
                                 meta={
                                     'public_time': public_time,
                                     'url': url,
                                     'title': title,
                                     'author': author
                                 })

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_arr = response.xpath(
                    """//article//p//text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://news.sohu.com/"

            try:
                if content != "" and str(
                        response.mete['public_time']).startswith(
                            spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["public_time"] = response.mete['public_time']
                    item["url"] = response.mete['url']
                    item["title"] = response.mete['title']
                    item["author"] = response.mete['author']
                    item["source"] = source
                    item["content"] = content
                    item["html_size"] = html_size
                    item["crawl_time"] = spiderUtil.get_time()
                    # print(self.item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
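
One detail worth calling out in the Sohu feed handler: r['publicTime'] is a millisecond Unix epoch, hence the floor division by 1000 before datetime.fromtimestamp() sees it. A quick illustration with a hypothetical timestamp:

from datetime import datetime

millis = 1561950000000  # made-up millisecond epoch, as the feed would report it
public_time_rt = datetime.fromtimestamp(millis // 1000)
print(datetime.strftime(public_time_rt, '%Y-%m-%d %H:%M:%S'))
# -> 2019-07-01 11:00:00 when run in the Asia/Shanghai timezone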
Exemple #21
class xuanjiangjiaNews(scrapy.Spider):
    name = "188cfwNewsSpider"
    start_url = "http://www.188cf.net/"
    header = spiderUtil.header_util()

    def start_requests(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Host': 'www.188cf.net',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'http://www.188cf.net/'
        }
        # Anti-bot session cookies; these values are tied to one browsing
        # session and need refreshing before each run.
        Cookie = {
            'security_session_verify': '87456e2f5288fc9e54c5508487c724ae',
            'security_session_mid_verify': '2a568bbe22a10d992a688d95dc309586'
        }

        yield scrapy.Request(url="http://www.188cf.net/gegu/", callback=self.parse_item_home, headers=headers, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/gupiao/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/licai/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/jijin/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/qihuo/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/huangjin/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/waihui/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/zhaiquan/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/caijing/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/yinhang/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)
        # yield scrapy.Request(url="http://www.188cf.net/xueyuan/", callback=self.parse_item_home,headers=self.header, cookies=Cookie)

    def parse_item_home(self, response):
        detail_urls = response.xpath("""//div[@class="bt"]//@href""").extract()
        for detail_url in detail_urls:
            # print(detail_url)
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath("""//div[@class="info"]//text()""").extract()
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", content_time[0]).group(0)
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arrs = response.xpath("""//td[@class="content"]//p//text()""").extract()
                # Join the paragraphs, then cut off the trailing "推荐信息"
                # (recommended reading) block.
                content = "".join(content_arrs).split("推荐信息")[0]
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.188cf.net/"

            try:
                author = "188财富网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title = response.xpath("""//h1//text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and str(public_time).startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Exemple #22
class rednetNews(scrapy.Spider):
    name = "rednetNewsSpider"
    start_url = "http://www.rednet.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="https://jishou.rednet.cn/channel/7250.html",
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """//*[@id="div_newsList"]/ul/li/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath(
                    "//main/section/section/div/span[4]/text()").extract()
                public_time = str(content_time[0]) + ":00"

            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "/html/body/main/section/section/article/section/p/text()"
                ).extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.rednet.cn/"

            try:
                author_arr = response.xpath(
                    "//main/section/section/div/span[1]/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "红网"
                else:
                    author = author.split("来源:")[1]
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    "//main/section/section/h1/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Exemple #23
class ceNews(scrapy.Spider):
    name = "chinadailySpider"
    start_url = [
        "http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff636",
        "http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff62e",
        "http://world.chinadaily.com.cn/5bd55927a3101a87ca8ff610",
        "http://world.chinadaily.com.cn/5bda6641a3101a87ca904fe6",
        "http://caijing.chinadaily.com.cn/finance/",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48a",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48c",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf48b",
        "http://cn.chinadaily.com.cn/lvyou/5b7628c6a310030f813cf493",
        "http://cn.chinadaily.com.cn/lvyou/5bac7d20a3101a87ca8ff52d",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf467",
        "http://cn.chinadaily.com.cn/jiankang",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf461",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf462",
        "http://fashion.chinadaily.com.cn/5b762404a310030f813cf463",
        "http://fashion.chinadaily.com.cn/5b8f77a7a310030f813ed4c8"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        for page in range(1, 2):
            if response.url.endswith("/"):
                list_url = response.url + "page_" + str(page) + ".html"
                yield scrapy.Request(url=list_url,
                                     callback=self.parse_item_list_news)
            else:
                list_url = response.url + "/page_" + str(page) + ".html"
                yield scrapy.Request(url=list_url,
                                     callback=self.parse_item_list_news)

    def parse_item_list_news(self, response):
        url_arr = response.xpath("//h3/a/@href").extract()
        for url in url_arr:
            url = "http:" + url
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                content_arr = response.xpath(
                    "//div[@id='Content']//text()").extract()
                content = "".join(content_arr).strip()
                # content = "".join(content_tmp.split())
            except:
                spiderUtil.log_level(7, response.url)

            try:
                title_arr = response.xpath("//head/title//text()").extract()
                title = "".join(title_arr).strip().strip()[:-8]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//head/meta[@name='author']/@content").extract()
                if author == []:
                    author = "中国日报网"
                else:
                    author = author[0]
            except:
                spiderUtil.log_level(9, response.url)
            source = "http://www.chinadaily.com.cn/"

            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines
                    # print(item)
                    yield item
            except:
                pass

        else:
            spiderUtil.log_level(response.status, response.url)
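
Note that parse_item_home above only requests page_1.html for each channel, because range(1, 2) yields a single value. Widening the range is how the crawl would go deeper; a quick check of the list URLs it would generate (the base here is just one entry from start_url):

base = "http://caijing.chinadaily.com.cn/finance/"
for page in range(1, 4):  # pages 1-3 instead of only page 1
    print(base + "page_" + str(page) + ".html")
# http://caijing.chinadaily.com.cn/finance/page_1.html
# http://caijing.chinadaily.com.cn/finance/page_2.html
# http://caijing.chinadaily.com.cn/finance/page_3.html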
Exemple #24
class peopleNews(scrapy.Spider):
    name = "peopleNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        now = int(time.time())
        url = 'http://news.people.com.cn/210801/211150/index.js?_=' + str(now)
        yield scrapy.Request(url,
                             callback=self.parse_item_home,
                             headers=self.header)

    def parse_item_home(self, response):
        newsjson = json.loads(response.text)
        newslist = newsjson['items']
        for news in newslist:
            post_url = str(news['url'])
            yield scrapy.Request(url=post_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("年", "-").replace(
                        "月", "-").replace("日", " ") + ":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    "//div[@id='rwb_zw']/p//text()").extract()
                content = "".join(content_arr).strip()
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.people.com.cn/"

            try:
                author_arr = response.xpath(
                    "//div[@class='box01']/div[@class='fl']/a//text()"
                ).extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "人民网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath("//h1//text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
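
Every example populates the same nine fields of NewsAllItem, whose definition lives elsewhere in the project. A plausible reconstruction, based purely on the field names assigned above, would be:

import scrapy

class NewsAllItem(scrapy.Item):
    # Field names reconstructed from the spiders; comments are assumptions.
    source = scrapy.Field()       # homepage URL of the news site
    content = scrapy.Field()      # joined article body paragraphs
    public_time = scrapy.Field()  # "YYYY-MM-DD HH:MM:SS" publish time
    url = scrapy.Field()          # article URL
    title = scrapy.Field()        # article headline
    author = scrapy.Field()       # byline, or the site name as fallback
    crawl_time = scrapy.Field()   # spiderUtil.get_time() at crawl time
    html_size = scrapy.Field()    # sys.getsizeof() of the raw page text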
Exemple #25
class comnewsNews(scrapy.Spider):
    name = "zgswxwwNewsSpider"
    start_url = "http://www.comnews.cn/"
    header = spiderUtil.header_util()

    def start_requests(self):
        # The eight channel list pages share one URL pattern, so build them in a loop.
        channels = ["pnews", "photo", "international", "local",
                    "dzone", "abing", "ibdnews", "home"]
        for channel in channels:
            yield scrapy.Request(url="http://www.comnews.cn/article/" + channel + "/",
                                 callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        detail_urls = response.xpath(
            """//ul[@class="col-ls alist"]//@href""").extract()
        for detail_url in detail_urls:
            # time.sleep(random.uniform(1, 2))
            detail_url = "http://www.comnews.cn" + detail_url
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                content_time = response.xpath(
                    """//meta[@name="PubDate"]//@content""").extract()
                public_time = str(content_time[0])

            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath(
                    """//div[@class="content"]//p//text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.comnews.cn/"

            try:
                author = response.xpath(
                    """//meta[@name="ContentSource"]//@content""").extract(
                    )[0].strip()
            except:
                spiderUtil.log_level(9, response.url)

            try:
                titles = response.xpath(
                    """//meta[@name="ArticleTitle"]//@content""").extract()
                title = "".join(titles)
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and str(public_time).startswith(
                        spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
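
This spider is the only one in the set that reads its fields from <meta> tags (PubDate, ContentSource, ArticleTitle) instead of positional body XPaths; where a site supplies them, that approach tends to survive template redesigns better. A minimal illustration of the pattern against a made-up HTML snippet:

from scrapy.selector import Selector

html = '<html><head><meta name="PubDate" content="2019-07-01 09:30"/></head></html>'
sel = Selector(text=html)
print(sel.xpath('//meta[@name="PubDate"]/@content').extract()[0])
# -> 2019-07-01 09:30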
Exemple #26
class southnetNews(scrapy.Spider):
    name = "southnetNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.southcn.com/pc2018/yw/node_384370.htm", callback=self.parse_item_home, headers=self.header)   # 要闻新闻
        yield scrapy.Request(url="http://news.southcn.com/gd/", callback=self.parse_item_home, headers=self.header)  # 广东新闻
        yield scrapy.Request(url="http://news.southcn.com/china/default.htm", callback=self.parse_item_home, headers=self.header)  # 中国新闻
        yield scrapy.Request(url="http://news.southcn.com/international/default.htm", callback=self.parse_item_home, headers=self.header)  # 国际新闻
        yield scrapy.Request(url="http://news.southcn.com/community/", callback=self.parse_item_home, headers=self.header)  # 社会新闻
        yield scrapy.Request(url="http://kb.southcn.com/default.htm", callback=self.parse_item_home, headers=self.header)  # 南方快报
        yield scrapy.Request(url="http://news.southcn.com/g/node_74681.htm", callback=self.parse_item_home, headers=self.header)  # 权威公告

    def parse_item_home(self, response):
        detail_urls = response.xpath("""//div/div/div/div/div/h3/a/@href""").extract()
        for detail_url in detail_urls:
            time.sleep(random.uniform(1, 2))
            yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                public_time = str(response.xpath("""//*[@id="pubtime_baidu"]/text()""").extract()[0].strip())+":00"
            except:
                spiderUtil.log_level(8, response.url)

            try:
                content_arr = response.xpath("""//*[@id="content"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "http://www.southcn.com/"

            try:
                author_arr = response.xpath("""//*[@id="source_baidu"]/text()""").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "澎湃新闻"
                else:
                    author = author.replace("来源:","")
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath("""//*[@id="article_title"]/text()""").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(spiderUtil.get_first_hour()):
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
Exemple #27
class sinaNews(scrapy.Spider):
    name = "sinaNewsSpider"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(
            url=
            "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page=1",
            callback=self.parse_item_home,
            headers=self.header)

    def parse_item_home(self, response):
        # The roll API already returned JSON to Scrapy; decode it directly
        # rather than issuing a second, duplicate request with requests.get().
        data = json.loads(response.text)
        datas = data['result']['data']
        for dict_data in datas:
            yield scrapy.Request(url=dict_data['url'], callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)

            try:
                if response.xpath(
                        """//*[@id="top_bar"]/div/div/span[1]/text()"""
                ).extract():
                    content_time = response.xpath(
                        """//*[@id="top_bar"]/div/div/span[1]/text()"""
                    ).extract()
                    public_datetime = str(content_time[0]).replace(
                        "年", "-").replace("月", "-").replace("日", "")
                    public_time = public_datetime + ":00"
                else:
                    if response.xpath(
                            """//*[@id="top_bar"]/div/div[2]/span/text()"""
                    ).extract():
                        content_time = response.xpath(
                            """//*[@id="top_bar"]/div/div[2]/span/text()"""
                        ).extract()
                        public_datetime = str(content_time[0]).replace(
                            "年", "-").replace("月", "-").replace("日", "")
                        public_time = public_datetime + ":00"
                    else:
                        content_time = response.xpath(
                            """//*[@id="pub_date"]/text()""").extract()
                        public_datetime = str(content_time[0]).replace(
                            "年", "-").replace("月", "-").replace("日", " ")
                        public_time = public_datetime + ":00"

            except:
                spiderUtil.log_level(8, response.url)

            try:
                if response.xpath(
                        """//*[@id="artibody"]/p/text()""").extract():
                    content_arr = response.xpath(
                        """//*[@id="artibody"]/p/text()""").extract()
                else:
                    content_arr = response.xpath(
                        """//*[@id="article"]/p/text()""").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)

            source = "https://news.sina.com.cn/"

            try:
                author_arr = response.xpath(
                    """//span[@class="source"]//text()""").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "新浪网"
            except:
                spiderUtil.log_level(9, response.url)

            try:
                title_arr = response.xpath(
                    """/html/body/div/h1/text()""").extract()
                if title_arr:
                    title = title_arr[0]
                else:
                    title = response.xpath(
                        """//*[@id="artibodyTitle"]/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)

            try:
                if content != "" and public_time.startswith(
                        spiderUtil.get_first_hour()):
                    # if content != "":
                    item = NewsAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
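
The nested if/else XPath fallbacks in the Sina spider could be flattened with a small first-match helper; this is a refactoring sketch, not part of the original code, and it only collapses the selection step (the three publish-time branches also differ in how they rewrite "日", which would still need per-branch handling):

def first_xpath(response, *queries):
    # Return the extracted result of the first XPath that matches anything.
    for query in queries:
        result = response.xpath(query).extract()
        if result:
            return result
    return []

# Hypothetical usage inside sinaNews.parse for the title lookup:
# title_arr = first_xpath(response,
#                         """/html/body/div/h1/text()""",
#                         """//*[@id="artibodyTitle"]/text()""")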