Code Example #1
File: cnblogs_spider.py  Project: oujx28/Spider_study
    def parse(self, response):
        papers = response.xpath(".//div[@class='day']")

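        # Debugging helper: inspect_response drops the crawl into an interactive
        # shell for this response (handy during development, usually removed later)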
        from scrapy.shell import inspect_response
        inspect_response(response, self)

        for paper in papers:
            url = paper.xpath(
                ".//div[@class='postTitle']/a/@href").extract_first()
            title = paper.xpath(
                ".//div[@class='postTitle']/a/text()").extract_first()
            time = paper.xpath(
                ".//div[@class='dayTitle']/a/text()").extract_first()
            content = paper.xpath(
                ".//div[@class='postCon']/div/text()").extract_first()
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item
            yield request

        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
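Most of these examples fill a CnblogspiderItem and hand it to a second-level callback named parse_body through request.meta, but neither the item class nor that callback is shown. A minimal sketch of what they might look like is below; the field names are taken from the examples, while the extra body field and its XPath are assumptions for illustration.

import scrapy


class CnblogspiderItem(scrapy.Item):
    # fields referenced across the examples
    url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
    # assumed extra field filled in by the second-level parse
    body = scrapy.Field()


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    start_urls = ['https://www.cnblogs.com/qiyeboy/default.html?page=1']

    def parse_body(self, response):
        # retrieve the partially filled item passed along via request.meta
        item = response.meta['item']
        # the XPath for the full post body is an assumption for illustration
        item['body'] = response.xpath("//*[@id='cnblogs_post_body']").extract_first()
        yield item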
Code Example #2
    def parse(self, response):
        # Parse the listing page

        # First, extract all article blocks
        papers = response.xpath(".//*[@class='day']")
        for paper in papers:
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
            title = paper.xpath(
                ".//*[@class='postTitle']/a/text()").extract()[0]
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            content = paper.xpath(
                ".//*[@class='postCon']/div/text()").extract()[0]
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item  # stash the item for the body callback
            # print (item)
            yield request
        # The yield keyword turns the parse method into a generator
        next_page = Selector(response).re(
            u'<a href="(\S*)">下一页</a>')  # returns the link to the next page
        #next_page=Selector(response).xpath("//div[@id='nav_next_page']/a/@href").extract()
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #3
    def parse(self, response):
        # Parse the listing page
        # First, extract all articles
        papers = response.xpath(".//*[@class='day']")
        # Extract data from each article

        for paper in papers:
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
            title = paper.xpath(
                ".//*[@class='postTitle']/a/text()").extract()[0]
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            content = paper.xpath(
                ".//*[@class='postCon']/div/text()").extract()[0]
            # print url,title,time,content
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)
            #request = scrapy.Request(url=url,callback=self.parse_body)
            #request.meta['item'] = item # stash the item
            yield item
        # Pagination
        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #4
File: cnblog.py  Project: yunwangjun/python_code
    def parse(self, response):
        # print(response.body)
        # filename = "cnblog.html"
        # with open(filename, 'w') as f:
        #     f.write(response.body)

        # collection to hold the blog items
        items = []

        for each in response.xpath(".//*[@class='day']"):
            item = CnblogspiderItem()
            url = each.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
            title = each.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
            time = each.xpath('.//*[@class="dayTitle"]/a/text()').extract()[0]
            content = each.xpath('.//*[@class="postCon"]/div/text()').extract()[0]

            item['url'] = url
            item['title'] = title
            item['time'] = time
            print(content)
            item['content'] = content

            yield item

        next_page = response.selector.re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #5
    def parse(self, response):
        # Parse the page
        # First, extract all articles
        papers = response.xpath(".//*[@class='day']")
        for paper in papers:
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]

            title = paper.xpath(
                ".//*[@class='postTitle']/a/text()").extract()[0]
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            content = paper.xpath(
                ".//*[@class='c_b_p_desc']/text()").extract()[0]
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item
            # yield item
            yield request
            # print url,title,time
        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        print(next_page)
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #6
    def parse(self, response):
        # First grab each article as one block; this works better than extracting
        # every field into its own list and stitching the lists together afterwards

        papers = response.xpath('//div[@class="day"]')
        for paper in papers:
            url = paper.xpath('div[@class="postTitle"]/a/@href').extract()[0]
            title = paper.xpath(
                'div[@class="postTitle"]/a/text()').extract()[0]
            time = paper.xpath('div[@class="dayTitle"]/a/text()').extract()[0]
            content = paper.xpath(
                'div[@class="postCon"]/div/text()').extract()[0]
            # print(url,title,time,content)
            # an alternative way to construct the item
            item = CnblogspiderItem(
                url=url, title=title, time=time,
                content=content)  # could also do item = CnblogspiderItem() and assign fields like dict keys
            yield item
        # print(response.text.find('下一页'))
        # re.S is not strictly necessary here
        # * matches zero or more times; re.match("a*", "  aab") gives '' because
        # a zero-length match at the start also satisfies the pattern
        # \s matches a whitespace character
        # \S matches a non-whitespace character
        # selectors and regexes can be tested interactively with `scrapy shell <url>`
        next_page = re.search('<a href="(\S*?)">\s*下一页\s*</a>', response.text,
                              re.S)

        # print('='*80)
        # subsequent pages are then crawled automatically
        # only schedule a request if there is still a page left to crawl
        if next_page:
            yield scrapy.Request(url=next_page.group(1), callback=self.parse)
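As the comments above note, XPath and regex patterns can be tested interactively before being dropped into the spider. A typical scrapy shell session might look like the following sketch (the URL comes from the next-page comment in Code Example #13; output is omitted):

$ scrapy shell "https://www.cnblogs.com/qiyeboy/default.html"
>>> response.xpath('//div[@class="day"]')                                 # one selector per article block
>>> response.xpath('//div[@class="postTitle"]/a/text()').extract()        # all titles on the page
>>> response.xpath('//div[@class="postTitle"]/a/@href').extract_first()   # first article URL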
Code Example #7
    def parse(self, response):
        inner_link = None
        print("start_urls", self.start_urls)
        #  all_urls = set()
        urls = set()
        all_urls = set()
        #  print(self.start_urls[0])
        all_link = response.xpath(
            "//*//@src | //*//@href | //*//@url |  //*//@ocde").extract(
            )  # all links in the page
        #item = CnblogspiderItem()

        for link in all_link:
            all_urls.add(link)
            item = CnblogspiderItem()
            item['start_link'] = self.start_urls[0]
            item['hash_start_link'] = hashlib.md5(
                self.start_urls[0].encode()).hexdigest()

            if not link.startswith('http'):
                inner_link = link
                print("111: " + str(inner_link) + "==>from: " + str(response))
                link = urljoin(self.start_urls[0], link)

            if urlparse(link).netloc != urlparse(self.start_urls[0]).netloc:
                print("外链" + link + "==>from: " + str(response))
                out_link = link
                inner_link = str()
                item['link'] = link + str(response)
                # item['from_link'] = str(response)
                yield item
            else:
                if inner_link:
                    item['link'] = inner_link + " internal link " + str(response)
                else:
                    item['link'] = link + " internal link " + str(response)

                urls.add(link)
                print("内链" + link + "==>from: " + str(response))
                #item['from_link'] = str(response)
            self.layer += 1
            print("layer", str(self.layer))

            yield item
        print("all_urls:" + str(all_urls))
        new_urls = urls - self.ALL_urls
        urls_list = list(new_urls)
        self.ALL_urls = self.ALL_urls.union(urls)

        #print("内链:" + urls_list)
        for url in urls_list:
            print("0000:" + url)
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse)
Code Example #8
    def parse(self, response):
        pageList = response.xpath('//div[@class="day"]')

        for page in pageList:
            item = CnblogspiderItem()
            item["url"] = page.xpath('./div[2]/a/@href').extract()[0]
            item["title"] = page.xpath('./div[2]/a/text()').extract()[0]
            item["time"] = page.xpath('./div[1]/a/text()').extract()[0]
            item["content"] = page.xpath('./div[3]/div/text()').extract()[0]
            yield item

        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #9
File: cnblogs.py  Project: lyuxiaoyu/LearningSpider
    def parse(self, response):
        divDayList = response.xpath(r"//div[@class='day']")
        for divDay in divDayList:
            date = divDay.xpath(
                r"./div[@class='dayTitle'][1]/a/text()").extract()[0]
            title = divDay.xpath(
                r"./div[@class='postTitle'][1]/a/text()").extract()[0]
            url = divDay.xpath(
                r"./div[@class='postTitle'][1]/a/@href").extract()[0]
            item = CnblogspiderItem(date=date, title=title, url=url)
            yield item

        next_page = response.xpath(r"//a[text()='下一页'][1]/@href").extract()
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #10
    def parse(self, response):
        # Parse the listing page
        # First extract all articles: select every node with class='day' from the response
        # Inspecting the page shows that each article sits inside a <div class="day"> </div> tag
        # The result of the selection is a list with one entry per article
        papers = response.xpath(".//*[@class='day']")

        # Debugging helper: uncomment to inspect the result of each crawl step interactively
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)

        # Extract data from each article
        for paper in papers:
            # Extract the article's URL, title, time, and content (the summary)
            # "." starts from paper, "//" matches at any depth, "*" matches any element;
            # take the href of the <a> under the element with class='postTitle'
            # .extract()[0] pulls out the first matched value
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
            # same pattern: the first text node of the <a> under class='postTitle'
            title = paper.xpath(
                ".//*[@class='postTitle']/a/text()").extract()[0]
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            # Extract the article summary
            content = paper.xpath(
                ".//*[@class='c_b_p_desc']/text()").extract()[0]

            # Wrap the extracted data in an Item object; it behaves like a dict,
            # e.g. 'url' is a key and the extracted URL is its value
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)

            # With the article URL extracted above, create a new request for that address
            # The Request takes the article URL and a callback method that parses the article page
            # Request has a meta parameter for passing data along; it must be a dict,
            # passed in on the Request and read back from that request's response like a dict lookup
            request = scrapy.Request(url=url,
                                     meta={'item': item},
                                     callback=self.parse_body)

            # yield turns parse into a generator; the loop produces many item objects
            # Yielding the request is roughly like returning it; the request runs parse_body,
            # which ultimately produces the finished item holding all the data parsed from the page
            yield request

        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            # url is the address to request; callback specifies which method parses the response
            yield scrapy.Request(url=next_page[0], callback=self.parse)
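Examples #10 and #13 pass the half-built item to parse_body through request.meta. Newer Scrapy releases (1.7+) also provide cb_kwargs, which delivers the same data directly as callback arguments. The following is a minimal sketch of that variant; the import path, the spider name, and the post-body XPath are assumptions for illustration.

import scrapy

from cnblogspider.items import CnblogspiderItem  # assumed project layout


class CnblogsCbKwargsSpider(scrapy.Spider):
    name = 'cnblogs_cb_kwargs'
    start_urls = ['https://www.cnblogs.com/qiyeboy/default.html?page=1']

    def parse(self, response):
        for paper in response.xpath(".//*[@class='day']"):
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract_first()
            title = paper.xpath(".//*[@class='postTitle']/a/text()").extract_first()
            item = CnblogspiderItem(url=url, title=title)
            # cb_kwargs (Scrapy 1.7+) hands the item to the callback as a keyword argument
            yield scrapy.Request(url=url, callback=self.parse_body,
                                 cb_kwargs={'item': item})

    def parse_body(self, response, item):
        # the item arrives as a parameter instead of being read from response.meta
        item['content'] = response.xpath(
            "//*[@id='cnblogs_post_body']").extract_first()  # XPath assumed
        yield item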
Code Example #11
    def parse(self, response):
        papers = response.xpath('.//*[@class="day"]')

        for paper in papers:
            url = paper.xpath('.//div[@class="postTitle"]/a/@href').extract()[0]
            title = paper.xpath('.//div[@class="postTitle"]/a/text()').extract()[0]
            time = paper.xpath('.//div[@class="dayTitle"]/a/text()').extract()[0]
            content = paper.xpath('.//div[@class="c_b_p_desc"]/text()').extract()[0]
            # print '\n'
            # print title,url,time,content
            item = CnblogspiderItem(url=url, title=title, time=time, content=content)
            yield item

        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #12
    def parse(self, response):
        # Find all articles
        papers = response.xpath('//*[@class="day"]')
        # Extract each article's link, title, and summary
        for paper in papers:
            url = paper.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
            title = paper.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
            summary = paper.xpath('.//*[@class="postCon"]/div/text()').extract()[0]
            item = CnblogspiderItem(url=url,title=title,summary=summary)
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item  # stash the item for the body callback
            yield request

        # Find the next-page link
        next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
        if next_page:
            yield scrapy.Request(next_page[0], callback=self.parse)
Code Example #13
    def parse(self, response):
        # Parse the listing page
        # First extract all articles: select every node with class='day' from the response
        # Inspecting the page shows that each article sits inside a <div class="day"> </div> tag
        # The result of the selection is a list with one entry per article
        papers = response.xpath(".//*[@class='day']")

        # Debugging helper: uncomment to inspect the result of each crawl step interactively
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)

        # Extract data from each article
        for paper in papers:
            # Extract the article's URL, title, time, and content (the summary)
            # "." starts from paper, "//" matches at any depth, "*" matches any element;
            # take the href of the <a> under the element with class='postTitle'
            # xpath() returns a list of selectors; extract() returns the list of values,
            # and .extract()[0] takes the first one
            url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
            # same pattern: the first text node of the <a> under class='postTitle'
            title = paper.xpath(".//*[@class='postTitle']/a/text()").extract()[0]
            time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
            # Extract the article summary
            content = paper.xpath(".//*[@class='c_b_p_desc']/text()").extract()[0]

            # Wrap the extracted data in an Item object; it behaves like a dict,
            # e.g. 'url' is a key and the extracted URL is its value
            item = CnblogspiderItem(url=url, title=title, time=time, content=content)

            # With the article URL extracted above, create a new request that uses the
            # second-level parse method parse_body as its callback
            # The Request takes the article URL and a callback method that parses the article page
            # Request has a meta parameter for passing data along; it must be a dict,
            # passed in on the Request and read back from that request's response like a dict lookup
            request = scrapy.Request(url=url, meta={'item': item}, callback=self.parse_body)
            # After the request runs parse_body, what comes back is the finished item,
            # so yielding the request below effectively returns that item

            # yield turns parse into a generator; the loop produces many item objects
            # Yielding the request is roughly like returning it; the request runs parse_body,
            # which ultimately produces the finished item holding all the data parsed from the page
            yield request

        # Crawl the next page: first look up the next-page link
        # The next-page link looks like: <a href="https://www.cnblogs.com/qiyeboy/default.html?page=2">下一页</a>
        next_page = re.findall(u'<a href="(\S*)">下一页</a>', response.text)  # \S matches any non-whitespace character, * matches as many as possible
        if next_page:
            # url is the address to request; callback specifies which method parses the response
            yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #14
File: cnblogSpiders.py  Project: mr-kkid/scrapy
 def parse(self, response):
     days = response.xpath('.//*[@class="day"]')
     for day in days:
         postTitle = day.xpath(
             './/*[@class="postTitle"]/a/text()').extract()[0]
         time = day.xpath('.//*[@class="dayTitle"]/a/text()').extract()[0]
         content = day.xpath(
             './/*[@class="postCon"]/div/text()').extract()[0]
         url = day.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
         items = CnblogspiderItem(url=url,
                                  postTitle=postTitle,
                                  time=time,
                                  content=content)
         print(url, postTitle, time, content)
         yield items
     next_paper = Selector(response).re(u'<a href="(\S*)">下一页</a>')
     if next_paper:
         yield scrapy.Request(url=next_paper[0], callback=self.parse)
Code Example #15
 def parse(self, response):
     all_articles = response.xpath(".//*[@class='day']")
     for article in all_articles:
         title = article.xpath(
             './/*[@class="postTitle"]/a/text()').extract()[0]
         abstract = article.xpath(
             './/*[@class="postCon"]/div/text()').extract()[0]
         href = article.xpath(
             './/*[@class="postTitle"]/a/@href').extract()[0]
         day = article.xpath(
             './/*[@class="dayTitle"]/a/text()').extract()[0]
         # print title,'\n',day,'\n',href,'\n',abstract,'\n'
         item = CnblogspiderItem(url=href,
                                 title=title,
                                 abstract=abstract,
                                 day=day)
         request = scrapy.Request(url=href, callback=self.parse_body)
         request.meta['item'] = item
         yield request
     next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
     if next_page:
         yield scrapy.Request(url=next_page[0], callback=self.parse)
Code Example #16
File: cnblogs_spider.py  Project: oujx28/Spider_study
    def parse_item(self, response):
        print(response)
        papers = response.xpath(".//div[@class='day']")

        #from scrapy.shell import inspect_response
        #inspect_response(response, self)

        for paper in papers:
            url = paper.xpath(
                ".//div[@class='postTitle']/a/@href").extract_first()
            title = paper.xpath(
                ".//div[@class='postTitle']/a/text()").extract_first()
            time = paper.xpath(
                ".//div[@class='dayTitle']/a/text()").extract_first()
            content = paper.xpath(
                ".//div[@class='postCon']/div/text()").extract_first()
            item = CnblogspiderItem(url=url,
                                    title=title,
                                    time=time,
                                    content=content)
            request = scrapy.Request(url=url, callback=self.parse_body)
            request.meta['item'] = item
            yield request
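Code Example #16 defines parse_item rather than parse and contains no pagination logic, which suggests it is driven by a CrawlSpider whose rules follow the paging links. A minimal sketch of how such a spider might be wired up is shown below; the rule pattern and class name are assumptions for illustration.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CnblogsCrawlSpider(CrawlSpider):
    name = 'cnblogs_crawl'
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://www.cnblogs.com/qiyeboy/default.html?page=1']

    # follow the listing pages (the "page=N" links) and hand each one to parse_item;
    # the allow pattern is an assumption for illustration
    rules = (
        Rule(LinkExtractor(allow=(r'default\.html\?page=\d+',)),
             callback='parse_item', follow=True),
    )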