Example #1
    def parse(self, response):
        # Get the <a> tag for every image
        allPics = response.xpath('//div[@class="img"]/a')
        for pic in allPics:
            # Extract each image's info
            item = PicItem()
            name = pic.xpath('./img/@alt').extract()[0]
            addr = pic.xpath('./img/@src').extract()[0]
            addr = 'http://www.xiaohuar.com' + addr
            item['name'] = name
            item['addr'] = addr
            # Yield the scraped item
            yield item

        # Find the link to the next page
        navPageList = response.xpath(
            '//div[@id="page"]/div[@class="page_num"]/a')
        for navPage in navPageList:
            txt = navPage.xpath('./text()').extract()[0]
            url = navPage.xpath('./@href').extract()[0]
            if txt == '下一页':  # link text meaning "next page"
                print(url)
                yield scrapy.Request(url, callback=self.parse)
                break
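
Most of these examples fill a PicItem with name and addr fields (later examples add fields such as src, title, or image_urls). Its definition is not shown anywhere on this page; a minimal sketch of an items.py matching this first example, inferred from the field names used:

# items.py (sketch, inferred from the fields the examples assign)
import scrapy

class PicItem(scrapy.Item):
    name = scrapy.Field()  # image title, taken from the img alt attribute
    addr = scrapy.Field()  # absolute image URL
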
Example #2
 def parse(self, response):
     # Only scrape pages whose URL starts with the list- prefix
     if response.url.startswith('http://www.xiaohuar.com/list-'):
         # Get the <a> tag for every image
         allPics = response.xpath('//div[@class="img"]/a')
         for pic in allPics:
             # Process each image: take out its name and address
             item = PicItem()
             name = pic.xpath('./img/@alt').extract()[0]
             addr = pic.xpath('./img/@src').extract()[0]
             addr = 'http://www.xiaohuar.com' + addr
             item['name'] = name
             item['addr'] = addr
             # Yield the scraped item
             yield item
     # Collect the URLs from every <a> tag
     urls = response.xpath('//a/@href').extract()
     for url in urls:
         if url.startswith("http://www.xiaohuar.com/list-"):
             # Skip URLs we have already scheduled
             if url not in XhSpider.url_set:
                 # Record the URL in the shared set
                 XhSpider.url_set.add(url)
                 # Queue a request for the URL
                 yield self.make_requests_from_url(url)
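
XhSpider.url_set is referenced but never declared in these snippets; it is evidently a class-level set shared by all callback invocations for deduplication. A sketch of the assumed spider header (the spider name and start URL are illustrative guesses):

import scrapy

class XhSpider(scrapy.Spider):
    name = 'xiaohuar'  # hypothetical spider name
    start_urls = ['http://www.xiaohuar.com/list-1-1.html']  # illustrative start page
    url_set = set()  # URLs already scheduled, shared across callbacks

Note that make_requests_from_url, used throughout these examples, is deprecated in newer Scrapy releases; yielding scrapy.Request(url, callback=self.parse) directly is the current equivalent.
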
Example #3
 def parse(self, response):
     # Only take the name and address info when the page URL starts with http://www.xiaohuar.com/list-
     if response.url.startswith("http://www.xiaohuar.com/list-"):
         allPics = response.xpath('//div[@class="img"]/a')
         for pic in allPics:
             # Process each image: take out its name and address
             item = PicItem()
             name = pic.xpath('./img/@alt').extract()[0]
             addr = pic.xpath('./img/@src').extract()[0]
             addr = 'http://www.xiaohuar.com' + addr
             item['name'] = name
             item['addr'] = addr
             # Yield the scraped item
             yield item
     # Collect every link address
     urls = response.xpath("//a/@href").extract()
     for url in urls:
         # If the address starts with http://www.xiaohuar.com/list- and has not been seen, crawl it
         if url.startswith("http://www.xiaohuar.com/list-"):
             if url not in XhSpider.url_set:
                 XhSpider.url_set.add(url)
                 # The callback defaults to parse; an explicit callback can
                 # also be passed via scrapy.http.Request:
                 # from scrapy.http import Request
                 # Request(url, callback=self.parse)
                 yield self.make_requests_from_url(url)
Example #4
 def parse(self, response):
     pic_item = PicItem()
     img_nodes = response.css('div.topic-list')
     pic_item['href'] = img_nodes.css('a::attr(href)').extract()
     pic_item['src'] = img_nodes.css('img::attr(src)').extract()
     pic_item['title'] = img_nodes.css('span::text').extract()
     yield pic_item
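
This variant yields one item per page, with each field holding a parallel list covering the whole page. If one item per image is wanted instead, the lists can be zipped; a sketch that assumes every div.topic-list entry carries exactly one link, image, and caption (so the lists line up), and that PicItem declares the same href, src, and title fields used above:

 def parse(self, response):
     img_nodes = response.css('div.topic-list')
     # Per-image variant (sketch): zip the parallel lists so each yielded
     # item describes a single image rather than the whole page
     for href, src, title in zip(
             img_nodes.css('a::attr(href)').extract(),
             img_nodes.css('img::attr(src)').extract(),
             img_nodes.css('span::text').extract()):
         pic_item = PicItem()
         pic_item['href'] = href
         pic_item['src'] = src
         pic_item['title'] = title
         yield pic_item
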
Example #5
 def parse(self, response):
     # If the page URL starts with http://www.xiaohuar.com/list-, extract each image's name and address
     if response.url.startswith("http://www.xiaohuar.com/list-"):
         allPics = response.xpath('//div[@class="img"]/a')
         for pic in allPics:
             # Process each image separately, taking out its name and address
             item = PicItem()
             name = pic.xpath('./img/@alt').extract()[0]
             addr = pic.xpath('./img/@src').extract()[0]
             addr = 'http://www.xiaohuar.com' + addr
             item['name'] = name
             item['addr'] = addr
             # Return crawled data
             yield item
     # Get all the address links
     urls = response.xpath("//a/@href").extract()
     for url in urls:
         # If the address starts with http://www.xiaohuar.com/list- and has not been seen, crawl it
         if url.startswith("http://www.xiaohuar.com/list-"):
             if url not in XhSpider.url_set:
                 XhSpider.url_set.add(url)
                 # The default callback is parse; an explicit one can be
                 # assigned via scrapy.http.Request:
                 # from scrapy.http import Request
                 # Request(url, callback=self.parse)
                 yield self.make_requests_from_url(url)
Example #6
    def parse(self, response):
        allpics = response.xpath('//div[@class="img"]/a')
        for pic in allpics:
            item = PicItem()
            name = pic.xpath('./img/@alt').extract()[0]
            addr = pic.xpath('./img/@src').extract()[0]
            # Prefix the host only when src is not already absolute
            if 'http' not in addr:
                addr = 'http://www.xiaohuar.com' + addr
            item['name'] = name
            item['addr'] = addr
            yield item

        # Collect every link address
        urls = response.xpath('//a/@href').extract()
        for url in urls:
            if url.startswith('http://www.xiaohuar.com/list-'):
                if url not in XhSpider.url_set:
                    XhSpider.url_set.add(url)
                    yield self.make_requests_from_url(url)
Example #7
 def parse_detail(self, response):
     urls = response.xpath(
         '//div[@class="entry-content"]/p/img/@src').extract()
     for url in urls:
         it1 = PicItem()
         # image_urls must be a list, even for a single URL
         it1['image_urls'] = [url]
         yield it1
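
The image_urls field here suggests the item feeds Scrapy's built-in ImagesPipeline, which downloads every URL in that list automatically. A minimal settings sketch, assuming that pipeline is the intended consumer (the IMAGES_STORE path is a placeholder):

# settings.py (sketch): enable the built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/tmp/pics'  # placeholder download directory

For this to work, PicItem must declare both image_urls and images fields, since the pipeline writes its download results into images.
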
Example #8
 def parse(self, response):
     allPics = response.xpath("//div[@class='img']/a")
     for pic in allPics:
         item = PicItem()
         name = pic.xpath("./img/@alt").extract()[0]
         addr = pic.xpath("./img/@src").extract()[0]
         addr = "http://www.xiaohuar.com" + addr
         item["name"] = name
         item["addr"] = addr
         yield item
Example #9
 def parse(self, response):
     # Requires "import re" at module level; findall returns the captured
     # src attribute of every lazy-loaded <img> tag
     s_re = re.compile(r'img class="lazy" src="(.*?)"')
     srcs = re.findall(s_re, response.text)
     item = PicItem()
     item['src'] = srcs
     yield item
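
Parsing HTML with a regular expression is fragile: attribute order, quoting, and extra whitespace all break the pattern. A selector-based sketch of the same extraction, assuming the same lazy-load markup:

 def parse(self, response):
     # Equivalent extraction with XPath instead of a regex; tolerant of
     # attribute order and whitespace in the markup
     item = PicItem()
     item['src'] = response.xpath('//img[@class="lazy"]/@src').extract()
     yield item
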
Example #10
 def parse(self, response):
     # Get the <a> tag for every image
     allPics = response.xpath('//*[@id="imgid"]/a')
     for pic in allPics:
         # Process each image: take out its address
         item = PicItem()
         # Each pic is already the <a> element, so read its href directly
         addr = pic.xpath('./@href').extract()[0]
         addr = 'http://image.baidu.com' + addr
         item['addr'] = addr
         # Yield the scraped item
         yield item
Example #11
 def parse(self, response):
     # Get the <a> tag of every word-list entry
     allPics = response.xpath('//td[@class="wordbook-wordlist-name"]/a')
     for pic in allPics:
         # Process each entry: take out the address
         item = PicItem()
         addr = pic.xpath('./@href').extract()[0]
         addr = 'http://www.xiaohuar.com' + addr
         item['addr'] = addr
         # Yield the scraped item
         yield item
Example #12
 def parse(self, response):
     # Get the <a> tag for every image
     allPics = response.xpath('//div[@class="img"]/a')
     for pic in allPics:
         # Process each image: take out its name and address
         item = PicItem()
         name = pic.xpath('./img/@alt').extract()[0]
         addr = pic.xpath('./img/@src').extract()[0]
         addr = 'http://www.xiaohuar.com' + addr
         item['name'] = name
         item['addr'] = addr
         # Yield the scraped item
         yield item
Example #13
    def parse(self, response):
        into = response.xpath(
            '//div[@class = "post-module-thumb"]/a/@href').extract()

        for u in into:
            item1 = PicItem()
            item1['detailed'] = u
            yield scrapy.Request(
                item1['detailed'],
                callback=self.parse_detail,
            )
        # Heuristic from this site's markup: the pager contains an
        # "empty button" element while a next page is still available
        ifnxt = response.xpath('//div[@class="btn-pager"]').extract()[0]
        if "empty button" in ifnxt:
            nxt = response.xpath(
                '//div[@class="btn-pager"]/a/@href').extract()[0]
            yield scrapy.Request(nxt, callback=self.parse)
Example #14
    def parse(self, response):
        print("status:"+str(response.status))
        pics = response.xpath('//img[@class="img-fluid"]')

        for pic in pics:
            item = PicItem()
            src = pic.xpath('./@src').extract()
            name = pic.xpath('./@alt').extract()

            # alt text is optional, so only set the name when present
            if len(name) != 0:
                item['name'] = name[0]

            item['src'] = src[0]
            yield item
Example #15
 def parse(self, response):
     item = PicItem()
     imgurls = response.css(".post img::attr(src)").extract()
     item['imgurl'] = imgurls
     yield item
Example #16
    def parse(self, response):

        if not XhSpider.url_list_db.query(response.url, 1):

            XhSpider.url_list_db.update(response.url)

            #print ("2.get img ing ...... ",sys._getframe().f_lineno)

            if response.url.startswith("https://www.zbjuran.com/mei/"):
                allPics = response.xpath('//center/div[@class="picbox"]')

                #print ("2.get img ing ...... ",sys._getframe().f_lineno )
                for pic in allPics:
                    # Process each picture: take out its name and address
                    item = PicItem()
                    addr = ""
                    if len(pic.xpath('./img/@src')) >= 1:
                        addr = pic.xpath('./img/@src').extract()[0]
                    else:
                        addr = pic.xpath('./p/img/@src').extract()[0]

                    #print ("2.get img ing ...... ",sys._getframe().f_lineno )
                    name_1 = response.xpath(
                        '//div[@class="title"]/h2/text()').extract()[0]
                    #print ("2.get img ing ...... ",sys._getframe().f_lineno )
                    #print (name_1)
                    name_2 = addr.replace('/', '_').replace(':', '_')

                    name = ""

                    name_obj = pic.xpath('./img/@alt')
                    if len(name_obj) >= 1:
                        name = name_obj.extract()[0]

                    if len(name_1) >= 1:
                        item['name'] = name_1
                    else:
                        if len(name) >= 1:
                            item['name'] = name
                        else:
                            item['name'] = name_2

                    if addr.startswith('/'):
                        addr = "https://www.zbjuran.com/" + addr

                    item['addr'] = addr
                    # Yield the scraped item
                    yield item
                print("2.get img ing ...... ", sys._getframe().f_lineno)
            # Collect every link address
            urls = response.xpath("//a/@href").extract()

            for url in urls:
                url_arr = url.split("_")
                # Queue detail pages under /mei/ that have not been seen yet
                if url.startswith("/mei/") and url.endswith(".html"):

                    url_whole = "https://www.zbjuran.com" + url

                    if not XhSpider.url_list_db.query(url_whole):
                        XhSpider.url_list_db.insert(url_whole)
                        # The callback defaults to parse; an explicit one
                        # could be set via scrapy.http.Request
                        print("add", url_whole)
                        yield self.make_requests_from_url(url_whole)
                elif url_arr[0].isdigit():
                    # A bare "NN_M.html"-style pagination link: resolve it
                    # against the current URL by swapping the last segment
                    u_arr = response.url.split("/")
                    u_arr.pop()
                    u_arr.append(url)
                    url_whole = "/".join(u_arr)

                    if not XhSpider.url_list_db.query(url_whole):
                        print("add", url_whole)
                        XhSpider.url_list_db.insert(url_whole)
                        yield self.make_requests_from_url(url_whole)
                elif url.startswith("http") and url.endswith(".html"):

                    url_whole = url

                    if XhSpider.url_list_db.query(url_whole):
                        #print ("Exist:",url,url_whole)
                        pass
                    else:
                        #XhSpider.url_set.add(url_whole)
                        XhSpider.url_list_db.insert(url_whole)
                        # 回调函数默认为parse,也可以通过from scrapy.http import Request来指定回调函数
                        # from scrapy.http import Request
                        # Request(url,callback=self.parse)
                        print("add", url_whole)
                        yield self.make_requests_from_url(url_whole)
                else:

                    pass

        print("3.get waiting href ... ")

        # Pull one pending URL back out of the DB; anything longer than
        # 10 characters is treated as a real URL
        url_whole = XhSpider.url_list_db.query_data()

        if len(url_whole) > 10:
            yield self.make_requests_from_url(url_whole)
        else:
            print("4. we finished this site")
Example #17
 def parse_url(self, response):
     pic_list = response.xpath("//img/@src").extract()
     for pic in pic_list:
         # Create a fresh item per image; reusing one mutable item across
         # yields would leave every yielded item sharing the same state
         item = PicItem()
         item['pic_url'] = pic
         yield item