Example #1
def parse_all_shops(response):
    # Get all shop <li> entries
    shops = response.xpath('//div[@class="shopsearch"]/div[@class="content"]/ul/li')
    logging.debug(shops)
    for shop in shops:
        logging.debug("=============================")
        logging.debug(shop.extract())
        # Process each shop entry, extracting the name and address
        item = XiaohuaItem()
        # Shop name
        name = shop.xpath('./div[@class="pic"]/a/img/@alt').extract()[0]
        # Shop image URL
        picUrl = shop.xpath('./div[@class="pic"]/a/img/@src').extract()[0]
        # Shop page URL
        url = shop.xpath('./div[@class="pic"]/a/@href').extract()[0]
        # Shop address
        address = shop.xpath('./div[@class="txt"]/div[@class="tag-addr"]/span/text()').extract()[0]
        item['name'] = name
        item['pic'] = picUrl
        item['url'] = url
        item['address'] = address
        logging.debug(item)
        logging.debug("=============================")
        # Yield the scraped data
        yield item
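All of these snippets assume a XiaohuaItem declared in the project's items.py; the field set differs from example to example. A minimal sketch matching the fields Example #1 populates:

import scrapy

class XiaohuaItem(scrapy.Item):
    # Fields used by Example #1; other examples declare their own set
    name = scrapy.Field()     # shop name
    pic = scrapy.Field()      # shop image URL
    url = scrapy.Field()      # shop page URL
    address = scrapy.Field()  # shop address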
Example #2
 def pic_parse(self, response):
     links = response.xpath('//div[@class="picbox"]/a/img/@src').extract()
     for link in links:
         # The images pipeline expects image_urls to be a list, not a bare string
         item = XiaohuaItem()
         item['image_urls'] = [response.urljoin(link)]
         yield item
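Yielding items with an image_urls field is the contract of Scrapy's built-in ImagesPipeline, which downloads the listed URLs automatically. Enabling it is a settings.py change along these lines (the IMAGES_STORE path is illustrative):

# settings.py sketch: route items through the built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # illustrative download directory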
Example #3
    def parse(self, response):
        headers = {'User-Agent': UserAgent(verify_ssl=False).random}
        item = XiaohuaItem()
        if response.status == 200:
            contents = response.xpath(
                '//div[@class="content-left"]/div[@class="one-cont"]')
            for content in contents:
                item['nickname'] = self.join_list(
                    content.xpath('./div[1]/div/a/i/text()').extract())
                item['content'] = self.join_list(
                    content.xpath('./p[@class="fonts"]/a/text()').extract())
                item['support'] = int(
                    self.join_list(
                        content.xpath('./ul/li[1]/span/text()').extract()))
                item['not_support'] = int(
                    self.join_list(
                        content.xpath('./ul/li[2]/span/text()').extract()))
                item['collect'] = int(
                    self.join_list(
                        content.xpath('./ul/li[3]/span/text()').extract()))
                item['message'] = int(
                    self.join_list(
                        content.xpath('./ul/li[4]/a/span/text()').extract()))
                item['share'] = int(
                    self.join_list(
                        content.xpath('./ul/li[5]/span/text()').extract()))
                print(item)
                yield item

        if self.page < 5:
            self.page += 1
            yield scrapy.Request(self.base_url.format(self.page),
                                 headers=headers,
                                 callback=self.parse)
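Example #3 leans on a join_list helper plus page and base_url attributes defined elsewhere in the spider. A plausible minimal join_list, assumed rather than taken from the original:

    def join_list(self, texts):
        # Collapse the list returned by extract() into a single string;
        # an empty extract() yields '' (int('') would then raise upstream)
        return ''.join(texts).strip()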
Example #4
 def parse(self, response):
     allPics = response.xpath('//div[@class="img"]/a')
     for pic in allPics:
         item = XiaohuaItem()
         name = pic.xpath('./img/@alt').extract()[0]
         src = pic.xpath('./img/@src').extract()[0]
         addr = 'http://www.xiaohuar.com' + src
         item['name'] = name
         item['addr'] = addr
         yield item
Example #5
 def parse(self, response):
     for book in response.css('div.img'):
         item = XiaohuaItem()
         # The images pipeline expects image_urls to be a list of absolute URLs
         item['image_urls'] = [response.urljoin(book.xpath('./a/img/@src').extract_first())]
         item['name'] = book.xpath('./a/img/@alt').extract_first()
         yield item
     for i in range(2, 50):
         nextPage = "http://www.xiaohuar.com/list-1-" + str(i) + ".html"
         yield scrapy.Request(nextPage, callback=self.parse)
Example #6
 def parse_three(self, response):
     item = XiaohuaItem()
     # Retrieve item2 passed in from parse_two
     item3 = response.meta['item2']
     # Use a regex to pull the real image address (detailURL) out of the page
     pattern = re.compile(r'<li class="pic-down h-pic-down"><a target="_blank" class="down-btn" href=\'(.*?)\'>.*?</a>', re.S)
     URL = re.search(pattern, response.text).group(1)
     item['detailURL'] = URL
     item['path'] = item3['path']
     item['fileName'] = item3['fileName']
     yield item
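Running a regex over raw HTML is fragile; assuming the markup the pattern itself describes, an XPath equivalent would be:

     # Assumed XPath equivalent of the regex used in parse_three
     URL = response.xpath('//li[@class="pic-down h-pic-down"]/a[@class="down-btn"]/@href').extract_first()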
Example #7
    def parse_last(self, response):
        item = XiaohuaItem()
        item['alt'] = response.xpath(
            '//div[@id="big-pic"]//img/@alt').extract()[0].strip()
        image_links = response.xpath(
            '//div[@id="big-pic"]//img/@src').extract()
        item['src'] = []
        for src in image_links:
            if '.jpg' in src:
                item['src'].append(src)

        yield item
Example #8
 def parse_detail(self, response):
     title = response.meta['title']
     div_list = response.xpath('//*[@id="swiper1"]/div/div')
     num = 1
     for div in div_list:
         img_url = 'http://www.521609.com' + div.xpath(
             './/img/@src').extract_first().strip()
         item = XiaohuaItem()
         item['title'] = title
         item['img_name'] = str(num).zfill(2) + '.jpg'
         yield scrapy.Request(url=img_url,
                              callback=self.parse_img,
                              meta={'item': item})
         num += 1
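Example #8 defers the actual download to a parse_img callback that the excerpt does not show. A plausible shape, an assumption rather than the original method:

 def parse_img(self, response):
     # Hypothetical callback, not the original code: write the raw
     # image bytes under the file name carried by the item
     item = response.meta['item']
     with open(item['img_name'], 'wb') as f:
         f.write(response.body)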
Example #9
    def parse(self, response):

        # Get the <li> elements that contain the images
        li_list = response.xpath(
            '//*[@id="content"]//div[@class="index_img list_center"]/ul/li')
        for li in li_list:
            # Extract each image's URL and name
            img_url = self.base_url + li.xpath('.//img/@src').extract_first()
            img_name = li.xpath('.//img/@alt').extract_first() + '.jpg'
            item = XiaohuaItem()  # Instantiate the item
            # Fields must be assigned dict-style, item['img_url'], not via attribute access
            item['img_url'] = img_url
            item['img_name'] = img_name
            yield item

        # Build each page's URL and recursively schedule parse to cover every page
        for i in range(1, 12):
            new_url = self.urls % i
            yield scrapy.Request(url=new_url, callback=self.parse)
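Example #9 assumes base_url and urls attributes on the spider, and since every call re-yields pages 1 through 11, it relies on Scrapy's default duplicate-request filter to avoid looping. The attributes presumably look something like this (values are illustrative, not from the source):

    # Illustrative spider attributes assumed by Example #9
    base_url = 'http://www.example.com'           # hypothetical prefix joined to the relative img src
    urls = 'http://www.example.com/list_%d.html'  # hypothetical page template consumed by self.urls % i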
Example #10
 def parse_two(self, response):
     # Retrieve item1 passed in from parse_one
     item2 = response.meta['item1']
     # Use a regex to extract the page count from the page source
     pattern = re.compile(r'共(.*?)页', re.S)
     match = re.search(pattern, response.text)
     Num = int(match.group(1)) if match else 1
     items = []
     for i in range(1, Num + 1):
         # Note: create the item instance inside the loop so each yield is a distinct object
         item = XiaohuaItem()
         item['fileName'] = item2['fileName']
         # Build the storage path for each image
         item['path'] = item['fileName'] + str(i) + '.jpg'
         # Build each image's detail-page link, from which the original image URL is extracted
         item['pageURL'] = response.url[:-5] + '_' + str(i) + '.html'
         items.append(item)
     for item in items:
         yield Request(url=item['pageURL'], meta={'item2': item}, callback=self.parse_three)
Example #11
    def parse_one(self, response):
        # Collect every item in one list before scheduling the requests
        items = []
        pattern = re.compile(r'<div class="title".*?<a.*?href="(.*?)">(.*?)</a></span></div>', re.S)
        mains = re.findall(pattern, response.text)
        for main in mains:
            # Create an item instance and populate it dict-style
            item = XiaohuaItem()
            item['siteURL'] = main[0]
            item['title'] = main[1]
            item['fileName'] = self.base + item['title']
            items.append(item)

        for item in items:
            # Per-item directory creation (disabled here)
            fileName = item['fileName']
            #if not os.path.exists(fileName):
                #os.makedirs(fileName)
            # Pass the item down to the next level via meta
            yield Request(url=item['siteURL'], meta={'item1': item}, callback=self.parse_two)
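Examples #11, #10, and #6 are three callbacks of one spider: parse_one lists the albums, parse_two enumerates each album's pages, and parse_three extracts the final image URL, threading the growing item through response.meta at each hop. A pipeline along these lines could consume the finished item; this is an assumed sketch, not the original project's code:

import os
import requests

class DownloadPipeline(object):
    # Hypothetical pipeline: fetch each item's detailURL and save it
    # under the path computed in parse_two
    def process_item(self, item, spider):
        directory = os.path.dirname(item['path']) or '.'
        os.makedirs(directory, exist_ok=True)
        resp = requests.get(item['detailURL'], timeout=10)
        with open(item['path'], 'wb') as f:
            f.write(resp.content)
        return item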
Example #12
    def parse(self, response):

        if response.url.startswith('http://www.xiaohuar.com/list-'):
            picList = response.xpath('//div[@class="img"]/a')
            for pic in picList:
                item = XiaohuaItem()
                item['name'] = pic.xpath('./img/@alt').extract()[0]
                item['imgUrl'] = 'http://www.xiaohuar.com' + pic.xpath(
                    './img/@src').extract()[0]
                yield item

        urls = response.xpath('//a/@href').extract()
        for url in urls:
            if url.startswith("http://www.xiaohuar.com/list-") and url not in self.urlSet:
                self.urlSet.add(url)
                yield self.make_requests_from_url(url)
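make_requests_from_url was deprecated in Scrapy 1.4; on current versions the same request is written explicitly (dont_filter=True matches the old helper's default):

                yield scrapy.Request(url, callback=self.parse, dont_filter=True)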
Example #13
    def parse(self, response):
        # Get every entry; the XPath matches the node's full class attribute
        allin = response.xpath(
            '//div[@class="p-cell cellItem nofavorite backwater"]')

        for pic in allin:
            # Process each entry, extracting the name and image address
            item = XiaohuaItem()
            item['name'] = pic.xpath(
                './h3/span[@class="cellTit"]/a/text()').extract_first()
            addr = pic.xpath('./div[@class="pic"]/a/img/@src').extract()[0]
            item['addr'] = 'http://www.xiaohua100.cn' + addr
            yield item

        # Build the next page's URL by incrementing the pageno query parameter
        page_add = re.search(r'pageno=(\d+)', response.url).group(1)
        page_add = 'pageno=' + str(int(page_add) + 1)
        next_url = re.sub(r'pageno=\d+', page_add, response.url)
        yield Request(next_url, headers=self.headers)
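Example #13's pagination never terminates: every response schedules pageno+1 unconditionally. A bounded variant might guard the request with an assumed max_page attribute:

        # Hypothetical stop condition; max_page is an assumed spider attribute
        next_page = int(re.search(r'pageno=(\d+)', response.url).group(1)) + 1
        if next_page <= self.max_page:
            next_url = re.sub(r'pageno=\d+', 'pageno=%d' % next_page, response.url)
            yield Request(next_url, headers=self.headers)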