Example #1
 def parse(self, response):
     item = QtpjtItem()
     # capture the thumbnail URL up to (but not including) its ".jpg" extension
     paturl = "(http://pic.qiantucdn.com/58pic/.*?).jpg"
     item['picurl'] = re.compile(paturl).findall(str(response.body))
     # capture the file name (without extension) to use as the picture id
     patid = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
     item['picid'] = re.compile(patid).findall(str(response.body))
     yield item
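These snippets are all parse() callbacks from a Scrapy spider and assume the same surrounding project: an item class with the fields used above and a spider module that imports re, Request and QtpjtItem. None of that context is shown on this page, so here is a minimal sketch of what it might look like (module paths, spider name and start URL are assumptions, not taken from the original code):

    # items.py -- field names taken from the snippets; everything else is assumed
    import scrapy

    class QtpjtItem(scrapy.Item):
        picurl = scrapy.Field()      # thumbnail URLs scraped from a listing page
        picid = scrapy.Field()       # picture ids derived from those URLs
        picfolder = scrapy.Field()   # only used by Example #7
        link = scrapy.Field()        # only used by Example #7

    # spiders/qt.py -- assumed skeleton around the parse() methods shown on this page
    import re
    import scrapy
    from scrapy.http import Request
    from qtpjt.items import QtpjtItem   # adjust to the actual project package

    class QtSpider(scrapy.Spider):
        name = "qt"                                                 # assumed
        allowed_domains = ["58pic.com"]
        start_urls = ["http://www.58pic.com/piccate/3-0-0-1.html"]  # assumed

        def parse(self, response):
            ...  # any of the example bodies goes here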
Example #2
 def parse(self, response):
     item = QtpjtItem()
     paturl = '(http://pic.qiantucdn.com/58pic/.*?).jpg'
     item['picurl'] = re.compile(paturl).findall(str(response.body))
     patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
     item['picid'] = re.compile(patlocal).findall(str(response.body))
     yield item
     # queue listing pages 1-100 of this category
     for i in range(1, 101):
         nexturl = "http://www.58pic.com/piccate/3-0-0-" + str(i) + ".html"
         yield Request(nexturl, callback=self.parse)
Example #3
    def parse(self, response):
        item = QtpjtItem()
        # (?:jpeg|jpg) is a non-capturing alternation matching either extension
        picurl = r"(http://pic.qiantucdn.com/58pic/.*?)\.(?:jpeg|jpg)"
        item['picurl'] = re.compile(picurl).findall(str(response.body))
        item['picid'] = response.xpath('//a[@class="bottom-title"]/text()').extract()
        yield item

        for i in range(1, 20):
            nexturl = 'http://www.58pic.com/piccate/3-0-0-' + str(i) + '.html'
            yield Request(nexturl, callback=self.parse)
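A note on the regular expressions in these examples: the dots inside the URL patterns are left unescaped, so they match any character rather than a literal ".". On these particular URLs the patterns still work, but tightened versions are easy to verify in isolation (the sample markup below is invented purely for illustration):

    import re

    # invented sample of a thumbnail <img> tag from a 58pic listing page
    html = '<img src="http://pic.qiantucdn.com/58pic/12/34/56/78abcdef.jpg!qt324">'

    # escape the literal dots and allow either extension with a non-capturing group
    pat_url = r"(http://pic\.qiantucdn\.com/58pic/.*?)\.(?:jpe?g)"
    pat_id = r"http://pic\.qiantucdn\.com/58pic/.*?/.*?/.*?/(.*?)\.(?:jpe?g)"

    print(re.findall(pat_url, html))  # ['http://pic.qiantucdn.com/58pic/12/34/56/78abcdef']
    print(re.findall(pat_id, html))   # ['78abcdef']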
Example #4
 def parse(self, response):
     item = QtpjtItem()
     paturl = "(http://pic.qiantucdn.com/58pic/.*?).jpg"
     item["picurl"] = re.compile(paturl).findall(str(response.body))
     patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
     item["picid"] = re.compile(patlocal).findall(str(response.body))
     yield item
     # queue the detail pages /tb/id-1.html through /tb/id-200.html
     for i in range(1, 201):
         nexturl = "http://www.58pic.com/tb/id-" + str(i) + ".html"
         yield Request(nexturl, callback=self.parse)
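Examples #2, #4 and #6 queue up to a couple of hundred listing or detail pages from a single callback, all handled by the same parse() method. When running them it is worth throttling the crawl; a minimal settings.py sketch (the values below are assumptions, not taken from the original project):

    # settings.py -- assumed throttling values; tune them for the target site
    DOWNLOAD_DELAY = 0.5                   # seconds between requests to the same domain
    CONCURRENT_REQUESTS_PER_DOMAIN = 4
    AUTOTHROTTLE_ENABLED = True            # let Scrapy back off automatically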
Example #5
    def parse(self, response):
        item = QtpjtItem()
        pat_url = "(http://pic.qiantucdn.com/58pic/.*?).jpg"
        item['picurl'] = re.findall(pat_url, str(response.body), re.I)

        pat_local = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"

        item['picid'] = re.findall(pat_local, str(response.body), re.I)
        yield item

        # loop over listing pages 1-3 (the loop itself queues pages 2 and 3)
        for i in range(2, 4):
            nexturl = "http://www.58pic.com/tupian/yuebing-0-0-" + str(
                i) + ".html"
            yield Request(nexturl, callback=self.parse)
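The parse() methods above only yield URLs and ids; the actual download has to happen in an item pipeline, which this page does not show. A minimal sketch of one way to write it, assuming the captured picurl values had their ".jpg" extension stripped by the capture group (the save folder and the reconstructed URL are assumptions):

    # pipelines.py -- minimal sketch; output folder and URL reconstruction are assumptions
    import os
    import urllib.request

    class QtpjtPipeline(object):
        def process_item(self, item, spider):
            os.makedirs("pic", exist_ok=True)        # assumed local output folder
            for picurl, picid in zip(item["picurl"], item["picid"]):
                fullurl = picurl + ".jpg"            # reattach the extension the pattern stripped
                localpath = os.path.join("pic", picid + ".jpg")
                urllib.request.urlretrieve(fullurl, filename=localpath)
            return item

Registering it would look like ITEM_PIPELINES = {"qtpjt.pipelines.QtpjtPipeline": 300} in settings.py, with the package path adjusted to the actual project.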
Example #6
    def parse(self, response):
        item = QtpjtItem()

        paturl = "(http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/.*?.jpg!)qt324"
        item["picurl"] = re.compile(paturl).findall(str(response.body))
        patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
        item["picid"] = re.compile(patlocal).findall(str(response.body))

        yield item
        # queue listing pages 1-10 of this category
        for i in range(1, 11):
            nexturl = "http://www.58pic.com/piccate/3-0-0-default-0_2_0_0_default_0-" + str(i) + ".html"
            yield Request(nexturl, callback=self.parse)
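Example #6 above anchors its capture on the "!qt324" suffix that 58pic appends to thumbnail URLs, so each captured string still ends in ".jpg!". A quick standalone check (the sample URL is invented, and whether stripping the suffix yields the full-size image is an assumption about the CDN):

    import re

    html = '<img src="http://pic.qiantucdn.com/58pic/12/34/56/78abcdef.jpg!qt324">'
    paturl = "(http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/.*?.jpg!)qt324"

    captured = re.compile(paturl).findall(html)
    print(captured)                           # ['http://pic.qiantucdn.com/58pic/12/34/56/78abcdef.jpg!']
    print([u.rstrip("!") for u in captured])  # plain .jpg URLs (assumed to be the originals)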
Example #7
    def parse(self, response):
        item = QtpjtItem()
        item["picfolder"] = response.xpath(
            "//em[@class='text-green-b']/text()").extract()
        # create an output folder named after picfolder under a hardcoded path, if missing
        folder = os.path.exists(
            'C:\\Users\\leishen\\Documents\\anaconda3\\scrapy\\master python scrapy\\chapter 19\\pic'
            + '\\' + item["picfolder"][0])
        if not folder:
            os.mkdir(
                'C:\\Users\\leishen\\Documents\\anaconda3\\scrapy\\master python scrapy\\chapter 19\\pic'
                + '\\' + item["picfolder"][0])
        item["link"] = response.xpath(
            "//a[@class='thumb-box']/@href").extract()  #经过测试,成功
        # headers = {"Accept-Encoding":"utf-8,gb2312","User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"}
        headers = (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
        )
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        # install the opener as the global default for urllib
        urllib.request.install_opener(opener)
        # fetch each detail page with urllib and extract the image URL shown on it
        for m in range(0, len(item["link"])):
            data = urllib.request.urlopen(item["link"][m]).read()
            paturl = '<img src="(http.*?)".*?show-area-pic'
            item["picurl"] = re.compile(paturl).findall(str(data))
            yield item
            # data: click and comment counts for every post on the corresponding blog listing page
            # data = urllib.request.urlopen(item["link"][m]).read().decode('gb2312')
            # for k in range(0, len(item["picurl"])):
            #     patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
            #     item["picid"][k]=re.compile(patlocal).findall(str(item["picurl"][k]))[0]+"-"+str(k)
            # item["picid"]=tupianm
            # picid = 'id="show-area-pic".*?alt="(.*?)"'
            # item["picid"] = re.compile(picid).findall(str(data))

        # loop over the picture listing pages; range(2, 3) queues only page 2 here
        for i in range(2, 3):
            # build the URL of the next picture listing page
            nexturl = "http://www.58pic.com/piccate/3-0-0-default-0_2_0_0_default_0-" + str(
                i) + ".html"
            yield Request(nexturl, callback=self.parse)
        '''
    def parse(self, response):
        item=QtpjtItem()
# regex for extracting the thumbnail URLs
        paturl="(http://pic.qiantucdn.com/58pic/.*?).jpg"