def parse(self, response):
    """Extract full-size CDN image URLs and ids from a 58pic page.

    Yields a single QtpjtItem: 'picurl' holds every matched CDN image
    URL (capture group excludes the ".jpg" suffix, as in the original),
    'picid' holds the bare file-name segment before ".jpg".
    """
    item = QtpjtItem()
    # str(bytes) yields a "b'...'" repr string; the regexes are applied to
    # that repr. NOTE(review): response.text may be what was intended — confirm.
    body = str(response.body)
    # Raw strings with escaped dots: the original unescaped "." before
    # "jpg" matched ANY character, so e.g. "...Xjpg" also matched.
    paturl = r"(http://pic\.qiantucdn\.com/58pic/.*?)\.jpg"
    item['picurl'] = re.compile(paturl).findall(body)
    patid = r"http://pic\.qiantucdn\.com/58pic/.*?/.*?/.*?/(.*?)\.jpg"
    item['picid'] = re.compile(patid).findall(body)
    yield item
def parse(self, response):
    """Scrape image URLs/ids from this page, then queue category pages 1-100.

    Yields one QtpjtItem ('picurl': CDN URLs sans ".jpg"; 'picid': bare
    file-name segments), then a Request per listing page back into this
    same callback.
    """
    item = QtpjtItem()
    # Hoisted: the original converted response.body twice.
    # NOTE(review): str(bytes) gives a "b'...'" repr — response.text may be intended.
    body = str(response.body)
    # Raw strings with escaped dots; unescaped "." before "jpg" matched any char.
    paturl = r"(http://pic\.qiantucdn\.com/58pic/.*?)\.jpg"
    item['picurl'] = re.compile(paturl).findall(body)
    patlocal = r"http://pic\.qiantucdn\.com/58pic/.*?/.*?/.*?/(.*?)\.jpg"
    item['picid'] = re.compile(patlocal).findall(body)
    yield item
    # Follow listing pages 1..100 through this callback.
    for i in range(1, 101):
        nexturl = "http://www.58pic.com/piccate/3-0-0-" + str(i) + ".html"
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    """Scrape image URLs and title-based ids, then queue pages 1-19.

    Yields one QtpjtItem, then a Request per listing page back into this
    callback.
    """
    item = QtpjtItem()
    # BUG FIX: "[jpeg|jpg]" was a character class matching exactly ONE of
    # the letters j/p/e/g (or a literal "|"), not the intended alternation.
    # Use a non-capturing group; dots escaped as literal dots.
    picurl = r"(http://pic\.qiantucdn\.com/58pic/.*?)\.(?:jpeg|jpg)"
    item['picurl'] = re.compile(picurl).findall(str(response.body))
    # BUG FIX: .extract() so 'picid' holds strings, not Selector objects
    # (the original stored the raw SelectorList).
    item['picid'] = response.xpath(
        '//a[@class="bottom-title"]/text()').extract()
    yield item
    for i in range(1, 20):
        nexturl = 'http://www.58pic.com/piccate/3-0-0-' + str(i) + '.html'
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    """Scrape image URLs/ids from this page, then queue detail pages 1-200.

    Yields one QtpjtItem, then a Request per "/tb/id-N.html" page back
    into this callback.
    """
    item = QtpjtItem()
    # NOTE(review): "retest" is presumably the re module imported under an
    # alias elsewhere in this file — confirm.
    # Hoisted: the body repr was built twice in the original.
    body = str(response.body)
    # Raw strings with escaped dots; unescaped "." before "jpg" matched any char.
    paturl = r"(http://pic\.qiantucdn\.com/58pic/.*?)\.jpg"
    item["picurl"] = retest.compile(paturl).findall(body)
    patlocal = r"http://pic\.qiantucdn\.com/58pic/.*?/.*?/.*?/(.*?)\.jpg"
    item["picid"] = retest.compile(patlocal).findall(body)
    yield item
    for i in range(1, 201):
        nexturl = "http://www.58pic.com/tb/id-" + str(i) + ".html"
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    """Collect mooncake-category picture URLs/ids, then follow pages 2-3.

    Yields one QtpjtItem for this page, then one Request per remaining
    listing page, re-entering this callback.
    """
    page_text = str(response.body)
    item = QtpjtItem()
    # Case-insensitive scan of the raw body repr for CDN image URLs
    # (capture excludes ".jpg") and for the bare file-name ids.
    item['picurl'] = re.findall(
        "(http://pic.qiantucdn.com/58pic/.*?).jpg", page_text, re.I)
    item['picid'] = re.findall(
        "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg",
        page_text, re.I)
    yield item
    # Walk listing pages 2 and 3 through this same callback.
    for page in (2, 3):
        nexturl = ("http://www.58pic.com/tupian/yuebing-0-0-"
                   + str(page) + ".html")
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    """Pull thumbnail-variant image URLs and ids, then queue pages 1-10.

    The "!qt324" suffix selects 58pic's 324px thumbnail rendition; the
    capture keeps the URL up to and including "jpg!". Yields one
    QtpjtItem, then a Request per listing page into this callback.
    """
    raw = str(response.body)
    item = QtpjtItem()
    url_re = re.compile(
        "(http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/.*?.jpg!)qt324")
    id_re = re.compile(
        "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg")
    item["picurl"] = url_re.findall(raw)
    item["picid"] = id_re.findall(raw)
    yield item
    # Re-enter this callback for listing pages 1 through 10.
    page = 1
    while page <= 10:
        nexturl = ("http://www.58pic.com/piccate/3-0-0-default-0_2_0_0_default_0-"
                   + str(page) + ".html")
        yield Request(nexturl, callback=self.parse)
        page += 1
def parse(self, response):
    """Crawl a 58pic listing page: ensure a per-category folder exists on
    disk, then synchronously fetch each thumbnail's detail page with
    urllib and yield the full image URL found there.

    NOTE(review): this callback performs blocking network I/O (urllib)
    inside Scrapy, installs a global opener, and writes to a hard-coded
    Windows path — all behavior-bearing, left untouched here.
    """
    item = QtpjtItem()
    # Category name shown on the page; used as the on-disk folder name.
    item["picfolder"] = response.xpath(
        "//em[@class='text-green-b']/text()").extract()
    # IndexError if the xpath matched nothing — assumes the page always
    # carries the category element; TODO confirm.
    folder = os.path.exists(
        'C:\\Users\\leishen\\Documents\\anaconda3\\scrapy\\master python scrapy\\chapter 19\\pic'
        + '\\' + item["picfolder"][0])
    if not folder:
        os.mkdir(
            'C:\\Users\\leishen\\Documents\\anaconda3\\scrapy\\master python scrapy\\chapter 19\\pic'
            + '\\' + item["picfolder"][0])
    # Detail-page links for every thumbnail on this listing page.
    item["link"] = response.xpath(
        "//a[@class='thumb-box']/@href").extract()
    # Tested: works.
    # headers = {"Accept-Encoding":"utf-8,gb2312","User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"}
    headers = (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally so plain urlopen() below uses the UA header.
    urllib.request.install_opener(opener)
    for m in range(0, len(item["link"])):
        # Blocking fetch of the detail page (bypasses Scrapy's scheduler).
        data = urllib.request.urlopen(item["link"][m]).read()
        # Full-size image URL from the detail page's show-area markup.
        paturl = '<img src="(http.*?)".*?show-area-pic'
        item["picurl"] = re.compile(paturl).findall(str(data))
        # NOTE(review): the same item object is mutated and re-yielded
        # every iteration — each yield overwrites the previous picurl.
        yield item
        # data is the click/comment-count data for all posts on the blog list page
        # data = urllib.request.urlopen(item["link"][m]).read().decode('gb2312')
        # for k in range(0, len(item["picurl"])):
        #     patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
        #     item["picid"][k]=re.compile(patlocal).findall(str(item["picurl"][k]))[0]+"-"+str(k)
        # item["picid"]=tupianm
        # picid = 'id="show-area-pic".*?alt="(.*?)"'
        # item["picid"] = re.compile(picid).findall(str(data))
    # Iterate further picture list pages (comment said 1-200, but
    # range(2, 3) only visits page 2 — TODO confirm intent).
    for i in range(2, 3):
        # Build the next list-page URL.
        nexturl = ("http://www.58pic.com/piccate/3-0-0-default-0_2_0_0_default_0-"
                   + str(i) + ".html")
        yield Request(nexturl, callback=self.parse)
    # NOTE(review): this triple-quote opens a commented-out block that
    # continues beyond this chunk (the truncated variant below it).
    '''
def parse(self, response): item=QtpjtItem() #构建提取缩略图网址的正则表达式 paturl="(http://pic.qiantucdn.com/58pic/.*?).jpg"