def parse(self, response):
    """Yield one ImgsproItem per ``<li>`` of the image list.

    Each item carries, under the ``'src'`` key, the first ``src`` attribute
    found at ``./div/a/div/img`` below the list entry (whatever
    ``extract_first()`` returns for that query).
    """
    entries = response.xpath('/html/body/div/div[2]/div[4]/ul/li')
    for entry in entries:
        image_url = entry.xpath('./div/a/div/img/@src').extract_first()
        record = ImgsproItem()
        record['src'] = image_url
        yield record
def parse(self, response):
    """Yield one ImgsproItem per child ``<div>`` of ``#container``.

    The image address is taken from the ``src2`` pseudo-attribute rather
    than ``src``. Every item is printed before it is yielded.
    """
    for block in response.xpath('//div[@id="container"]/div'):
        # Note: read the pseudo-attribute src2, not src.
        image_url = block.xpath('./div/a/img/@src2').extract_first()
        record = ImgsproItem()
        record['src'] = image_url
        print(record)
        yield record
def parse(self, response):
    """Yield one ImgsproItem per image found under ``#container``.

    The address is read from the ``src2`` pseudo-attribute and prefixed with
    ``'https:'`` before being stored under the item's ``'src'`` key.

    Container blocks for which the ``src2`` query returns ``None`` are
    skipped; previously the string concatenation raised ``TypeError`` and
    aborted the whole callback on the first such block.
    """
    for div in response.xpath('//div[@id="container"]/div'):
        raw_src = div.xpath('./div/a/img/@src2').get()
        if raw_src is None:
            # No src2 value in this block, so there is no address to build.
            continue
        item = ImgsproItem()
        item['src'] = 'https:' + raw_src
        yield item
def parse(self, response):
    """Yield one ImgsproItem per image found under ``#container``.

    The address is read from the ``src2`` pseudo-attribute, cut at its first
    underscore and rebuilt as ``'https:' + <prefix> + '.jpg'``. Each rebuilt
    address is printed and stored under the item's ``'src'`` key.

    Container blocks for which the ``src2`` query returns ``None`` are
    skipped; previously ``None.split`` raised ``AttributeError`` and aborted
    the whole callback.
    """
    for div in response.xpath('//*[@id="container"]/div'):
        # src2 is the pseudo-attribute that holds the image address.
        raw_src = div.xpath('./div/a/img/@src2').extract_first()
        if raw_src is None:
            continue
        # Keep only the text before the first '_' and re-append '.jpg'.
        # NOTE(review): presumably this drops a thumbnail-size suffix;
        # confirm against the target site's URL scheme.
        src = 'https:' + raw_src.split('_')[0] + '.jpg'
        print(src)
        item = ImgsproItem()
        item['src'] = src
        yield item
def parse(self, response):
    """Yield one ImgsproItem per child ``<div>`` of ``#container``.

    The item's ``'src'`` key receives the value of the image's ``src2``
    pseudo-attribute.
    """
    for block in response.xpath('//div[@id="container"]/div'):
        # Anti-scraping measure: the page lazy-loads its images. The address
        # only moves from src2 to src once the image scrolls into view, so
        # the countermeasure is to read the pseudo-attribute src2 directly.
        image_url = block.xpath('./div/a/img/@src2').extract_first()
        record = ImgsproItem()
        # Hand the address over to the item.
        record['src'] = image_url
        yield record
def parse(self, response):
    """Yield one ImgsproItem per child ``<div>`` of ``#container``.

    Each item carries the image address (``'src'``, read from the ``src2``
    pseudo-attribute) and a name (``'img_name'``, read from the ``alt``
    attribute of the enclosing link).
    """
    for block in response.xpath('//div[@id="container"]/div'):
        title = block.xpath('./div/a/@alt').extract_first()
        # The page lazy-loads images: only the image currently in view has a
        # src attribute, the others use src2. Nothing is rendered here, so
        # every image is still on src2.
        image_url = block.xpath('./div/a/img/@src2').extract_first()
        record = ImgsproItem()
        record['src'] = image_url
        record['img_name'] = title
        yield record
def parse(self, response):
    """Yield one ImgsproItem per child ``<div>`` of ``#container``.

    The image address is read from the ``src2`` pseudo-attribute, printed,
    and stored under the item's ``'src'`` key.
    """
    blocks = response.xpath('//div[@id="container"]/div')
    for block in blocks:
        # Note: the page lazy-loads its images, so the address is read from
        # the pseudo-attribute src2.
        image_url = block.xpath('./div/a/img/@src2').extract_first()
        print(image_url)
        record = ImgsproItem()
        record['src'] = image_url
        # Yield the item onward (to the pipeline, per the original note).
        yield record
def parse(self, response):
    """Yield one ImgsproItem per child ``<div>`` of ``#container``.

    The headers of the request that produced ``response`` are printed first.
    Each item then carries the image address (``'src'``, from the ``src2``
    pseudo-attribute) and the image's ``alt`` text (``'name'``).
    """
    # Show the request headers that were sent.
    print(response.request.headers)
    for block in response.xpath('//div[@id="container"]/div'):
        image = block.xpath('./div/a/img')
        # Note: read the pseudo-attribute src2, not src.
        image_url = image.xpath('./@src2').extract_first()
        title = image.xpath('./@alt').extract_first()
        record = ImgsproItem()
        record['src'] = image_url
        record['name'] = title
        yield record
def parse(self, response):
    """Yield ImgsproItems for at most the first three images in ``#container``.

    Each item carries the image address (``'src'``: ``'https:'`` plus the
    ``src2`` pseudo-attribute) and a name (``'imgName'``: the ``alt``
    attribute of the enclosing link). A progress line is printed for every
    item.

    Container blocks missing either value are skipped and do not count
    toward the limit; previously indexing the empty result with ``[0]``
    raised ``IndexError`` and aborted the whole callback.
    """
    max_items = 3  # the callback deliberately stops after this many items
    emitted = 0
    for div in response.xpath('//*[@id="container"]/div'):
        # Note: the images are lazy-loaded, so read the pseudo-attribute src2.
        raw_src = div.xpath('./div/a/img/@src2').extract_first()
        imgName = div.xpath('./div/a/@alt').extract_first()
        if raw_src is None or imgName is None:
            continue
        src = 'https:' + raw_src
        # Progress message; reads "Crawling <name>: <src>".
        print('正在爬取%s: %s' % (imgName, src))
        item = ImgsproItem()
        item['src'] = src
        item['imgName'] = imgName
        yield item
        emitted += 1
        if emitted == max_items:
            break