def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath(
        "//div[contains(@class, 'uibox')]/ul/li//img/@src").getall()
    urls = list(map(lambda x: "https:" + x.replace("t_", ""), srcs))
    item = BmwItem(category=category, image_urls=urls)
    yield item
def parse_page(self, response):
    category = response.xpath('//div[@class="uibox"]/div/text()').get()
    # self.log(category)
    srcs = response.xpath("//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
    srcs = list(map(lambda x: response.urljoin(x.replace('t_', '')), srcs))
    # self.log(srcs)
    yield BmwItem(category=category, image_urls=srcs)
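# Most snippets here assume an item class like the following (a minimal
# sketch; the exact field name for the category varies per snippet — some
# use `title`, `sort`, or `partname` instead). `image_urls` and `images`
# are the default field names Scrapy's ImagesPipeline looks for:
import scrapy

class BmwItem(scrapy.Item):
    category = scrapy.Field()
    image_urls = scrapy.Field()  # URLs to download, read by ImagesPipeline
    images = scrapy.Field()      # download results, filled in by the pipeline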
def parse(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")
    for uibox in uiboxs:
        category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        urls = list(map(lambda url: response.urljoin(url), urls))
        item = BmwItem(category=category, image_urls=urls)
        yield item
def parse(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        partname = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        urls = list(map(lambda x: response.urljoin(x), urls))
        item = BmwItem(partname=partname, urls=urls)
        yield item
def parse_item(self, response):
    # extract_first()/extract() are the legacy aliases of get()/getall()
    sort = response.xpath('//div[@class="uibox"]/div[@class="uibox-title"]/text()').extract_first()
    img_urls = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').extract()
    # swap the thumbnail size segment for the full-size one
    img_urls = list(map(lambda x: x.replace("240x180_0_q95_c42", "800x0_1_q95"), img_urls))
    img_urls = list(map(lambda x: response.urljoin(x), img_urls))
    item = BmwItem(sort=sort, image_urls=img_urls)
    print(img_urls)
    yield item
def parse_item(self, response):
    img_urls = response.xpath('//div[@class="uibox"]//img/@src').getall()
    img_urls = list(map(lambda url: url.replace("t_", ''), img_urls))
    img_urls = list(map(lambda url: response.urljoin(url), img_urls))
    category = response.xpath(
        '//div[@class="uibox-title"]/text()').getall()
    category = [i.strip() for i in category if i.strip()][0]
    yield BmwItem(category=category, image_urls=img_urls)
def parse(self, response):
    uiboxes = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxes:
        category = uibox.xpath(
            "./div[@class='uibox-title']/a/text()").get()
        image_urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        image_urls = list(map(lambda url: "https:" + url, image_urls))
        item = BmwItem(category=category, image_urls=image_urls)
        yield item
def parse(self, response):
    uiboxs = response.xpath('//div[@class="uibox"]')[1:]
    for uibox in uiboxs:
        category = uibox.xpath(
            './/div[@class="uibox-title"]/a/text()').get()
        image_urls = uibox.xpath('.//ul/li/a/img/@src').getall()
        urls = list(map(lambda url: response.urljoin(url), image_urls))
        item = BmwItem(category=category, image_urls=urls)
        yield item
def parse_item(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath("//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
    # print(srcs)
    # for src in srcs:
    #     src = response.urljoin(src)
    #     print(src)
    srcs = list(map(lambda x: response.urljoin(x.replace("240x180_0_q95_c42_", "")), srcs))
    yield BmwItem(category=category, image_urls=srcs)
def img_tp(self, response):
    # strip the surrounding ["..."] from the stringified one-element list
    bmbq = str(response.meta.get('info'))[2:-2]
    img1 = []
    img = response.xpath('//*[@id="img"]/@src').get()
    img_new = response.urljoin(img)
    img1.append(img_new)
    print(img1)
    item = BmwItem(category=bmbq, image_urls=img1)
    yield item
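# `img_tp` above expects the requesting callback to pass the category in
# through meta. A hedged sketch of that handoff, as a method on the same
# spider — the `info` key comes from the snippet, but the selectors and the
# one-element-list value (which the `[2:-2]` slice above unwraps) are
# assumptions:
def parse(self, response):
    for li in response.xpath("//div[@class='uibox']//ul/li"):
        detail_url = response.urljoin(li.xpath("./a/@href").get())
        category = li.xpath("./a/img/@alt").getall()  # assumed one-element list
        yield scrapy.Request(detail_url, callback=self.img_tp,
                             meta={'info': category})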
def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath(
        "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
    # srcs = list(map(lambda x: x.replace("t_", ""), srcs))
    # urls = []
    # for src in srcs:
    #     url = response.urljoin(src)
    #     urls.append(url)
    srcs = list(map(lambda x: response.urljoin(x.replace("t_", "")), srcs))
    yield BmwItem(category=category, image_urls=srcs)
def parse_page(self, response):
    category = response.xpath('//div[@class="uibox"]/div/text()').get()
    srcs = response.xpath('//div[contains(@class,"uibox-con")]/ul/li/a/img/@src').getall()
    # srcs = list(map(lambda x: x.replace('240x180_0_q95_c42_', ''), srcs))
    # urls = []
    # for src in srcs:
    #     url = response.urljoin(src)
    #     urls.append(url)
    # srcs = list(map(lambda x: response.urljoin(x), srcs))
    srcs = list(map(lambda x: response.urljoin(x.replace('240x180_0_q95_c42_', '')), srcs))
    yield BmwItem(category=category, image_urls=srcs)
def parse(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        category = uibox.xpath(
            ".//div[@class='uibox-title']/a/text()").get()
        img_urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        # for img_url in img_urls:
        #     img_url = response.urljoin(img_url)
        #     print(img_url)
        urls = list(
            map(lambda img_url: response.urljoin(img_url), img_urls))
        yield BmwItem(category=category, image_urls=urls)
def parse_old(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        category = uibox.xpath("div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        # for url in urls:
        #     url = response.urljoin(url)
        #     print(url)
        urls = list(map(lambda url: response.urljoin(url), urls))
        item = BmwItem(category=category, image_urls=urls)
        yield item
    print(response.request.meta['proxy'])
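# `parse_old` prints request.meta['proxy'], which implies a proxy downloader
# middleware is enabled elsewhere. A minimal sketch of one (the proxy address
# is a placeholder, and the class would still need registering in
# DOWNLOADER_MIDDLEWARES):
class ProxyDownloaderMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://127.0.0.1:8888'  # hypothetical proxy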
def parse(self, response):
    # skip element 0; start from the second element with [1:]
    uiboxs = response.xpath('//div[@class="uibox"]')[1:]
    for uibox in uiboxs:
        title = uibox.xpath('./div[@class="uibox-title"]/a[1]/text()').get()
        hrefs = uibox.xpath('.//ul/li/a/img/@src').getall()
        # for href in hrefs:
        #     href = response.urljoin(href)
        #     print(href)
        hrefs = list(map(lambda href: response.urljoin(href), hrefs))
        item = BmwItem(title=title, image_urls=hrefs)
        yield item
def parse_pic(self, response):
    category = response.xpath(
        "//div[@class='uibox']/div/text()").extract_first()
    srcs = response.xpath(
        "//div[contains(@class,'uibox-con carpic-list03')]//li//img/@src"
    ).extract()
    for src in srcs:
        src = src.replace("/t_", "/1024x0_1_q87_")
        image_url = "https:" + src
        yield BmwItem(category=category, image_urls=[image_url],
                      page_url=response.url)
def parse(self, response):
    # SelectorList -> list
    uiboxs = response.xpath(".//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        # for url in urls:
        #     url = response.urljoin(url)
        #     print(url)
        urls = list(map(lambda url: response.urljoin(url), urls))
        item = BmwItem(category=category, image_urls=urls)
        yield item
def test_parse(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        category = uibox.xpath(
            ".//div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//li/a/img/@src").getall()
        # for url in urls:
        #     url = "https:" + url
        #     url = response.urljoin(url)
        # wrap in list(): a bare map object prints as <map ...> and is
        # exhausted after a single pass
        urls = list(map(lambda url: response.urljoin(url), urls))
        print(urls)
        item = BmwItem(category=category, image_urls=urls)
        yield item
def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath(
        "//div[contains(@class,'uibox-con')]/ul/li/a/img/@src").getall()
    urls = list(
        map(
            lambda x: response.urljoin(
                x.replace("240x180_0_q95_c42", "800x0_1_q95")),
            srcs))
    # for src in srcs:
    #     url = response.urljoin(src)
    #     urls.append(url)
    item = BmwItem(category=category, image_urls=urls)
    yield item
def parse(self, response):
    uiboxs = response.xpath('//div[@class="uibox"]')
    for uibox in uiboxs:
        title = uibox.xpath('./div[@class="uibox-title"]/a/text()').get()
        img_urls = uibox.xpath('.//ul/li[position()<last()]//img/@src').getall()
        # tran_bigurl is assumed to be a module-level compiled regex that
        # strips the thumbnail marker from the URL
        img_urls = [tran_bigurl.sub('', response.urljoin(img_url))
                    for img_url in img_urls]
        item = BmwItem(
            title=title,
            image_urls=img_urls,
        )
        yield item
def parse(self, response):
    # `response` is the response object Scrapy hands back to us; use it directly
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        # grab the category name
        category = uibox.xpath(
            ".//div[@class='uibox-title']/a/text()").get()
        # grab the image download paths
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        # map takes a function and a list; the lambda uses urljoin to
        # complete each url from the urls list
        urls = list(map(lambda url: response.urljoin(url), urls))
        # build a BmwItem with the required fields
        item = BmwItem(category=category, image_urls=urls)
        yield item
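# What `response.urljoin` does to the protocol-relative `@src` values the
# page serves: Response.urljoin is a thin wrapper around urllib's urljoin
# with the response URL as base, so the effect can be shown standalone
# (the sample src value is an illustrative placeholder):
from urllib.parse import urljoin

page = "https://car.autohome.com.cn/pic/series/65.html"
src = "//car3.autoimg.cn/cardfs/product/g30/t_autohomecar__sample.jpg"
print(urljoin(page, src))
# https://car3.autoimg.cn/cardfs/product/g30/t_autohomecar__sample.jpg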
def test_parse(self, response):
    uiboxs = response.xpath("//div[@class='uibox']")[1:]
    for uibox in uiboxs:
        category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
        urls = uibox.xpath(".//ul/li/a/img/@src").getall()
        # for url in urls:
        #     url = response.urljoin(url)
        #     self.log(url)
        # self.log(category)
        # self.log(urls)
        urls = list(map(lambda url: response.urljoin(url), urls))
        item = BmwItem(category=category, urls=urls)
        yield item
def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    print(category)
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    srcs = response.xpath(
        "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
    # turn the map into a list, rewrite each thumbnail url into the
    # full-size url, then prepend the scheme
    srcs = list(
        map(
            lambda url: response.urljoin(
                url.replace("240x180_0_q95_c42", "1024x0_1_q95")),
            srcs))
    # map(lambda url: response.urljoin(url), srcs)
    # for i in srcs:
    #     print(i)
    yield BmwItem(category=category, image_urls=srcs)
def parse(self, response):
    uiboxes = response.xpath(r'//div[@class="uibox"]')[1:]  # [1:] skips the first element
    for uibox in uiboxes:
        # the leading dot scopes the xpath to the current node; get() returns one result
        category = uibox.xpath(r'.//div[@class="uibox-title"]/a/text()').get()
        # print(category)  # print the title
        urls = uibox.xpath(r'.//ul/li/a/img/@src').getall()  # getall() extracts every image link
        # print(urls)  # the links obtained this way are missing the leading https:
        '''
        for url in urls:
            # url = "https:" + url  # option 1: prepend https: manually
            url = response.urljoin(url)  # option 2: urljoin prepends https: automatically
            print(url)
        '''
        urls = list(map(lambda url: response.urljoin(url), urls))  # option 3: same, via map
        # print(urls)
        item = BmwItem(category=category, urls=urls)
        yield item
def parse_page(self, response):
    # the first match isn't needed, hence [1:]
    # uiboxs = response.xpath("//div[@class='uibox']")[1:]
    # categorys = []
    # for uibox in uiboxs:
    #     # get() only returns the first text node
    #     category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
    #     categorys.append(category)
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    # the class attribute holds several names, so use contains() with just one of them
    srcs = response.xpath(
        "//div[contains(@class,'uibox-con')]/ul/li/a/img/@src").getall()
    srcs = list(map(lambda x: x.replace("240x180_0_q95_c42_", ""), srcs))
    # urls = []
    # for src in srcs:
    #     url = response.urljoin(src)
    #     urls.append(url)
    srcs = list(map(lambda x: response.urljoin(x), srcs))
    yield BmwItem(category=category, image_urls=srcs)
def parse_page(self, response):
    # with CrawlSpider you must not override the parse method
    title = response.xpath('//div[@class="uibox"]/div/text()').extract()[0]
    # href is a reference, mostly used for css files or <a> hyperlinks
    # src is an embed, mostly used for images, js, and iframes
    # the div's class is "uibox-con carpic-list03 border-b-solid" — several
    # names separated by spaces, so match with contains(); uibox-con is the
    # structural class, while carpic-list03 and border-b-solid are styling
    url_list = response.xpath(
        '//div[contains(@class,"uibox-con")]//img/@src').extract()
    # the images on this page only load after rewriting to the 1024... prefix
    if '202-51' in response.url:
        url_list = list(
            map(
                lambda url: response.urljoin(
                    url.replace("autohomecar", "1024x0_1_q95_autohomecar")),
                url_list))
    url_list = list(
        map(
            lambda url: response.urljoin(
                url.replace("240x180_0_q95_c42_", "")),
            url_list))
    item = BmwItem(title=title, image_urls=url_list)
    yield item
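# The first comment in `parse_page` above refers to CrawlSpider, whose rule
# mechanism owns parse(). A minimal sketch of how such a spider would route
# picture pages to parse_page (the allow pattern is an assumption based on
# autohome's /pic/series/ URLs):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BmwCrawlSpider(CrawlSpider):
    name = 'bmw_crawl'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
    rules = (
        Rule(LinkExtractor(allow=r'pic/series/65.+'),
             callback='parse_page', follow=True),
    )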
def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath(
        '//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
    srcs = list(map(lambda x: response.urljoin(x.replace("t_", "")), srcs))
    yield BmwItem(category=category, image_urls=srcs)
import scrapy
from bmw.items import BmwItem


class Bmw5Spider(scrapy.Spider):
    name = 'bmw5'
    allowed_domains = ['autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        uiboxs = response.xpath("//div[@class='uibox']")[1:]
        for uibox in uiboxs:
            category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath(".//ul/li/a/img/@src").getall()
            # for url in urls:
            #     # url = "https:" + url
            #     url = response.urljoin(url)
            #     print(url)
            urls = list(map(lambda url: response.urljoin(url), urls))
            item = BmwItem(category=category, image_urls=urls)
            yield item
def parse_page(self, response):
    category = response.xpath("//div[@class='uibox']/div/text()").get()
    srcs = response.xpath("//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
    # strip the thumbnail's t_ prefix and prepend https:// to get the
    # full-resolution image address
    srcs = list(map(lambda x: response.urljoin(x.replace("t_", "")), srcs))
    yield BmwItem(category=category, image_urls=srcs)
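# Every variant above that fills `image_urls` hands off to Scrapy's built-in
# ImagesPipeline. A minimal settings.py sketch to enable it (the IMAGES_STORE
# path is an assumption; any writable directory works):
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = './images'  # directory where downloaded images are saved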