Example #1
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     The category is the first text node matched by
     ``//div[@class='uibox']/div/text()`` (``None`` when nothing matches).
     Every ``img/@src`` found has each ``"t_"`` occurrence removed
     (presumably the thumbnail marker -- confirm against the site) and is
     then resolved against the response URL.
     """
     category = response.xpath("//div[@class='uibox']/div/text()").get()
     srcs = response.xpath(
         "//div[contains(@class, 'uibox')]/ul/li//img/@src").getall()
     # urljoin() rather than a hard-coded "https:" prefix: protocol-relative
     # srcs still inherit the page's scheme, while absolute or path-relative
     # srcs are no longer turned into "https:https://..." or "https:/path".
     urls = [response.urljoin(src.replace("t_", "")) for src in srcs]
     yield BmwItem(category=category, image_urls=urls)
Example #2
0
 def parse_page(self, response):
     """Emit a single ``BmwItem`` describing this page.

     ``category`` is the first text node under a child ``div`` of the
     ``uibox`` div; ``image_urls`` are the ``img/@src`` values inside the
     ``uibox-con`` lists with every ``'t_'`` removed and then resolved
     against the response URL.
     """
     heading = response.xpath('//div[@class="uibox"]/div/text()').get()
     raw_srcs = response.xpath(
         "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
     absolute = [response.urljoin(raw.replace('t_', '')) for raw in raw_srcs]
     yield BmwItem(category=heading, image_urls=absolute)
 def parse(self, response):
     """Yield one ``BmwItem`` per ``div`` whose class is exactly ``uibox``.

     For each block, the category is the first link text under its
     ``uibox-title`` div and the image URLs are the ``img/@src`` values
     under ``ul/li/a``, resolved against the response URL.
     """
     for box in response.xpath("//div[@class='uibox']"):
         name = box.xpath(".//div[@class='uibox-title']/a/text()").get()
         srcs = box.xpath(".//ul/li/a/img/@src").getall()
         absolute = [response.urljoin(src) for src in srcs]
         yield BmwItem(category=name, image_urls=absolute)
Example #4
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     Each item carries ``partname`` (first link text of the block's
     ``uibox-title`` div) and ``urls`` (the block's ``img/@src`` values
     resolved against the response URL).
     """
     boxes = response.xpath("//div[@class='uibox']")
     for box in boxes[1:]:
         label = box.xpath(".//div[@class='uibox-title']/a/text()").get()
         links = [
             response.urljoin(src)
             for src in box.xpath(".//ul/li/a/img/@src").getall()
         ]
         yield BmwItem(partname=label, urls=links)
Example #5
0
 def parse_item(self, response):
     """Yield one ``BmwItem`` with the page's ``sort`` label and image URLs.

     Each ``img/@src`` has ``"240x180_0_q95_c42"`` swapped for
     ``"800x0_1_q95"`` (presumably selecting a larger rendition -- confirm)
     before being resolved against the response URL. The resulting list is
     also printed to stdout.
     """
     label = response.xpath(
         '//div[@class="uibox"]/div[@class="uibox-title"]/text()'
     ).extract_first()
     raw_srcs = response.xpath(
         '//div[contains(@class,"uibox-con")]/ul/li//img/@src').extract()
     full_urls = [
         response.urljoin(raw.replace("240x180_0_q95_c42", "800x0_1_q95"))
         for raw in raw_srcs
     ]
     result = BmwItem(sort=label, image_urls=full_urls)
     print(full_urls)
     yield result
Example #6
0
 def parse_item(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     Image URLs are all ``img/@src`` values under the ``uibox`` div, with
     every ``"t_"`` removed and then resolved against the response URL.
     The category is the first non-blank text node of any ``uibox-title``
     div, stripped of surrounding whitespace, or ``None`` when there is no
     such text.
     """
     srcs = response.xpath('//div[@class="uibox"]//img/@src').getall()
     img_urls = [response.urljoin(src.replace("t_", "")) for src in srcs]
     titles = response.xpath(
         '//div[@class="uibox-title"]/text()').getall()
     # next() with a default replaces ``[...][0]``, which raised IndexError
     # (and so dropped the item) on pages without any non-blank title text.
     category = next((t.strip() for t in titles if t.strip()), None)
     yield BmwItem(category=category, image_urls=img_urls)
Example #7
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     The category is the first link text of the block's direct
     ``uibox-title`` child; the image URLs are the block's ``img/@src``
     values resolved against the response URL.
     """
     uiboxes = response.xpath("//div[@class='uibox']")[1:]
     for uibox in uiboxes:
         category = uibox.xpath(
             "./div[@class='uibox-title']/a/text()").get()
         srcs = uibox.xpath(".//ul/li/a/img/@src").getall()
         # urljoin() rather than a hard-coded "https:" prefix: same result
         # for protocol-relative srcs on an https page, but absolute or
         # path-relative srcs are no longer corrupted.
         image_urls = [response.urljoin(src) for src in srcs]
         yield BmwItem(category=category, image_urls=image_urls)
Example #8
0
 def parse(self, response):
     """Yield a ``BmwItem`` for each ``uibox`` div after the first one.

     ``category`` is the first link text in the block's ``uibox-title``
     div; ``image_urls`` holds the block's ``img/@src`` values resolved
     against the response URL.
     """
     for section in response.xpath('//div[@class="uibox"]')[1:]:
         heading = section.xpath(
             './/div[@class="uibox-title"]/a/text()').get()
         resolved = []
         for src in section.xpath('.//ul/li/a/img/@src').getall():
             resolved.append(response.urljoin(src))
         yield BmwItem(category=heading, image_urls=resolved)
Example #9
0
 def parse_item(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     Each ``img/@src`` inside the ``uibox-con`` lists has the
     ``"240x180_0_q95_c42_"`` fragment removed and is then resolved against
     the response URL.
     """
     heading = response.xpath("//div[@class='uibox']/div/text()").get()
     raw_srcs = response.xpath(
         "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
     cleaned = (raw.replace("240x180_0_q95_c42_", "") for raw in raw_srcs)
     full_urls = [response.urljoin(path) for path in cleaned]
     yield BmwItem(category=heading, image_urls=full_urls)
Example #10
0
 def img_tp(self, response):
     """Yield a ``BmwItem`` for the element with ``id="img"`` on the page.

     The category is the string form of ``response.meta['info']`` with its
     first two and last two characters removed (presumably unwrapping a
     one-element list such as ``['name']`` -- TODO confirm with the code
     that sets ``meta``). Nothing is yielded when the page has no
     ``id="img"`` element with a ``src``.
     """
     bmbq = str(response.meta.get('info'))[2:-2]
     img = response.xpath('//*[@id="img"]/@src').get()
     if img is None:
         # Joining a missing src would resolve to the page URL itself and
         # report the HTML page as an image, so skip the item instead.
         return
     img1 = [response.urljoin(img)]
     print(img1)
     # The item receives this very list object. The former ``img1.pop()``
     # after the yield emptied it behind the item's back (unless BmwItem
     # copies its values), so the list is now left untouched.
     yield BmwItem(category=bmbq, image_urls=img1)
Example #11
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` built from the whole page.

     ``category`` is the first text node under a child ``div`` of the
     ``uibox`` div. ``image_urls`` are the ``img/@src`` values in the
     ``uibox-con`` lists, with every ``"t_"`` removed and then resolved
     against the response URL.
     """
     page_category = response.xpath(
         "//div[@class='uibox']/div/text()").get()
     image_urls = []
     for raw in response.xpath(
             "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall():
         image_urls.append(response.urljoin(raw.replace("t_", "")))
     yield BmwItem(category=page_category, image_urls=image_urls)
Example #12
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     The ``img/@src`` values under ``ul/li/a`` of the ``uibox-con`` divs
     have ``'240x180_0_q95_c42_'`` removed and are then resolved against the
     response URL.
     """
     heading = response.xpath('//div[@class="uibox"]/div/text()').get()
     thumbs = response.xpath(
         '//div[contains(@class,"uibox-con")]/ul/li/a/img/@src').getall()
     full_urls = [
         response.urljoin(thumb.replace('240x180_0_q95_c42_', ''))
         for thumb in thumbs
     ]
     yield BmwItem(category=heading, image_urls=full_urls)
Example #13
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     Each item pairs the block's first ``uibox-title`` link text with the
     block's ``img/@src`` values resolved against the response URL.
     """
     sections = response.xpath("//div[@class='uibox']")[1:]
     for section in sections:
         heading = section.xpath(
             ".//div[@class='uibox-title']/a/text()").get()
         srcs = section.xpath(".//ul/li/a/img/@src").getall()
         yield BmwItem(
             category=heading,
             image_urls=[response.urljoin(src) for src in srcs],
         )
Example #14
0
 def parse_old(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     The category is the first link text of the block's direct
     ``uibox-title`` child; the image URLs are the block's ``img/@src``
     values resolved against the response URL. After each item the
     request's ``proxy`` meta value is printed (``None`` when unset).
     """
     uiboxs = response.xpath("//div[@class='uibox']")[1:]
     for uibox in uiboxs:
         category = uibox.xpath("div[@class='uibox-title']/a/text()").get()
         srcs = uibox.xpath(".//ul/li/a/img/@src").getall()
         urls = [response.urljoin(src) for src in srcs]
         yield BmwItem(category=category, image_urls=urls)
         # .get() instead of ['proxy']: a request without a proxy used to
         # raise KeyError here, right after the first item, which aborted
         # the generator and lost every remaining block.
         print(response.request.meta.get('proxy'))
Example #15
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     ``title`` is the text of the first link in the block's direct
     ``uibox-title`` child; ``image_urls`` are the block's ``img/@src``
     values resolved against the response URL.
     """
     # Element 0 is not wanted, so start from index 1.
     for box in response.xpath('//div[@class="uibox"]')[1:]:
         heading = box.xpath('./div[@class="uibox-title"]/a[1]/text()').get()
         srcs = box.xpath('.//ul/li/a/img/@src').getall()
         absolute = [response.urljoin(src) for src in srcs]
         yield BmwItem(title=heading, image_urls=absolute)
Example #16
0
 def parse_pic(self, response):
     """Yield one ``BmwItem`` per image found on the page.

     Every item carries the page category (first text node under a child
     ``div`` of the ``uibox`` div), a one-element ``image_urls`` list and
     the page's own URL. In each ``img/@src`` the ``"/t_"`` fragment is
     replaced by ``"/1024x0_1_q87_"`` (presumably selecting a larger
     rendition -- confirm) before the src is resolved against the response
     URL.
     """
     category = response.xpath(
         "//div[@class='uibox']/div/text()").extract_first()
     srcs = response.xpath(
         "//div[contains(@class,'uibox-con carpic-list03')]//li//img/@src"
     ).extract()
     for src in srcs:
         resized = src.replace("/t_", "/1024x0_1_q87_")
         # urljoin() rather than a hard-coded "https:" prefix: same result
         # for protocol-relative srcs on an https page, but absolute or
         # path-relative srcs are no longer corrupted.
         yield BmwItem(category=category,
                       image_urls=[response.urljoin(resized)],
                       page_url=response.url)
Example #17
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     Each item holds the block's first ``uibox-title`` link text as
     ``category`` and the block's ``img/@src`` values, resolved against the
     response URL, as ``image_urls``.
     """
     # Slicing the SelectorList drops the first matched div.
     blocks = response.xpath(".//div[@class='uibox']")[1:]
     for block in blocks:
         label = block.xpath(".//div[@class='uibox-title']/a/text()").get()
         absolute = [
             response.urljoin(src)
             for src in block.xpath(".//ul/li/a/img/@src").getall()
         ]
         yield BmwItem(category=label, image_urls=absolute)
Example #18
0
 def test_parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     The category is the first link text in the block's ``uibox-title``
     div; the image URLs are the block's ``li/a/img/@src`` values resolved
     against the response URL. The URL list is also printed to stdout.
     """
     uiboxs = response.xpath("//div[@class='uibox']")[1:]
     for uibox in uiboxs:
         category = uibox.xpath(
             ".//div[@class = 'uibox-title']/a/text()").get()
         srcs = uibox.xpath(".//li/a/img/@src").getall()
         # Build a real list: a bare map() is a one-shot iterator, so the
         # item would hold something that is empty after its first
         # traversal and the print below would only show "<map object ...>".
         urls = [response.urljoin(src) for src in srcs]
         print(urls)
         yield BmwItem(category=category, image_urls=urls)
Example #19
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     Each ``img/@src`` under ``ul/li/a`` of the ``uibox-con`` divs has
     ``"240x180_0_q95_c42"`` replaced by ``"800x0_1_q95"`` (presumably a
     larger rendition -- confirm) and is then resolved against the response
     URL.
     """
     heading = response.xpath("//div[@class='uibox']/div/text()").get()
     thumbs = response.xpath(
         "//div[contains(@class,'uibox-con')]/ul/li/a/img/@src").getall()
     full_urls = []
     for thumb in thumbs:
         resized = thumb.replace("240x180_0_q95_c42", "800x0_1_q95")
         full_urls.append(response.urljoin(resized))
     yield BmwItem(category=heading, image_urls=full_urls)
Example #20
0
    def parse(self, response):
        """Yield one ``BmwItem`` per ``div`` whose class is exactly ``uibox``.

        ``title`` is the first link text of the block's direct
        ``uibox-title`` child. ``image_urls`` come from the ``img/@src``
        values in every ``li`` except the last of each list; each is
        resolved against the response URL and then passed through
        ``tran_bigurl.sub('', ...)``. ``tran_bigurl`` is defined outside this
        method (presumably a compiled regex matching the thumbnail size
        marker -- confirm).
        """
        for box in response.xpath('//div[@class="uibox"]'):
            heading = box.xpath('./div[@class="uibox-title"]/a/text()').get()
            srcs = box.xpath('.//ul/li[position()<last()]//img/@src').getall()
            big_urls = []
            for src in srcs:
                big_urls.append(tran_bigurl.sub('', response.urljoin(src)))
            yield BmwItem(title=heading, image_urls=big_urls)
Example #21
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     ``response`` is the response object Scrapy hands to the callback.
     """
     for section in response.xpath("//div[@class='uibox']")[1:]:
         # Category name of this block.
         name = section.xpath(
             ".//div[@class='uibox-title']/a/text()").get()
         # Image paths of this block, completed against the response URL.
         paths = section.xpath(".//ul/li/a/img/@src").getall()
         completed = [response.urljoin(path) for path in paths]
         # Hand both values to BmwItem.
         yield BmwItem(category=name, image_urls=completed)
Example #22
0
 def test_parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     Each item carries ``category`` (first link text of the block's
     ``uibox-title`` div) and ``urls`` (the block's ``img/@src`` values
     resolved against the response URL).
     """
     blocks = response.xpath("//div[@class='uibox']")
     for block in blocks[1:]:
         label = block.xpath(".//div[@class='uibox-title']/a/text()").get()
         srcs = block.xpath(".//ul/li/a/img/@src").getall()
         resolved = [response.urljoin(src) for src in srcs]
         yield BmwItem(category=label, urls=resolved)
Example #23
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     The category and a separator line are printed to stdout first. Each
     ``img/@src`` in the ``uibox-con`` lists then has ``"240x180_0_q95_c42"``
     replaced by ``"1024x0_1_q95"`` and is resolved against the response
     URL.
     """
     heading = response.xpath("//div[@class='uibox']/div/text()").get()
     print(heading)
     print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
     thumbs = response.xpath(
         "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
     # Turn each small-image URL into the large-image one, then add the
     # URL prefix.
     large_urls = [
         response.urljoin(thumb.replace("240x180_0_q95_c42", "1024x0_1_q95"))
         for thumb in thumbs
     ]
     yield BmwItem(category=heading, image_urls=large_urls)
Example #24
0
 def parse(self, response):
     """Yield a ``BmwItem`` for every ``uibox`` div except the first.

     Each item carries ``category`` (first link text of the block's
     ``uibox-title`` div) and ``urls`` (the block's ``img/@src`` values
     resolved against the response URL).
     """
     # [1:] drops the first matched element.
     for box in response.xpath(r'//div[@class="uibox"]')[1:]:
         # The leading dot scopes the query to the current block; get()
         # returns only the first match.
         heading = box.xpath(r'.//div[@class="uibox-title"]/a/text()').get()
         # getall() returns every match; the raw links lack the scheme.
         raw_links = box.xpath(r'.//ul/li/a/img/@src').getall()
         # urljoin() completes each link against the response URL.
         links = [response.urljoin(link) for link in raw_links]
         yield BmwItem(category=heading, urls=links)
Example #25
0
    def parse_page(self, response):
        """Yield one ``BmwItem`` with the page's category and image URLs.

        Each ``img/@src`` under ``ul/li/a`` of the ``uibox-con`` divs has
        ``"240x180_0_q95_c42_"`` removed and is then resolved against the
        response URL.
        """
        heading = response.xpath("//div[@class='uibox']/div/text()").get()

        # The div has several classes, so contains() is used with just one
        # of them.
        thumbs = response.xpath(
            "//div[contains(@class,'uibox-con')]/ul/li/a/img/@src").getall()
        full_urls = [
            response.urljoin(thumb.replace("240x180_0_q95_c42_", ""))
            for thumb in thumbs
        ]
        yield BmwItem(category=heading, image_urls=full_urls)
Example #26
0
 def parse_page(self, response):  # with CrawlSpider, parse() must not be overridden
     """Yield one ``BmwItem`` with the page's title and image URLs.

     Raises ``IndexError`` when no title text node is found. For responses
     whose URL contains ``'202-51'``, ``"autohomecar"`` in each src is first
     replaced by ``"1024x0_1_q95_autohomecar"``; in every case
     ``"240x180_0_q95_c42_"`` is then removed. URLs are resolved against
     the response URL.
     """
     title = response.xpath('//div[@class="uibox"]/div/text()').extract()[0]
     # The div's class is "uibox-con carpic-list03 border-b-solid", i.e.
     # several space-separated classes, hence contains(); only "uibox-con"
     # is matched because the other classes are styling.
     links = response.xpath(
         '//div[contains(@class,"uibox-con")]//img/@src').extract()
     if '202-51' in response.url:
         # Images on these pages need the size prefix added to be
         # reachable.
         links = [
             response.urljoin(
                 link.replace("autohomecar", "1024x0_1_q95_autohomecar"))
             for link in links
         ]
     links = [
         response.urljoin(link.replace("240x180_0_q95_c42_", ""))
         for link in links
     ]
     yield BmwItem(title=title, image_urls=links)
Example #27
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs.

     Each ``img/@src`` in the ``uibox-con`` lists has every ``"t_"``
     removed and is then resolved against the response URL.
     """
     heading = response.xpath("//div[@class='uibox']/div/text()").get()
     full_urls = []
     for raw in response.xpath(
             '//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall():
         full_urls.append(response.urljoin(raw.replace("t_", "")))
     yield BmwItem(category=heading, image_urls=full_urls)
Example #28
0
from bmw.items import BmwItem
class Bmw5Spider(scrapy.Spider):
    """Spider that starts at the autohome picture index for series 65."""

    name = 'bmw5'
    allowed_domains = ['autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        """Yield a ``BmwItem`` for every ``uibox`` div except the first.

        The category is the first link text in the block's ``uibox-title``
        div; the image URLs are the block's ``img/@src`` values resolved
        against the response URL.
        """
        uiboxs = response.xpath("//div[@class='uibox']")[1:]
        for uibox in uiboxs:
            category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
            srcs = uibox.xpath(".//ul/li/a/img/@src").getall()
            urls = [response.urljoin(src) for src in srcs]
            # Leftover merge-conflict markers were removed; the HEAD side
            # (``image_urls=``) is kept. NOTE(review): confirm BmwItem
            # declares ``image_urls`` rather than ``urls``.
            item = BmwItem(category=category, image_urls=urls)
            yield item
Example #29
0
 def parse_page(self, response):
     """Yield one ``BmwItem`` with the page's category and image URLs."""
     heading = response.xpath("//div[@class='uibox']/div/text()").get()
     thumbs = response.xpath(
         "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
     # Dropping the thumbnail's "t_" and completing the URL gives the
     # address of the high-resolution image.
     hd_urls = [response.urljoin(thumb.replace("t_", "")) for thumb in thumbs]
     yield BmwItem(category=heading, image_urls=hd_urls)