def parse(self, response):
    """Parse the movie index page and follow each entry's detail link.

    For every <li class="clearfix"> entry, collect cover/title/description
    and schedule a request for the detail page, forwarding those fields
    to parse_moive via the request meta dict.
    """
    self.log('item page url is ==== ' + response.url)
    for entry in response.xpath("//li[@class='clearfix']"):
        item = ScrapyDemoItem()
        # xpath() returns a SelectorList; [0] picks the first match and
        # extract() serializes the node to a unicode string (tags stripped).
        item['cover'] = entry.xpath('./a/img/@src')[0].extract()
        item['title'] = entry.xpath('./a/@title')[0].extract()
        item['dec'] = entry.xpath("./div/div[@class='index-intro']/a/text()").extract()[0]
        # Resolve the (possibly relative) detail-page URL against the response.
        detail_url = response.urljoin(entry.xpath('./a/@href')[0].extract())
        yield scrapy.Request(
            detail_url,
            callback=self.parse_moive,
            meta={
                'cover': item['cover'],
                'title': item['title'],
                'dec': item['dec'],
            },
        )
def parse_item(self, response):
    """ This function parse_item a property page.
    @url http://172.28.128.1:9312/properties/index_00000.html
    @returns items 1
    @scrapes title price description address images
    @scrapes url project spider server date
    """
    # NOTE: the docstring above is a Scrapy contract and must stay intact.
    loader = ItemLoader(item=ScrapyDemoItem(), response=response)

    # Fields scraped from the page markup.
    loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(str.strip))
    loader.add_xpath('images', '//*[@itemprop="image"][1]/@src',
                     MapCompose(lambda i: urllib.parse.urljoin(response.url, i)))

    # Housekeeping fields added as single values (add_value).
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())

    return loader.load_item()
def parse(self, response):
    """Store the raw response body as JSON, then re-request the same URL.

    dont_filter=True bypasses the duplicate-request filter so the spider
    keeps polling self.url indefinitely.
    """
    item = ScrapyDemoItem()
    item['json'] = response.body
    yield item
    yield Request(self.url, callback=self.parse, dont_filter=True)
def parse_moive(self, response):
    """Finish building a movie item on its detail page.

    cover/title/dec arrive through request meta from the index-page
    callback; only the playable video URL is scraped here.
    """
    item = ScrapyDemoItem()
    for field in ('cover', 'title', 'dec'):
        item[field] = response.meta[field]
    item['playUrl'] = response.xpath(
        "//div[@class='p00b204e980']/p/iframe/@src")[0].extract()
    yield item
def parse(self, response):
    """Yield one item per teacher block on the listing page.

    Each <div class="li_txt"> holds a name (h3), level (h4) and info (p).

    Fix: removed the dead ``items`` accumulator — it was never appended to
    (the append was commented out) and never returned; the function is a
    pure generator.
    """
    for block in response.xpath("//div[@class='li_txt']"):
        item = ScrapyDemoItem()
        # extract() returns a list of matched text nodes; take the first.
        item['name'] = block.xpath("h3/text()").extract()[0]
        item['level'] = block.xpath("h4/text()").extract()[0]
        item['info'] = block.xpath("p/text()").extract()[0]
        yield item
def parse(self, response):
    """Scrape the page title and article-body paragraph texts.

    Fix: the original executed each XPath query twice — once to fill the
    item and once more into throwaway debug locals. Query once and reuse
    the item's fields for the debug output (stdout is unchanged).
    """
    item = ScrapyDemoItem()
    item['title'] = response.xpath('//title/text()').get()
    item['content'] = response.xpath(
        '//div[@itemprop="articleBody"]//p/text()').getall()
    # Debug output preserved from the original implementation.
    print(item['title'])
    print(item['content'])
    yield item
def parse(self, response):
    """Collect movie items from the index page and return them as a list.

    Fix: the original bound each item to the name ``list``, shadowing the
    builtin; renamed the item to ``item`` and the accumulator to ``items``.
    Behavior and the returned value are unchanged.
    """
    items = []
    #self.log('item page url is ==== ' + response.url)
    for entry in response.xpath("//li[@class='clearfix']"):
        item = ScrapyDemoItem()
        item['cover'] = entry.xpath('./a/img/@src')[0].extract()
        item['title'] = entry.xpath('./a/@title')[0].extract()
        item['dec'] = entry.xpath(
            "./div/div[@class='index-intro']/a/text()").extract()[0]
        # Build the play URL from the <li> element's data-id attribute.
        item['playUrl'] = ('https://www.vmovier.com/'
                           + entry.xpath('./@data-id').extract()[0]
                           + '?from=index_new_img')
        items.append(item)
    return items
def parse_moive(self, response):
    """Assemble the final movie item on the detail page.

    Note: parse() is Scrapy's default callback for downloaded responses;
    a callback must return an iterable of Requests and/or Items. Here the
    index-page fields travel in through response.meta and only the play
    URL is scraped from this page.
    """
    meta = response.meta
    item = ScrapyDemoItem()
    item['cover'] = meta['cover']
    item['title'] = meta['title']
    item['dec'] = meta['dec']
    iframe_src = response.xpath(".//div[@class='p00b204e980']/p/iframe/@src")
    item['playUrl'] = iframe_src[0].extract()
    yield item
def parse(self, response):
    """Walk the movie list and schedule one detail request per entry.

    The scraped fields are handed to parse_moive through the request
    meta dict so the detail callback can finish building the item.
    """
    self.log('item page url is ==== ' + response.url)
    for sel in response.xpath("//li[@class='clearfix']"):
        item = ScrapyDemoItem()
        item['cover'] = sel.xpath('./a/img/@src')[0].extract()
        item['title'] = sel.xpath('./a/@title')[0].extract()
        item['dec'] = sel.xpath(
            "./div/div[@class='index-intro']/a/text()")[0].extract()
        # Resolve the relative detail-page link and carry the fields along.
        relative = sel.xpath('./a/@href')[0].extract()
        carried = {'cover': item['cover'],
                   'title': item['title'],
                   'dec': item['dec']}
        yield scrapy.Request(response.urljoin(relative),
                             callback=self.parse_moive,
                             meta=carried)
def parse_detail(self, response):
    """Extract title, price and size from a product detail page."""
    selectors = {
        'title': '.ProductName-primary::text',
        'price': '.ProductPrice-final::text',
        'size': '.c-form-label-content::text',
    }
    a_item = ScrapyDemoItem()
    # .get() returns the first matched text node, or None if absent.
    for field, css in selectors.items():
        a_item[field] = response.css(css).get()
    yield a_item
def parse(self, response):
    """Yield an item carrying the title of each movie in the top list."""
    for movie_sel in response.xpath('//ul[@class="top-list fn-clear"]/li'):
        item = ScrapyDemoItem()
        # First @title attribute of the <a> inside this entry's <h5>.
        item['name'] = movie_sel.xpath('./h5/a/@title')[0].extract()
        yield item