Exemple #1
0
	def parse(self, response):
		"""Parse the movie listing page.

		For each ``<li class="clearfix">`` entry, collect the cover image,
		title and description, then follow the detail-page link so that
		``parse_moive`` can add the player URL.  The already-scraped fields
		are forwarded to the detail callback through the request ``meta``.
		"""
		self.log('item page url is ==== ' + response.url)

		moivelist = response.xpath("//li[@class='clearfix']")

		for m in moivelist:
			item = ScrapyDemoItem()
			# xpath() returns a SelectorList, so index [0] picks the first
			# match and extract() serializes that node to a unicode string
			# (dropping the surrounding HTML markup).  Without [0] we would
			# store a list instead of the text itself.
			item['cover'] = m.xpath('./a/img/@src')[0].extract()
			item['title'] = m.xpath('./a/@title')[0].extract()
			item['dec'] = m.xpath("./div/div[@class='index-intro']/a/text()").extract()[0]

			# Absolute URL of the movie detail page.
			urlitem = m.xpath('./a/@href')[0].extract()
			url = response.urljoin(urlitem)

			yield scrapy.Request(url, callback=self.parse_moive, meta={
				'cover': item['cover'],
				'title': item['title'],
				'dec': item['dec'],
			})
Exemple #2
0
    def parse_item(self, response):
        """ Parse one property page into a populated item.

        The annotations below are Scrapy contracts, executed by
        ``scrapy check``: fetch @url, expect exactly 1 item, and require
        the listed fields to be scraped.

        @url http://172.28.128.1:9312/properties/index_00000.html
        @returns items 1
        @scrapes title price description address images
        @scrapes url project spider server date        
        """

        ld = ItemLoader(item=ScrapyDemoItem(), response=response)
        ld.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(str.strip, str.title))
        # re= pre-filters the raw text so only "1,234.56"-style numbers
        # reach MapCompose, which strips the commas and converts to float.
        ld.add_xpath('price',
                     '//*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
        ld.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(str.strip), Join())
        ld.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(str.strip))
        # Image srcs may be relative; resolve them against the page URL.
        ld.add_xpath(
            'images', '//*[@itemprop="image"][1]/@src',
            MapCompose(lambda i: urllib.parse.urljoin(response.url, i)))

        # add_value stores a literal value directly (no XPath extraction):
        # housekeeping fields recording where/when this item was scraped.
        ld.add_value('url', response.url)
        ld.add_value('project', self.settings.get('BOT_NAME'))
        ld.add_value('spider', self.name)
        ld.add_value('server', socket.gethostname())
        ld.add_value('date', datetime.datetime.now())

        return ld.load_item()
Exemple #3
0
    def parse(self, response):
        """Emit the raw response body as an item, then immediately
        re-request the same URL — effectively polling ``self.url`` forever.
        """
        it = ScrapyDemoItem()
        it['json'] = response.body
        yield it

        # Re-schedule the same page; dont_filter=True is required, otherwise
        # the scheduler's duplicate filter would drop the repeat request.
        yield Request(self.url, callback=self.parse, dont_filter=True)
    def parse_moive(self, response):
        """Assemble the final movie item: the listing-page fields arrive via
        ``response.meta``; the player URL is scraped from this detail page.
        """
        item = ScrapyDemoItem()
        for field in ('cover', 'title', 'dec'):
            item[field] = response.meta[field]
        item['playUrl'] = response.xpath(
            "//div[@class='p00b204e980']/p/iframe/@src")[0].extract()
        yield item
Exemple #5
0
    def parse(self, response):
        """Yield one item per teacher card, scraping name, level and info.

        (Removed an ``items`` list that was never appended to — the method
        yields items directly instead of collecting them.)
        """
        for each in response.xpath("//div[@class='li_txt']"):
            item = ScrapyDemoItem()
            # extract() returns a list of strings; [0] takes the first match.
            item['name'] = each.xpath("h3/text()").extract()[0]
            item['level'] = each.xpath("h4/text()").extract()[0]
            item['info'] = each.xpath("p/text()").extract()[0]
            yield item
Exemple #6
0
 def parse(self, response):
     """Yield a single item holding the page <title> text and every
     paragraph of the article body."""
     item = ScrapyDemoItem()
     item['title'] = response.xpath('//title/text()').get()
     item['content'] = response.xpath(
         '//div[@itemprop="articleBody"]//p/text()').getall()
     # Debug output: reuse the fields already extracted above instead of
     # evaluating the same selectors a second time (as the original did).
     print(item['title'])
     print(item['content'])
     yield item
Exemple #7
0
    def parse(self, response):
        """Collect every movie on the listing page and return them as a list.

        Returning a list (rather than yielding) is valid: a Scrapy callback
        may return any iterable of items/requests.
        """
        items = []

        moivelist = response.xpath("//li[@class='clearfix']")

        for m in moivelist:
            # Renamed from 'list', which shadowed the builtin.
            item = ScrapyDemoItem()
            item['cover'] = m.xpath('./a/img/@src')[0].extract()
            item['title'] = m.xpath('./a/@title')[0].extract()
            item['dec'] = m.xpath(
                "./div/div[@class='index-intro']/a/text()").extract()[0]
            # The play URL is derived from the <li>'s data-id attribute.
            item['playUrl'] = 'https://www.vmovier.com/' + m.xpath(
                './@data-id').extract()[0] + '?from=index_new_img'
            items.append(item)
        return items
Exemple #8
0
	def parse_moive(self, response):
		"""Combine the listing-page fields (passed through the request meta)
		with the player URL found on this detail page, then yield the item."""
		meta = response.meta
		item = ScrapyDemoItem()
		item['cover'] = meta['cover']
		item['title'] = meta['title']
		item['dec'] = meta['dec']
		src = response.xpath(".//div[@class='p00b204e980']/p/iframe/@src")
		item['playUrl'] = src[0].extract()
		yield item



		    #'''
		    #parse(response)

            #parse() is Scrapy's default callback for processing downloaded responses; it must be implemented.
		    #parse is responsible for processing the response and returning the scraped data as well as follow-up URLs.
		    #This method, like every other Request callback, must return an iterable of Request and/or Item objects.
            #'''
			
			
			
    def parse(self, response):
        """Walk the movie listing and request each detail page, forwarding
        the already-scraped fields to ``parse_moive`` via the request meta."""
        self.log('item page url is ==== ' + response.url)

        for entry in response.xpath("//li[@class='clearfix']"):
            item = ScrapyDemoItem()
            item['cover'] = entry.xpath('./a/img/@src')[0].extract()
            item['title'] = entry.xpath('./a/@title')[0].extract()
            item['dec'] = entry.xpath(
                "./div/div[@class='index-intro']/a/text()")[0].extract()

            # Resolve the (possibly relative) detail-page link.
            detail_url = response.urljoin(entry.xpath('./a/@href')[0].extract())

            # Hand the item fields over to parse_moive through meta.
            meta = {
                'cover': item['cover'],
                'title': item['title'],
                'dec': item['dec'],
            }
            yield scrapy.Request(detail_url, callback=self.parse_moive, meta=meta)
    def parse_detail(self, response):
        """Scrape name, price and size from a product detail page.

        Fixed: the first body line was indented with a tab while the rest
        used spaces, which raises ``TabError`` under Python 3.
        """
        a_item = ScrapyDemoItem()
        a_item["title"] = response.css('.ProductName-primary::text').get()
        a_item["price"] = response.css('.ProductPrice-final::text').get()
        a_item["size"] = response.css('.c-form-label-content::text').get()
        yield a_item
Exemple #11
0
 def parse(self, response):
     """Yield one item per movie in the top list, carrying its title."""
     movie_rows = response.xpath('//ul[@class="top-list  fn-clear"]/li')
     for row in movie_rows:
         item = ScrapyDemoItem()
         # Equivalent to .extract()[0]: first match, serialized to text.
         item['name'] = row.xpath('./h5/a/@title')[0].extract()
         yield item