Esempio n. 1
0
    def parse_item(self, response):
        """Parse a property page into a single populated item.

        Field values are pulled out of ``response`` with XPath expressions
        and cleaned by the processors passed to each ``add_xpath`` call.
        Housekeeping fields describing where and when the page was scraped
        are added afterwards.

        The ``@`` lines below are Scrapy spider contracts; they are read
        from this docstring, so keep them in sync with the fields loaded
        in the body.

        @url http://192.168.56.1:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date

        :param response: the downloaded page to extract fields from.
        :returns: the item built by ``ItemLoader.load_item()``.
        """
        # Bind the loader to the response so add_xpath() queries this page.
        loader = ItemLoader(item=PropertiesItem(), response=response)

        # Fields extracted from the page markup.
        loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                         MapCompose(str.strip, str.title))
        # The pattern has to accept ',' so that a thousands-separated price
        # such as "1,200.00" is captured as ONE match; the mapper then
        # removes the commas before float(). Spaces are deliberately not
        # part of the pattern: a whitespace-only match cannot be converted
        # by float().
        loader.add_xpath('price',
                         '//*[@itemprop="price"][1]/text()',
                         MapCompose(lambda text: text.replace(',', ''),
                                    float),
                         re='[,.0-9]+')
        # Collapse Windows line breaks, trim each fragment, then Join()
        # merges the fragments into a single string.
        loader.add_xpath('description',
                         '//*[@itemprop="description"][1]/text()',
                         MapCompose(lambda text: text.replace('\r\n', ' '),
                                    str.strip),
                         Join())
        loader.add_xpath('address',
                         '//*[@itemtype="http://schema.org/Place"][1]/text()')
        # Image sources may be relative; resolve them against the page URL.
        loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                         MapCompose(
                             lambda src: parse.urljoin(response.url, src)))

        # Housekeeping fields: provenance of the scraped item.
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())

        return loader.load_item()
Esempio n. 2
0
    def parse(self, response):
        """Populate a ``Garbarino1Item`` from ``response`` and return it.

        Title, price, description and image URLs are read from the page
        with XPath expressions; the remaining fields record the URL, the
        project and spider names, the host name and the current time.
        """
        def drop_commas(text):
            # Remove ',' so the captured number can be handed to float().
            return text.replace(',', '')

        def absolutize(link):
            # Resolve a possibly relative image source against the page URL.
            return urlparse.urljoin(response.url, link)

        loader = ItemLoader(item=Garbarino1Item(), response=response)

        # Fields extracted from the page markup.
        loader.add_xpath(
            'title',
            '//*[@class="gb-main-detail-title"][1]/h1/text()',
            MapCompose(unicode.strip, unicode.title)
        )
        loader.add_xpath(
            'price',
            '//*[@class="gb-main-detail-prices-current"][1]/text()',
            MapCompose(drop_commas, float),
            re='[,.0-9]+'
        )
        loader.add_xpath(
            'description',
            '/html/body/div[3]/div[1]/div[1]/h2/text()',
            MapCompose(unicode.strip),
            Join()
        )
        loader.add_xpath(
            'image_urls',
            '//*[@id="main-image"][1]/@src',
            MapCompose(absolutize)
        )

        # Housekeeping fields: provenance of the scraped item.
        housekeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field_name, field_value in housekeeping:
            loader.add_value(field_name, field_value)

        return loader.load_item()