Example #1
0
    def parse_item(self, response):
        """Build a ``BooksItem`` from a fahasa.com product page.

        The lines below that start with an at-sign are Scrapy contracts
        (checked by ``scrapy check``); keep them exactly as written.

        @url http://www.fahasa.com/luat-im-lang-mario-puzo.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date

        :param response: the downloaded product page.
        :returns: the item produced by ``ItemLoader.load_item()``.
        :raises IndexError: if the title or price XPath matches fewer
            nodes than the indexes used below require.
        """
        loader = ItemLoader(item=BooksItem(), response=response)

        # The last text node under the product heading is used as the name;
        # evaluate the XPath once and reuse it for both name fields.
        title = loader.get_xpath('//*[@class="product-name"]/h1/text()')[-1]
        loader.add_value('name', title)
        loader.add_value('name_unidecode', unidecode(title))

        # NOTE(review): the pattern only matches "digits.digits"; a value
        # with two separators (e.g. 1.250.000) would be cut to "1.250" --
        # confirm against the prices the site actually renders.
        loader.add_value(
            'price',
            loader.get_xpath('//*[@class="price"]/text()')[1].strip(),
            TakeFirst(),
            re=r'\d+\.\d+')

        # Strip markup from every matched block and drop the ones that end
        # up empty before joining the rest with newlines.
        stripped_blocks = [
            re.sub('<[^<]+?>', '', fragment)
            for fragment in loader.get_xpath('//*[@class="std"]')
        ]
        loader.add_value('description', filter(None, stripped_blocks),
                         Join('\n'))
        loader.add_xpath('image_uri', '//*[@id="image"]/@src')

        # Information fields
        # Keep only what follows a "cache:" marker when the URL has one.
        # str.find() returns -1 for a URL without the marker, so the former
        # "find(...) + 6" slice silently dropped the first five characters
        # of such a URL; partition() leaves it untouched instead.
        _, marker, target = response.url.partition('cache:')
        loader.add_value('url', target if marker else response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date', datetime.datetime.now())

        return loader.load_item()
Example #2
0
    def parse_item(self, response):
        """Populate a ``BooksItem`` from a vinabook.com product page.

        The lines below that start with an at-sign are Scrapy contracts;
        they are kept exactly as written.

        @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date

        :param response: the downloaded product page.
        :returns: the item produced by ``ItemLoader.load_item()``.
        """
        loader = ItemLoader(item=BooksItem(), response=response)

        # The last matching text node is used for both name fields.
        title = loader.get_xpath('//*[@itemprop="title"]/text()')[-1]
        loader.add_value('name', title)
        loader.add_value('name_unidecode', unidecode(title))

        loader.add_xpath(
            'price',
            '//*[contains(@id, "discounted_price")]/span/text()',
            TakeFirst())
        loader.add_xpath('author', '//*[@itemprop="author"]/text()')

        # Remove tags from each description paragraph, skip paragraphs that
        # become empty, and join what is left with newlines.
        paragraphs = loader.get_xpath('//*[@class="full-description"]/p')
        plain_text = (re.sub('<[^<]+?>', '', html) for html in paragraphs)
        loader.add_value('description', filter(None, plain_text), Join('\n'))

        loader.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

        # Information fields
        bookkeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field_name, field_value in bookkeeping:
            loader.add_value(field_name, field_value)

        return loader.load_item()
Example #3
0
    def parse_item(self, response):
        """Populate a ``BooksItem`` from a tiki.vn product page.

        The lines below that start with an at-sign are Scrapy contracts;
        they are kept exactly as written.

        @url https://tiki.vn/hieu-ng-canh-buom-p146105.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date

        :param response: the downloaded product page.
        :returns: the item produced by ``ItemLoader.load_item()``.
        """
        loader = ItemLoader(item=BooksItem(), response=response)

        # Both name fields are read from the same nodes.
        # NOTE: ``unicode`` is a Python 2 builtin, so this method does not
        # run unchanged on Python 3.
        name_xpath = '//*[@class="item-name"]/text()'
        loader.add_xpath(
            'name', name_xpath,
            MapCompose(unicode.strip, unicode.title))
        loader.add_xpath(
            'name_unidecode', name_xpath,
            MapCompose(unidecode, str.strip, str.title))

        loader.add_xpath('author', '//*[@class="item-brand"]/p/a/text()')

        # NOTE(review): the pattern only matches "digits.digits"; a value
        # with two separators (e.g. 1.250.000) would be cut to "1.250" --
        # confirm against the prices the site actually renders.
        loader.add_xpath(
            'price', '//*[@id="span-price"]/text()',
            TakeFirst(),
            re=r'\d+\.\d+')

        # Strip the markup from every introduction paragraph; paragraphs
        # that become empty are kept, then all are joined with newlines.
        intro_lines = []
        for paragraph in loader.get_xpath('//*[@id="gioi-thieu"]/p'):
            intro_lines.append(re.sub('<[^<]+?>', '', paragraph))
        loader.add_value('description', intro_lines, Join('\n'))

        loader.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

        # Information fields
        bookkeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field_name, field_value in bookkeeping:
            loader.add_value(field_name, field_value)

        return loader.load_item()
Example #4
0
    def parse_item(self, response):
        """Populate a ``BooksItem`` from a lazada.vn product page.

        The lines below that start with an at-sign are Scrapy contracts;
        they are kept exactly as written.

        @url http://www.lazada.vn/tony-buoi-sang-tren-duong-bang-1540897.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date

        :param response: the downloaded product page.
        :returns: the item produced by ``ItemLoader.load_item()``.
        """
        loader = ItemLoader(item=BooksItem(), response=response)

        # Both name fields are read from the same nodes.
        # NOTE: ``unicode`` is a Python 2 builtin, so this method does not
        # run unchanged on Python 3.
        title_xpath = '//*[@id="prod_title"]/text()'
        loader.add_xpath(
            'name', title_xpath,
            MapCompose(unicode.strip, unicode.title))
        loader.add_xpath(
            'name_unidecode', title_xpath,
            MapCompose(unidecode, str.strip, str.title))

        loader.add_xpath('price', '//*[@id="special_price_box"]/text()')

        # Only the first description block is used; its tags are removed
        # and the surrounding whitespace trimmed.
        description_html = loader.get_xpath(
            '//*[@class="product-description__block"]')[0]
        description_text = re.sub('<[^<]+?>', '', description_html).strip()
        loader.add_value('description', description_text)

        # The second ``content`` attribute (index 1) is the one stored.
        image_candidates = loader.get_xpath('//*[@itemprop="image"]/@content')
        loader.add_value('image_uri', image_candidates[1])

        # Information fields
        bookkeeping = (
            ('url', response.url),
            ('project', self.settings.get('BOT_NAME')),
            ('spider', self.name),
            ('server', socket.gethostname()),
            ('date', datetime.datetime.now()),
        )
        for field_name, field_value in bookkeeping:
            loader.add_value(field_name, field_value)

        return loader.load_item()