コード例 #1
0
    def parse_profile(self, response):
        """Complete the item carried in ``response.meta`` from a profile page.

        The first text node found under a ``mailto`` link that contains an
        "@" is taken as the e-mail address. If no such node exists the
        profile is dropped and ``None`` is returned. Otherwise the name,
        e-mail, profile URL (stored as ``reviewer``) and ``self.COUNTRY``
        are handed to the project helpers ``cond_set`` / ``cond_set_value``
        and the item is returned.

        NOTE(review): ``string.strip`` only exists in Python 2.
        """
        item = response.meta.get('item')

        mail_texts = response.xpath(
            '//a[contains(@href,"mailto")]/span/text()[contains(., "@")]'
        ).extract()
        if not mail_texts:
            # Without an e-mail address the profile is of no use.
            return None
        address = mail_texts[0].strip()

        name_texts = response.xpath(
            "//div[@class='profile-info']/div/div/span/text()"
        ).extract()
        cond_set(item, 'name', name_texts, string.strip)

        cond_set_value(item, 'email', address)
        cond_set_value(item, 'reviewer', response.url)
        cond_set_value(item, 'country', self.COUNTRY)

        return item
コード例 #2
0
    def parse_profile(self, response):
        """Populate the pending item with data from a reviewer profile page.

        The item is read from ``response.meta['item']``. The e-mail is the
        stripped first text node under a ``mailto`` link; when the page has
        none, ``None`` is returned and nothing is set. Otherwise the name
        candidates, e-mail, ``response.url`` (as ``reviewer``) and
        ``self.COUNTRY`` are passed to the project helpers ``cond_set`` /
        ``cond_set_value`` and the item is returned.

        NOTE(review): ``string.strip`` only exists in Python 2.
        """
        item = response.meta.get('item')

        candidates = response.xpath(
            '//a[contains(@href,"mailto")]/span/text()'
        ).extract()
        if not candidates:
            # No mailto text on the page: skip this profile entirely.
            return None
        contact = candidates[0].strip()

        display_names = response.xpath(
            "//div[@class='profile-info']/div/div/span/text()"
        ).extract()
        cond_set(item, 'name', display_names, string.strip)

        cond_set_value(item, 'email', contact)
        cond_set_value(item, 'reviewer', response.url)
        cond_set_value(item, 'country', self.COUNTRY)

        return item
コード例 #3
0
    def _scrape_product_links(self, response):
        """Yield one ``ProductItem`` per non-sponsored search result.

        Every ``<li class="s-result-item">`` in ``response`` is scraped for
        title, image, brand, price, ASIN, Prime flag, shipping price,
        new/used offers, rating, review count, category and item count.
        The page-wide brand refinement list is attached to each item as
        ``all_brands``.

        Robustness rules:

        * the offer count is only read when a second matching text node
          exists, so a result with a single node no longer raises
          ``IndexError``;
        * rating and review count are looked up only when the item ended up
          with an ``asin``, so a result without ``data-asin`` no longer
          raises ``KeyError`` and aborts the remaining results.

        ``cond_set`` / ``cond_set_value`` / ``ProductItem`` are project
        names defined outside this method.

        :param response: search results response; ``response.meta`` may
            hold ``total_matches``, used as fallback for ``number_of_items``.
        """
        products = response.xpath('//li[@class="s-result-item"]')

        # The brand refinement list belongs to the page, not to a single
        # result, so it is queried once instead of once per product.
        all_brands = response.xpath(
            '//h2[text()="Brand"]/following::ul[1]/'
            'li[@class="refinementImage"]/a/span/text()').extract()

        for pr in products:
            # Sponsored placements are not regular results; skip them.
            if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")] |'
                        './/h5[contains(text(), "Sponsored")]'):
                continue
            product = ProductItem()

            cond_set(product, 'title', pr.xpath('.//h2/../@title').extract())

            cond_set(product, 'product_image',
                     pr.xpath('.//img[@alt="Product Details"]/@src').extract())

            cond_set(
                product, 'brand',
                pr.xpath('.//div[@class="a-fixed-left-grid-col a-col-right"]'
                         '/div/div/span[2]/text() |'
                         './/div[@class="a-row a-spacing-mini"]/span[2]/text()'
                         ).extract())

            cond_set(
                product, 'price',
                pr.xpath(
                    './/span[contains(@class,"s-price")]/text()').extract())

            cond_set(product, 'asin', pr.xpath('@data-asin').extract())

            cond_set_value(
                product, 'prime',
                bool(pr.xpath('.//i[contains(@class, "a-icon-prime")]')))

            # NOTE(review): the unescaped "." matches any character and the
            # pattern needs at least two digits; the pattern is kept as is
            # so the scraped values do not change.
            cond_set(
                product, 'shipping_price',
                pr.xpath(
                    './/span[contains(@class,"s-price")]/'
                    'following::span[2]/text()').re(r'(\d+.?\d+) shipping'))

            for label, price_field, offers_field in (
                    ('new', 'new_price', 'new_offers'),
                    ('used', 'used_price', 'used_offers')):
                offers = pr.xpath(
                    './/a[contains(text(),"%s")]/span/text()' % label)
                if not offers:
                    continue
                cond_set(product, price_field, offers.extract())
                # The offer count is read from the second text node, which
                # is not always present.
                if len(offers) > 1:
                    cond_set(product, offers_field, offers[1].re(r'\d+'))

            # Rating widgets are located through the ASIN; without one there
            # is nothing to look up.
            try:
                asin = product['asin']
            except KeyError:
                asin = None

            if asin:
                by_asin = './/span[contains(@name,"' + asin + '")]'
                cond_set(
                    product, 'rating',
                    pr.xpath(by_asin + '/span/a/i/span').re(r'(\d+.?\d+)'))
                cond_set(
                    product, 'number_of_reviews',
                    pr.xpath(by_asin + '/following::a[1]/text()').re(
                        r'([\d+,?]+\d+)'))

            category = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/text()').re(r'(.*):')

            if not category:
                # Fall back to the category shown in the page-level backlink.
                category = response.xpath(
                    '//div[@id="autoscoping-backlink"]/div/span/span/text()'
                ).extract()

            cond_set(product, 'category', category)

            number_of_items = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/../text()').re(
                    r'([\d+,?]+\d+)')

            if number_of_items:
                cond_set_value(product, 'number_of_items', number_of_items[0])
            else:
                cond_set_value(product, 'number_of_items',
                               response.meta.get('total_matches'))

            # Each item gets its own list so later mutation of one item
            # cannot leak into the others.
            product['all_brands'] = list(all_brands)

            yield product
コード例 #4
0
ファイル: amazon.py プロジェクト: Ksynko/amazon_spider
    def _scrape_product_links(self, response):
        """Yield one ``ProductItem`` per non-sponsored search result.

        Every ``<li class="s-result-item">`` in ``response`` is scraped for
        title, image, brand, price, ASIN, Prime flag, shipping price,
        new/used offers, rating, review count, category and item count.
        The product URL (``.//h2/../@href``) is currently not scraped.

        Robustness rules:

        * the offer count is only read when a second matching text node
          exists, so a result with a single node no longer raises
          ``IndexError``;
        * rating and review count are looked up only when the item ended up
          with an ``asin``, so a result without ``data-asin`` no longer
          raises ``KeyError`` and aborts the remaining results.

        ``cond_set`` / ``cond_set_value`` / ``ProductItem`` are project
        names defined outside this method.

        :param response: search results response to scrape.
        """
        products = response.xpath('//li[@class="s-result-item"]')

        for pr in products:
            # Sponsored placements are not regular results; skip them.
            if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")]'):
                continue
            product = ProductItem()

            cond_set(product, 'title',
                     pr.xpath('.//h2/../@title').extract())

            cond_set(product, 'product_image',
                     pr.xpath('.//img[@alt="Product Details"]/@src').extract())

            cond_set(product, 'brand',
                     pr.xpath(
                         './/div[@class="a-fixed-left-grid-col a-col-right"]'
                         '/div/div/span[2]/text()').extract())

            cond_set(product, 'price',
                     pr.xpath(
                         './/span[contains(@class,"s-price")]/text()'
                     ).extract())

            cond_set(product, 'asin', pr.xpath('@data-asin').extract())

            cond_set_value(
                product, 'prime',
                bool(pr.xpath('.//i[contains(@class, "a-icon-prime")]')))

            # NOTE(review): the unescaped "." matches any character and the
            # pattern needs at least two digits; the pattern is kept as is
            # so the scraped values do not change.
            cond_set(product, 'shipping_price', pr.xpath(
                './/span[contains(@class,"s-price")]/'
                'following::span[2]/text()').re(r'(\d+.?\d+) shipping'))

            for label, price_field, offers_field in (
                    ('new', 'new_price', 'new_offers'),
                    ('used', 'used_price', 'used_offers')):
                offers = pr.xpath(
                    './/a[contains(text(),"%s")]/span/text()' % label)
                if not offers:
                    continue
                cond_set(product, price_field, offers.extract())
                # The offer count is read from the second text node, which
                # is not always present.
                if len(offers) > 1:
                    cond_set(product, offers_field, offers[1].re(r'\d+'))

            # Rating widgets are located through the ASIN; without one there
            # is nothing to look up.
            try:
                asin = product['asin']
            except KeyError:
                asin = None

            if asin:
                by_asin = './/span[contains(@name,"' + asin + '")]'
                cond_set(product, 'rating', pr.xpath(
                    by_asin + '/span/a/i/span'
                ).re(r'(\d+.?\d+)'))

                cond_set(product, 'number_of_reviews', pr.xpath(
                    by_asin + '/following::a[1]/text()'
                ).re(r'([\d+,?]+\d+)'))

            cond_set(product, 'category', pr.xpath(
                './/span[contains(@class,"a-text-bold")]/text()'
            ).re(r'(.*):'))

            number_of_items = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/../text()'
            ).re(r'([\d+,?]+\d+)')

            if number_of_items:
                cond_set_value(product, 'number_of_items', number_of_items[0])

            yield product