コード例 #1 (スコア: 0)
    def parse_product(self, response):
        """Parse a product detail page and yield a loaded Product item.

        Extracts name, price, categories, brand, SKU, identifier and
        image URL from the page markup and feeds them through
        ProductLoaderWithNameStrip.

        Raises:
            ValueError: if no brand text is found on the page.
        """
        name = response.xpath("//h1[@itemprop='name']/text()").extract()[0]
        price = response.css('span.ProductPrice::text').extract()
        # Drop the first and last breadcrumb entries (home link / current page).
        cats = response.xpath("//div[@id='ProductBreadcrumb']/ul/li//text()").extract()[1:-1]
        brand = ''.join(response.xpath("//*[@itemprop='brand']//text()").extract()).strip()
        if not brand:
            # A missing brand indicates a site layout change; fail loudly
            # with enough context to find the offending page.
            raise ValueError("Brand not found on product page: %s" % response.url)
        shipping_cost = 5  # flat-rate shipping for this site
        sku = response.xpath("//*[@itemprop='sku']/text()").extract()[0].strip()
        identifier = response.xpath("//input[@name='product_id']/@value").extract()[0]
        image_url = response.xpath("//*[@itemprop='image']/@src").extract()[0]

        loader = ProductLoaderWithNameStrip(Product(), response=response)

        loader.add_value('name', name)
        # pop() takes the last extracted price text fragment.
        loader.add_value('price', price.pop())
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('category', cats)
        loader.add_value('shipping_cost', shipping_cost)

        yield loader.load_item()
コード例 #2 (スコア: 0) — ファイル: visiondirect.py / プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Parse a product page, delegating list pages to the category
        parser and skipping discontinued products.

        Falls back from XPath selectors to BeautifulSoup for several
        fields, and pulls price/sku/product_id out of an embedded,
        escape-heavy JSON blob via regular expressions.
        """
        soup = BeautifulSoup(response.body)

        # Product list page: hand the response off to the category parser.
        products = soup.findAll('a', {'class': 'products-list__item'})
        if products:
            for r in self.parse_category(response):
                yield r
            return
        # Discontinued product: nothing to scrape, bail out silently.
        discontinued = response.xpath(
            "//div[contains(@class, 'discontinued')]")
        if not discontinued:
            # Fallback: plain-text marker anywhere in the raw body.
            discontinued = 'Discontinued Product' in response.body
        if discontinued:
            return

        name = response.xpath("//h1[@itemprop='name']/text()").extract()
        if not name:
            # XPath missed; retry the same lookup through BeautifulSoup.
            name = soup.find('h1', {'itemprop': 'name'}).text
        # Price lives in a JSON snippet with backslash-escaped markup;
        # the \\u escape before the captured digits is the currency symbol.
        price = re.findall(
            '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",',
            response.body_as_unicode())[0]
        stock = None  # stock level is not exposed on this page
        brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re(
            'by&nbsp;(.*)')
        if not brand:
            # NOTE(review): BeautifulSoup normally decodes entities, so the
            # literal 'by&nbsp;' split may never match here and the full
            # text would pass through unchanged — confirm on live pages.
            brand = soup.find('span', {
                'itemprop': 'manufacturer'
            }).text.split('by&nbsp;')[-1].strip()
        sku = re.search('"sku":"([^"]*)","product_id"',
                        response.body_as_unicode()).group(1)
        identifier = re.search('"product_id":"([^"]*)"',
                               response.body_as_unicode()).group(1)
        image_url = response.xpath("//img[@class='prod-image']/@src").extract()
        if not image_url:
            image_url = soup.find('img', {'itemprop': 'image'})['src']
        cats = []
        # Skip the first (home) and last (current product) breadcrumb items.
        for el in response.xpath("//ul[@class='gl3-breadcrumbs']/li")[1:-1]:
            cats.append(''.join(el.xpath('.//text()').extract()).strip())

        # Flat 2.98 shipping below 49; free at or above.
        shipping_cost = '2.98' if float(price) < 49 else '0'

        loader = ProductLoaderWithNameStrip(Product(), response=response)

        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('category', cats)
        loader.add_value('shipping_cost', shipping_cost)

        yield loader.load_item()
コード例 #3 (スコア: 0)
    def parse_product(self, response):
        """Parse a product detail page and yield a loaded Product item.

        Tries itemprop-based markup first, falling back to legacy table
        markup for the name and price. The identifier comes either from
        the ``details<id>.html`` URL pattern or from the ``entryid`` and
        ``priceid`` query parameters.

        Raises:
            KeyError: if neither URL pattern yields an identifier.
        """
        name = response.xpath("//h2/span[@itemprop='name']/text()").extract()
        if not name:
            # Legacy layout fallback.
            name = response.xpath("//table//tr/td//h2/text()").extract()
        name = name[0]
        # Raw strings avoid invalid-escape warnings for \d and \. in patterns.
        price = response.xpath("//span[@itemprop='price']/text()").re(r'[\d\.]+')
        if not price:
            price = response.xpath("//span[@class='pr-price']/strong/text()").re(r'[\d\.]+')
        price = price[0]
        stock = response.xpath("//*[@itemprop='availability']/@href").extract()
        if stock:
            if 'InStock' in stock[0]:
                stock = None  # in stock: exact level unknown
            else:
                stock = 0  # explicitly out of stock
        else:
            stock = None  # no availability info: treat as unknown

        # Breadcrumb links double as categories; the last one is the brand.
        # NOTE(review): raises IndexError if the breadcrumb is empty — presumably
        # every product page has one; confirm.
        cats = response.xpath("//div[@class='grid_10']/h1/a/text()").extract()
        brand = cats[-1]
        image_url = response.xpath("//img[@alt='{}']/@src".format(name)).extract()
        m = re.search(r"details(.*)\.html", response.url)
        if m:
            identifier = m.group(1)
        else:
            entryid = url_query_parameter(response.url, 'entryid')
            priceid = url_query_parameter(response.url, 'priceid')
            if not entryid or not priceid:
                raise KeyError("Not found entryid and priceid in url: {}".format(response.url))
            identifier = entryid + priceid
        sku = identifier

        loader = ProductLoaderWithNameStrip(Product(), response=response)

        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('category', cats)

        yield loader.load_item()
コード例 #4 (スコア: 0)
    def parse_product(self, response):
        """Parse a product page in either the table layout or the
        drop-down-options layout, yielding one item per variant.

        Retries the request when the product container is missing; logs an
        error when retrying is no longer possible.
        """

        def _shipping_cost(real_price):
            # Shipping price bands: <15 -> 3, <40 -> 4, <130 -> 7,
            # otherwise unknown/free (None). Previously duplicated in
            # both layout branches.
            if real_price < 15:
                return 3
            if real_price < 40:
                return 4
            if real_price < 130:
                return 7
            return None

        soup = BeautifulSoup(response.body)
        if not soup.find('div', attrs={'class': 'product'}):
            retry_request = _retry_page(response)
            if retry_request:
                yield retry_request
            else:
                self.log(
                    "Error parsing page, couldn't extract product name: %s" %
                    response.url)
            return
        main_name = soup.find('div', attrs={'class': 'product'}).h1.text
        main_name = remove_entities(main_name)
        # The brand value sits in the <td> following the one labelled "brand".
        brand_el = soup.find(
            lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
        brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
        # Breadcrumb spans, skipping the first two (home / top-level) entries.
        cat_names = [
            span.a.text
            for span in soup.find('div', attrs={
                'class': 'breadcrumbtrail'
            }).span.findAll('span') if span.a
        ][2:]
        image_url = soup.find('img', {'itemprop': 'image'})
        image_url = image_url['src'] if image_url else None

        table = soup.find('table', id='responsive-table')
        options = soup.findAll('div', attrs={'class': 'option'})
        if table:
            # Table layout: one row per purchasable variant.
            for row in table.findAll('tr'):
                # Skip head row
                if not row.td:
                    continue

                name = row.find('span', attrs={'class': 'name'}).text
                name = remove_entities(name)
                if not _main_name_in_opt_name(main_name, name):
                    name = main_name + ' ' + name
                identifier = row.find('span', attrs={'class': 'codenumber'})
                if not identifier:
                    self.errors.append(
                        "Identifier not found for products on page: %s" %
                        response.url)
                    continue
                identifier = identifier.text

                price = row.find(_is_price_tag).text
                shipping_cost = _shipping_cost(extract_price(price))

                loader = ProductLoaderWithNameStrip(Product(),
                                                    response=response)
                loader.add_value('name', name)
                loader.add_value('url', response.url)
                loader.add_value('brand', brand)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', identifier)
                loader.add_value('price', price)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('image_url', image_url)

                yield loader.load_item()
        elif options:
            # Options layout: one item per combination of option values.
            main_id = response.url.split('.')[-2].split('p-')[-1]
            price = soup.find('span', attrs={'class': 'inctax'}).span.text
            shipping_cost = _shipping_cost(extract_price(price))

            results = {}
            for opt in options:
                opt_name = opt.label.span.text
                results[opt_name] = []
                for subopt in opt.select.findAll('option'):
                    subopt_name = subopt.text
                    subopt_value = _soup_el_get_attr(subopt, 'value')
                    if subopt_value == '0':
                        # value="0" is the "please select" placeholder.
                        continue
                    results[opt_name].append({
                        'id':
                        remove_entities(subopt_name).replace('"', ''),
                        'name':
                        opt_name + ': ' + subopt_name
                    })
            # Cartesian product of every option group's values.
            for opt_tuple in product(*results.values()):
                name = _build_opt_name(main_name, opt_tuple)
                identifier = _build_opt_id(main_id, opt_tuple)
                loader = ProductLoaderWithNameStrip(Product(),
                                                    response=response)
                loader.add_value('name', name)
                loader.add_value('url', response.url)
                loader.add_value('brand', brand)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', identifier)
                loader.add_value('price', price)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('image_url', image_url)

                yield loader.load_item()
コード例 #5 (スコア: 0) — ファイル: uktoolcentre.py / プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Parse a product page, yielding one item per sku option (or a
        single item when there is no sku table).

        If the expected markup is missing, the page is requested again up
        to 10 times; after that the error is recorded and re-raised.
        """
        try:
            # fall back to Beautiful Soup
            soup = BeautifulSoup(response.body)
            hxs = HtmlXPathSelector(response)

            container = soup.find('div', attrs={'class': 'nosto_product'})

            brand = container.find('span', attrs={'class': 'brand'}).text
            # Breadcrumb anchors minus the leading "home" link.
            cat_names = [el.text for el in soup.find("div", id='bct').findAll('a')][1:]
            main_id = container.find('span', attrs={'class': 'product_id'}).text
            availability = container.find('span', attrs={'class': 'availability'}).text
            image_url = soup.find('img', id='main-image').attrMap['src']

            options = soup.find('table', id='sku-table')
            if not options:
                # Single-sku product: one item from the main info block.
                name = soup.find('div', id='product-page-info').find('h1').text
                price = container.find('span', attrs={'class': 'price'}).text

                loader = ProductLoaderWithNameStrip(Product(), selector=hxs)
                loader.add_value('brand', brand)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('name', name)
                loader.add_value('identifier', main_id)
                loader.add_value('price', price)
                loader.add_value('url', response.url)
                loader.add_value('sku', main_id)
                loader.add_value('image_url', image_url)

                if availability.lower() == 'outofstock':
                    loader.add_value('stock', 0)

                yield loader.load_item()
            else:
                option_ids = []
                for opt in options.findAll('tr'):
                    # Secondary id appears in parentheses inside the name cell.
                    sec_id = opt.findAll('td')[1].find('small').text
                    name = opt.findAll('td')[1].text.replace(sec_id, '')
                    sec_id = sec_id.strip('(').strip(')')
                    identifier = main_id + ':' + sec_id
                    # Voltage / pack-size variants can share a sec_id; append
                    # them to keep identifiers unique.
                    volts = get_volts_from_name(name)
                    if volts is not None:
                        identifier = identifier + ':' + volts
                    pack_of = get_pack_of_from_name(name)
                    if pack_of is not None:
                        identifier = identifier + ':' + pack_of

                    if identifier in option_ids:
                        # Still colliding: disambiguate with the form's ID field.
                        option_id = opt.find('input', attrs={'name': 'ID'}).get('value')
                        identifier = identifier + ':' + option_id

                    option_ids.append(identifier)

                    # NOTE(review): str.strip removes a *set* of characters,
                    # not a substring — strip('&pound;') trims any of
                    # '&', 'p', 'o', 'u', 'n', 'd', ';' from the ends.
                    # Confirm price cells never legitimately start/end with those.
                    price = opt.find('td', attrs={'class': 'price'}).text.strip(u'\xa3').strip('&pound;')

                    loader = ProductLoaderWithNameStrip(Product(), response=response)
                    loader.add_value('brand', brand)
                    for cat_name in cat_names:
                        loader.add_value('category', cat_name)
                    loader.add_value('name', name)
                    loader.add_value('identifier', identifier)
                    loader.add_value('price', price)
                    loader.add_value('url', response.url)
                    loader.add_value('sku', main_id)
                    loader.add_value('image_url', image_url)

                    if availability.lower() == 'outofstock':
                        loader.add_value('stock', 0)

                    yield loader.load_item()

        except (IndexError, AttributeError) as e:
            # Missing markup surfaces as IndexError (list indexing) or as
            # AttributeError (.text/.find on a None result from a soup
            # lookup); the latter previously escaped the retry logic.
            # try loading page again
            tries = response.meta.get('try', 0)
            if tries < 10:
                yield Request(response.url, callback=self.parse_product, dont_filter=True, meta={'try': tries + 1})
            else:
                self.errors.append("Error scraping page %s: %s" % (response.url, str(e)))
                raise