Python BeautifulSoup.findAll Examples, product_spiders.spiders.BeautifulSoup.BeautifulSoup.findAll Python Examples

Example #1

0

Show file

File: rvpartscenter.py Project: 0--key/lib

    def parse_product(self, response):
        """Yield the products on the page whose manufacturer id matches.

        Product cells are located through their ``ProductDetail`` links. A
        product is emitted only when the text of its first ``<nobr>`` element
        equals ``response.meta['mfrgid']`` (both stripped and lower-cased).
        Cells lacking the expected name or id markup are skipped so that one
        malformed cell does not abort the rest of the page.

        Reads ``response.meta['mfrgid']`` and ``response.meta['sku']``.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body)
        detail_href = re.compile('ProductDetail')
        price_text = re.compile(r'\$')

        # The product cell is the grandparent of its detail link; the set
        # collapses cells that contain several such links.
        cells = {link.parent.parent
                 for link in soup.findAll('a', href=detail_href)}

        for cell in cells:
            fonts = cell.findAll('font')
            id_tag = cell.find('nobr')
            # The name lives in the second <font>, the id in the first <nobr>.
            if len(fonts) < 2 or id_tag is None:
                continue

            site_mfrgid = id_tag.text
            if not site_mfrgid:
                continue
            mfrgid = response.meta['mfrgid']
            if site_mfrgid.strip().lower() != mfrgid.strip().lower():
                continue

            link = cell.find('a', href=detail_href)
            if link:
                url = urljoin_rfc(get_base_url(response), link['href'])
            else:
                url = response.url

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', fonts[1].text)
            product_loader.add_value('price',
                                     cell.find('nobr', text=price_text))
            # The url is added once; the original added the same value twice.
            product_loader.add_value('url', url)
            product_loader.add_value('sku', response.meta['sku'])
            yield product_loader.load_item()

Example #2

0

Show file

File: rvpartscenter.py Project: ontiyonke/lib

    def parse_product(self, response):
        """Yield the products on the page whose manufacturer id matches.

        Product cells are located through their ``ProductDetail`` links. A
        product is emitted only when the text of its first ``<nobr>`` element
        equals ``response.meta['mfrgid']`` (both stripped and lower-cased).
        Cells lacking the expected name or id markup are skipped so that one
        malformed cell does not abort the rest of the page.

        Reads ``response.meta['mfrgid']`` and ``response.meta['sku']``.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body)
        detail_href = re.compile('ProductDetail')
        price_text = re.compile(r'\$')

        # The product cell is the grandparent of its detail link; the set
        # collapses cells that contain several such links.
        cells = {link.parent.parent
                 for link in soup.findAll('a', href=detail_href)}

        for cell in cells:
            fonts = cell.findAll('font')
            id_tag = cell.find('nobr')
            # The name lives in the second <font>, the id in the first <nobr>.
            if len(fonts) < 2 or id_tag is None:
                continue

            site_mfrgid = id_tag.text
            if not site_mfrgid:
                continue
            mfrgid = response.meta['mfrgid']
            if site_mfrgid.strip().lower() != mfrgid.strip().lower():
                continue

            link = cell.find('a', href=detail_href)
            if link:
                url = urljoin_rfc(get_base_url(response), link['href'])
            else:
                url = response.url

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', fonts[1].text)
            product_loader.add_value('price',
                                     cell.find('nobr', text=price_text))
            # The url is added once; the original added the same value twice.
            product_loader.add_value('url', url)
            product_loader.add_value('sku', response.meta['sku'])
            yield product_loader.load_item()

Example #3

0

Show file

File: amazon_spider_us.py Project: oceancloud82/scraping

    def parse(self, response):
        """Follow pagination and request the option page of every result.

        The page is inspected twice, once through BeautifulSoup and once
        through XPath selectors, and requests are yielded from both passes.
        Every request carries ``response.meta`` forward; product requests are
        handled by ``self.parse_options``.
        """
        base_url = get_base_url(response)
        soup = BeautifulSoup(response.body)
        hxs = HtmlXPathSelector(response)

        next_link = soup.find('a', 'pagnNext')
        if next_link:
            yield Request(urljoin_rfc(base_url, next_link['href']),
                          meta=response.meta)

        next_hrefs = hxs.select('//a[@id="pagnNextLink"]/@href').extract()
        if next_hrefs:
            yield Request(urljoin_rfc(base_url, next_hrefs[0]),
                          meta=response.meta)

        for product in soup.findAll('div', id=re.compile(u'^result_.*')):
            heading = product.find('h3', 'newaps')
            link = heading.find('a') if heading else None
            if link:
                yield Request(urljoin_rfc(base_url, link['href']),
                              meta=response.meta,
                              callback=self.parse_options)

        for result in hxs.select(
                u'//div[@id="atfResults" or @id="btfResults"]//div[starts-with(@id, "result_")]'
        ):
            hrefs = result.select(u'.//h3/a/@href').extract()
            # Results without a title link are skipped; an explicit check
            # replaces the former bare ``except`` around the ``[0]`` lookup.
            if not hrefs:
                continue
            # Joined with the base URL like the soup pass above, so that a
            # relative href still yields a valid request URL.
            yield Request(urljoin_rfc(base_url, hrefs[0]),
                          meta=response.meta,
                          callback=self.parse_options)

Example #4

0

Show file

File: shoemetro_spider.py Project: oceancloud82/scraping

 def parse_items(self, response):
     """Request every product linked from a listing page.

     When the page number shown in the document differs from
     ``response.meta['cur']`` and fewer than five attempts have been made,
     the same URL is requested again (with an ``&at=<attempt>`` suffix and
     the duplicate filter disabled) and nothing else is yielded.
     """
     base_url = get_base_url(response)
     hxs = HtmlXPathSelector(response)
     cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
     if cur_page and (int(cur_page[0]) != response.meta['cur']) and (
             response.meta['attempt'] < 5):
         log.msg('WRONG PAGE! ONE MORE ATTEMPT to ' + response.url)
         yield Request(response.url + '&at=' +
                       str(response.meta['attempt']),
                       meta={
                           'cur': response.meta['cur'],
                           'attempt': response.meta['attempt'] + 1
                       },
                       dont_filter=True,
                       callback=self.parse_items)
         return
     soup = BeautifulSoup(response.body)
     # Product links are anchors that wrap a <b> and sit inside a
     # two-column-wide cell; anchors without an href are ignored so the
     # lookup below cannot raise KeyError.
     products = [
         a['href'] for a in
         soup.findAll(lambda tag: tag.name == 'a' and tag.get('href') and
                      tag.findChild('b') and
                      tag.findParent('td', {'colspan': 2}))
     ]
     for url in products:
         # Reuse the base URL computed above instead of recomputing it.
         url = urljoin_rfc(base_url, url)
         yield Request(url, callback=self.parse_product)
     """trs = hxs.select('//div[@id="mainContent"]//table[@style="height:100%"]/tr')

Example #5

0

Show file

    def parse_brand(self, response):
        """Request every product and every further page of a brand listing.

        If the page reports "Viewing 0", a retry request from
        ``self.retry`` is yielded (when one is returned) and parsing stops.
        Otherwise products and pagination links are collected twice, once via
        XPath and once via BeautifulSoup. Each product request carries a copy
        of ``response.meta`` extended with a ``promotions`` entry (the alt
        text of the discount badge, or ``''``).
        """
        hxs = HtmlXPathSelector(response)
        # if nothing found try to reload page
        if hxs.select('//div[@class="detailPageTitle"][text()="Viewing 0"]'):
            req = self.retry(response)
            if req:
                yield req
            return

        soup = BeautifulSoup(response.body)
        base_url = get_base_url(response)

        for p in hxs.select('//ul[@class="stockthumbwrapper"]'):
            url = p.xpath(
                './/li[@class="productThumbName"]/a/@href')[0].extract()
            meta = response.meta.copy()
            promo = p.xpath(
                './/li[@class="productThumbImage"]//img[contains(@class,"cornerImgFormat2 discount")]/@alt'
            ).extract()
            meta['promotions'] = promo[0] if promo else ''
            # Send the copy that holds ``promotions``; the original passed
            # ``response.meta`` here and silently dropped the promotion.
            yield Request(urljoin(base_url, url),
                          callback=self.parse_product,
                          meta=meta)

        for p in soup.findAll('ul', 'stockthumbwrapper'):
            url = p.find('li', 'productThumbName').find('a')['href']
            meta = response.meta.copy()
            promo = p.find('li', 'productThumbImage').find(
                'img',
                attrs={'class': re.compile('cornerImgFormat2 discount')})
            meta['promotions'] = promo['alt'] if promo else ''
            yield Request(urljoin(base_url, url),
                          callback=self.parse_product,
                          meta=meta)

        pages = soup.findAll('div', id='pagenumber')
        if pages:
            # Join the href strings, not the Tag objects, and de-duplicate on
            # the href the same way the XPath pass below does.
            page_hrefs = {a['href'] for a in pages[0].findAll('a', href=True)}
            for href in page_hrefs:
                yield Request(response.urljoin(href),
                              meta=response.meta,
                              callback=self.parse_brand)

        for page in set(
                hxs.select('//div[@id="pagenumber"][1]/a/@href').extract()):
            yield Request(response.urljoin(page),
                          meta=response.meta,
                          callback=self.parse_brand)

Example #6

0

Show file

File: visiondirect.py Project: oceancloud82/scraping

    def parse_product(self, response):
        """Build a single product item from a product detail page.

        Pages that turn out to be product listings are handed to
        ``self.parse_category``; discontinued products yield nothing. Fields
        are read via XPath first, falling back to BeautifulSoup when the
        XPath lookup returns nothing, and price/sku/identifier are pulled
        from the page text with regular expressions.
        """
        soup = BeautifulSoup(response.body)

        # A listing page: delegate to the category parser and stop.
        if soup.findAll('a', {'class': 'products-list__item'}):
            for request in self.parse_category(response):
                yield request
            return

        # A discontinued product: nothing to emit.
        if (response.xpath("//div[contains(@class, 'discontinued')]")
                or 'Discontinued Product' in response.body):
            return

        name = response.xpath("//h1[@itemprop='name']/text()").extract()
        if not name:
            name = soup.find('h1', {'itemprop': 'name'}).text

        price_matches = re.findall(
            '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",',
            response.body_as_unicode())
        price = price_matches[0]

        brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re(
            'by&nbsp;(.*)')
        if not brand:
            manufacturer = soup.find('span', {'itemprop': 'manufacturer'})
            brand = manufacturer.text.split('by&nbsp;')[-1].strip()

        sku_match = re.search('"sku":"([^"]*)","product_id"',
                              response.body_as_unicode())
        sku = sku_match.group(1)
        id_match = re.search('"product_id":"([^"]*)"',
                             response.body_as_unicode())
        identifier = id_match.group(1)

        image_url = response.xpath("//img[@class='prod-image']/@src").extract()
        if not image_url:
            image_url = soup.find('img', {'itemprop': 'image'})['src']

        # Breadcrumb entries without the first and the last one.
        crumbs = response.xpath("//ul[@class='gl3-breadcrumbs']/li")[1:-1]
        cats = [''.join(crumb.xpath('.//text()').extract()).strip()
                for crumb in crumbs]

        # Orders below 49 carry a flat shipping fee, larger ones ship free.
        if float(price) < 49:
            shipping_cost = '2.98'
        else:
            shipping_cost = '0'

        loader = ProductLoaderWithNameStrip(Product(), response=response)
        field_values = (
            ('name', name),
            ('price', price),
            ('stock', None),
            ('url', response.url),
            ('brand', brand),
            ('sku', sku),
            ('identifier', identifier),
            ('image_url', image_url),
            ('category', cats),
            ('shipping_cost', shipping_cost),
        )
        for field, value in field_values:
            loader.add_value(field, value)

        yield loader.load_item()

Example #7

0

Show file

File: visiondirect.py Project: oceancloud82/scraping

    def parse_category(self, response):
        """Request each product linked from a category page.

        When the page lists no products but its text contains a
        ``"product_id"`` entry, it is treated as a product page and passed to
        ``self.parse_product`` instead.
        """
        soup = BeautifulSoup(response.body)
        product_links = soup.findAll('a', {'class': 'products-list__item'})
        for link in product_links:
            yield Request(link['href'], callback=self.parse_product)

        product_id_match = re.search('"product_id":"([^"]*)"',
                                     response.body_as_unicode())
        if product_links or not product_id_match:
            return
        for item in self.parse_product(response):
            yield item

Example #8

0

Show file

File: visiondirect.py Project: oceancloud82/scraping

    def parse(self, response):
        """Request every category page linked from the start page.

        Menu links are collected via XPath first; anchors with class ``link``
        found by BeautifulSoup follow. All requests are handled by
        ``self.parse_category``.
        """
        soup = BeautifulSoup(response.body)

        menu_hrefs = response.xpath(
            '//div[contains(@class, "menu")]/@data-href').extract()
        menu_hrefs += response.xpath(
            '//ul[contains(@class, "menu")]//a/@href').extract()
        for href in menu_hrefs:
            yield Request(response.urljoin(href),
                          callback=self.parse_category)

        for anchor in soup.findAll('a', {'class': 'link'}):
            target = response.urljoin(anchor['href'])
            yield Request(target, callback=self.parse_category)

Example #9

0

Show file

File: shoemetro_spider.py Project: 0--key/lib

 def parse_items(self, response):
     """Request every product linked from a listing page.

     When the page number shown in the document differs from
     ``response.meta["cur"]`` and fewer than five attempts have been made,
     the same URL is requested again (with an ``&at=<attempt>`` suffix and
     the duplicate filter disabled) and nothing else is yielded.
     """
     base_url = get_base_url(response)
     hxs = HtmlXPathSelector(response)
     cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
     if cur_page and (int(cur_page[0]) != response.meta["cur"]) and (response.meta["attempt"] < 5):
         log.msg("WRONG PAGE! ONE MORE ATTEMPT to " + response.url)
         yield Request(
             response.url + "&at=" + str(response.meta["attempt"]),
             meta={"cur": response.meta["cur"], "attempt": response.meta["attempt"] + 1},
             dont_filter=True,
             callback=self.parse_items,
         )
         return
     soup = BeautifulSoup(response.body)
     # Product links are anchors that wrap a <b> and sit inside a
     # two-column-wide cell; anchors without an href are ignored so the
     # lookup below cannot raise KeyError.
     products = [
         a["href"]
         for a in soup.findAll(
             lambda tag: tag.name == "a"
             and tag.get("href")
             and tag.findChild("b")
             and tag.findParent("td", {"colspan": 2})
         )
     ]
     for url in products:
         # Reuse the base URL computed above instead of recomputing it.
         url = urljoin_rfc(base_url, url)
         yield Request(url, callback=self.parse_product)
     """trs = hxs.select('//div[@id="mainContent"]//table[@style="height:100%"]/tr')

Example #10

0

Show file

    def parse_product(self, response):
        """Yield one item per variant listed on a product page.

        A page without a ``div.product`` element is retried through
        ``_retry_page`` (or logged when no retry request is returned).
        Variants come either from the rows of the ``responsive-table`` table
        or, when that table is absent, from every combination of the
        ``div.option`` select boxes.
        """
        soup = BeautifulSoup(response.body)
        product_div = soup.find('div', attrs={'class': 'product'})
        if not product_div:
            retry_request = _retry_page(response)
            if not retry_request:
                self.log(
                    "Error parsing page, couldn't extract product name: %s" %
                    response.url)
            else:
                yield retry_request
            return

        main_name = remove_entities(product_div.h1.text)

        brand_el = soup.find(
            lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
        if brand_el:
            brand = brand_el.findNextSibling('td').text.strip()
        else:
            brand = ''

        trail = soup.find('div', attrs={'class': 'breadcrumbtrail'})
        # The first two breadcrumb links are dropped.
        cat_names = [
            crumb.a.text for crumb in trail.span.findAll('span') if crumb.a
        ][2:]

        image_tag = soup.find('img', {'itemprop': 'image'})
        image_url = image_tag['src'] if image_tag else None

        def shipping_for(amount):
            # Tiered shipping: the first limit the amount stays under wins;
            # amounts of 130 and above get no shipping cost.
            for limit, cost in ((15, 3), (40, 4), (130, 7)):
                if amount < limit:
                    return cost
            return None

        def build_item(name, identifier, price, shipping_cost):
            # Fill a loader with the fields shared by both variant sources.
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            return loader.load_item()

        table = soup.find('table', id='responsive-table')
        options = soup.findAll('div', attrs={'class': 'option'})

        if table:
            for row in table.findAll('tr'):
                # Rows without a <td> (the head row) carry no product.
                if not row.td:
                    continue

                name = remove_entities(
                    row.find('span', attrs={'class': 'name'}).text)
                if not _main_name_in_opt_name(main_name, name):
                    name = main_name + ' ' + name

                code_tag = row.find('span', attrs={'class': 'codenumber'})
                if not code_tag:
                    self.errors.append(
                        "Identifier not found for products on page: %s" %
                        response.url)
                    continue

                price = row.find(_is_price_tag).text
                shipping_cost = shipping_for(extract_price(price))
                yield build_item(name, code_tag.text, price, shipping_cost)
        elif options:
            main_id = response.url.split('.')[-2].split('p-')[-1]
            price = soup.find('span', attrs={'class': 'inctax'}).span.text
            shipping_cost = shipping_for(extract_price(price))

            # Option label -> selectable entries; entries whose value is '0'
            # are left out.
            results = {}
            for opt in options:
                opt_name = opt.label.span.text
                choices = []
                results[opt_name] = choices
                for subopt in opt.select.findAll('option'):
                    subopt_name = subopt.text
                    if _soup_el_get_attr(subopt, 'value') == '0':
                        continue
                    choices.append({
                        'id': remove_entities(subopt_name).replace('"', ''),
                        'name': opt_name + ': ' + subopt_name,
                    })

            for opt_tuple in product(*results.values()):
                name = _build_opt_name(main_name, opt_tuple)
                identifier = _build_opt_id(main_id, opt_tuple)
                yield build_item(name, identifier, price, shipping_cost)

Example #11

0

Show file

File: visiondirect.py Project: oceancloud82/scraping

 def parse(self, response):
     """Request every product linked from the page, passing the meta along."""
     # BeautifulSoup is used because the HTML is broken and lxml cannot
     # parse it.
     document = BeautifulSoup(response.body)
     for anchor in document.findAll('a', {'class': 'products-list__item'}):
         yield Request(anchor['href'],
                       callback=self.parse_product,
                       meta=response.meta)

Example #12

0

Show file

File: argonaut_liquor.py Project: 0--key/lib

    def parse_product(self, response):
        """Yield one item per price option of every result row on the page.

        Each ``div.itemResultsRow`` is handled in one of two ways:

        * without an ``mv_order_item`` select box, every ``td.priceCell``
          yields an item named ``brand title vintage [bottle size]``;
        * with the select box, every option whose text looks like
          ``price (size)`` yields an item named ``brand title size``.

        Items are only yielded when the loader produces a price. Missing
        brand/vintage spans fall back to an empty string; price cells without
        any price and options without the expected text are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        base_url = get_base_url(response)
        option_re = re.compile(r'(.*?) \((.*)\)')

        def span_text(link, css_class):
            # Stripped text of an optional <span class=...>, or u'' if absent.
            span = link.find('span', attrs={'class': css_class})
            return span.text.strip() if span is not None else u''

        def cell_price(cell):
            # The sale price wins over the retail price; None when the cell
            # holds neither.
            for p_class, span_class in (('priceCellP salePriceP', 'priceSale'),
                                        ('priceCellP', 'priceRetail')):
                paragraph = cell.find('p', attrs={'class': p_class})
                if paragraph is None:
                    continue
                span = paragraph.find('span', attrs={'class': span_class})
                if span is not None:
                    return span.text.strip()
            return None

        for product in soup.findAll('div', attrs={'class': 'itemResultsRow'}):
            title_link = product.find('div',
                                      attrs={'class': 'itemTitle'}).find('a')
            url = urljoin_rfc(base_url, title_link['href'])
            # The brand is optional in both branches; the title is required.
            brand = span_text(title_link, 'brand')
            title = title_link.find('span',
                                    attrs={'class': 'title'}).text.strip()

            dropdown = product.find('select', attrs={'name': 'mv_order_item'})
            if not dropdown:
                vintage_age = span_text(title_link, 'vintageAge')
                for option in product.findAll('td',
                                              attrs={'class': 'priceCell'}):
                    price = cell_price(option)
                    if price is None:
                        # Nothing to sell in this cell; do not abort the page.
                        continue
                    name = u'%s %s %s' % (brand, title, vintage_age)

                    bottle_size = option.find(
                        'p', attrs={'class': 'priceCellP priceUnit'})
                    if not bottle_size:
                        bottle_size = option.find(
                            lambda tag: tag.name == 'span' and tag.get(
                                'class', '') == 'priceUnit' and tag.findParent(
                                    'p', attrs={'class': 'priceCellP'}))
                    if bottle_size:
                        name += u' %s' % bottle_size.text.strip()

                    loader = ProductLoader(item=Product(), selector=option)
                    loader.add_value('url', url)
                    loader.add_value('name', name)
                    loader.add_value('price', price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()
            else:
                for option in dropdown.findAll('option'):
                    match = option_re.search(option.text)
                    if match is None:
                        # Entries without "price (size)" text, e.g. a
                        # placeholder, used to crash on ``.groups()``.
                        continue
                    price, size = match.groups()
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('url', url)
                    loader.add_value('name',
                                     u'%s %s %s' % (brand, title, size))
                    loader.add_value('price', price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

Example #13

0

Show file

File: argonaut_liquor.py Project: ontiyonke/lib

    def parse_product(self, response):
        """Yield one item per price option of every result row on the page.

        Each ``div.itemResultsRow`` is handled in one of two ways:

        * without an ``mv_order_item`` select box, every ``td.priceCell``
          yields an item named ``brand title vintage [bottle size]``;
        * with the select box, every option whose text looks like
          ``price (size)`` yields an item named ``brand title size``.

        Items are only yielded when the loader produces a price. Missing
        brand/vintage spans fall back to an empty string; price cells without
        any price and options without the expected text are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        base_url = get_base_url(response)
        option_re = re.compile(r'(.*?) \((.*)\)')

        def span_text(link, css_class):
            # Stripped text of an optional <span class=...>, or u'' if absent.
            span = link.find('span', attrs={'class': css_class})
            return span.text.strip() if span is not None else u''

        def cell_price(cell):
            # The sale price wins over the retail price; None when the cell
            # holds neither.
            for p_class, span_class in (('priceCellP salePriceP', 'priceSale'),
                                        ('priceCellP', 'priceRetail')):
                paragraph = cell.find('p', attrs={'class': p_class})
                if paragraph is None:
                    continue
                span = paragraph.find('span', attrs={'class': span_class})
                if span is not None:
                    return span.text.strip()
            return None

        for product in soup.findAll('div', attrs={'class': 'itemResultsRow'}):
            title_link = product.find('div',
                                      attrs={'class': 'itemTitle'}).find('a')
            url = urljoin_rfc(base_url, title_link['href'])
            # The brand is optional in both branches; the title is required.
            brand = span_text(title_link, 'brand')
            title = title_link.find('span',
                                    attrs={'class': 'title'}).text.strip()

            dropdown = product.find('select', attrs={'name': 'mv_order_item'})
            if not dropdown:
                vintage_age = span_text(title_link, 'vintageAge')
                for option in product.findAll('td',
                                              attrs={'class': 'priceCell'}):
                    price = cell_price(option)
                    if price is None:
                        # Nothing to sell in this cell; do not abort the page.
                        continue
                    name = u'%s %s %s' % (brand, title, vintage_age)

                    bottle_size = option.find(
                        'p', attrs={'class': 'priceCellP priceUnit'})
                    if not bottle_size:
                        bottle_size = option.find(
                            lambda tag: tag.name == 'span' and tag.get(
                                'class', '') == 'priceUnit' and tag.findParent(
                                    'p', attrs={'class': 'priceCellP'}))
                    if bottle_size:
                        name += u' %s' % bottle_size.text.strip()

                    loader = ProductLoader(item=Product(), selector=option)
                    loader.add_value('url', url)
                    loader.add_value('name', name)
                    loader.add_value('price', price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()
            else:
                for option in dropdown.findAll('option'):
                    match = option_re.search(option.text)
                    if match is None:
                        # Entries without "price (size)" text, e.g. a
                        # placeholder, used to crash on ``.groups()``.
                        continue
                    price, size = match.groups()
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('url', url)
                    loader.add_value('name',
                                     u'%s %s %s' % (brand, title, size))
                    loader.add_value('price', price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

Example #14

0

Show file

File: argonaut_liquor.py Project: oceancloud82/scraping

    def parse_product(self, response):
        """Yield a Product for every priced option found on the page.

        Two layouts are handled:

        * Listing pages: each ``div.itemResultsRow`` is one product and its
          URL is taken from the link inside ``div.itemTitle``.
        * Single-product pages (no ``itemResultsRow`` found): the fields are
          read from the whole document and ``response.url`` is used.

        In both layouts the options come from the
        ``<select name="mv_order_item">`` dropdown when there is one, and
        from the ``td.priceCell`` cells otherwise.  Dropdown options whose
        text cannot be parsed and price cells without a price are skipped.
        An item is yielded only if its loader produces a price.  Nothing is
        yielded for responses that are not ``HtmlResponse`` instances.

        Raises:
            AttributeError, TypeError: if a mandatory element (the title
                block, the product link, or the ``priceArea`` block on a
                single-product page) is missing.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)

        # Dropdown option text is parsed as "$<price> (<size>) <sku>".
        option_re = re.compile(r'(\$[\d\.]*) \(([^)]*)\) (.*)$')

        def span_text(box, css_class):
            # Stripped text of the first <span class=css_class> inside box.
            # Raises AttributeError when box is None or has no such span.
            return box.find('span', attrs={'class': css_class}).text.strip()

        def optional_span_text(box, css_class):
            # Same as span_text(), but a missing element gives u''.
            try:
                return span_text(box, css_class)
            except AttributeError:
                return u''

        def cell_price(cell):
            # Sale price when the cell has one, otherwise the retail price;
            # None when the cell carries neither.
            lookups = (('priceCellP salePriceP', 'priceSale'),
                       ('priceCellP', 'priceRetail'))
            for p_class, span_class in lookups:
                try:
                    return cell.find('p', attrs={'class': p_class}).find(
                        'span', attrs={'class': span_class}).text.strip()
                except AttributeError:
                    continue
            return None

        def price_cell_items(cells, url, base_name):
            # One item per <td class="priceCell">, named base_name plus the
            # bottle size when the cell shows one.
            for cell in cells:
                price = cell_price(cell)
                if price is None:
                    # Previously this aborted the whole page with an
                    # AttributeError; a cell without a price has nothing
                    # to report, so it is skipped instead.
                    continue
                try:
                    sku = cell.find('p', attrs={
                        'class': 'priceCellP itemid'
                    }).text.strip()
                except AttributeError:
                    sku = ''
                bottle_size = cell.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    # Fall back to a <span class="priceUnit"> nested in a
                    # plain <p class="priceCellP">.
                    bottle_size = cell.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                name = base_name
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()

                loader = ProductLoader(item=Product(), selector=cell)
                loader.add_value('url', url)
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()

        def dropdown_options(dropdown):
            # Yield (option tag, price, size, sku) for every <option> whose
            # text matches option_re.  Options that do not match are
            # skipped rather than raising AttributeError on ``None.groups()``.
            for option in dropdown.findAll('option'):
                match = option_re.search(option.text)
                if match is None:
                    continue
                price, size, sku = match.groups()
                yield option, price, size, sku

        try:
            products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
        except AttributeError:
            products = []

        for product in products:
            title_link = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a')
            url = urljoin_rfc(get_base_url(response), title_link['href'])

            brand = optional_span_text(title_link, 'brand')
            title = span_text(title_link, 'title')
            vintage_age = optional_span_text(title_link, 'vintageAge')

            dropdown = product.find('select', attrs={'name': 'mv_order_item'})
            if not dropdown:
                cells = product.findAll('td', attrs={'class': 'priceCell'})
                base_name = u'%s %s %s' % (brand, title, vintage_age)
                for item in price_cell_items(cells, url, base_name):
                    yield item
            else:
                for _, price, size, sku in dropdown_options(dropdown):
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('url', url)
                    # NOTE(review): unlike the single-product branch below,
                    # the vintage is not part of the name here and the size
                    # is not stripped -- kept as it was; confirm intended.
                    loader.add_value('name',
                                     u'%s %s %s' % (brand, title, size))
                    loader.add_value('price', price)
                    loader.add_value('sku', sku)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

        if not products:
            # Single-product page: the title block has no wrapping link.
            url = response.url
            title_box = soup.find('div', attrs={'class': 'itemTitle'})
            brand = optional_span_text(title_box, 'brand')
            title = span_text(title_box, 'title')
            vintage_age = optional_span_text(title_box, 'vintageAge')
            base_name = u'%s %s %s' % (brand, title, vintage_age)

            dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
            if not dropdown:
                cells = soup.find('div', attrs={
                    'class': 'priceArea'
                }).findAll('td', attrs={'class': 'priceCell'})
                for item in price_cell_items(cells, url, base_name):
                    yield item
            else:
                for option, price, size, sku in dropdown_options(dropdown):
                    # The loader gets the <option> tag itself; the old code
                    # had rebound ``option`` to the regex groups and passed
                    # that tuple as the selector by accident.
                    loader = ProductLoader(item=Product(), selector=option)
                    loader.add_value('url', url)
                    loader.add_value('name',
                                     u'%s %s' % (base_name, size.strip()))
                    loader.add_value('price', price)
                    loader.add_value('sku', sku)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

Example #15

0

Show file

    def parse_product(self, response):
        """Yield the product scraped from a product page, retrying if needed.

        The fields are first extracted with XPath.  If that produces no
        name, the page is parsed again with BeautifulSoup and the same
        fields are looked up in the soup; elements that cannot be found
        there are simply left out of the item.  The item is yielded when it
        has an identifier.  If it still has no name, ``self.retry`` is asked
        for a retry request, which is yielded when one is returned.
        """
        hxs = HtmlXPathSelector(text=response.body_as_unicode())

        loader = ProductLoader(response=response, item=Product())

        loader.add_value('url', response.url)
        identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

        price = ''.join(
            hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                       ).extract()).strip()
        loader.add_value('price', price)

        # The XPath already excludes the "home" crumb; one more leading
        # entry is dropped.
        categories = hxs.select(
            '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
        ).extract()[1:]
        loader.add_value('category', categories)

        image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))

        brand = hxs.select(
            '//li[contains(text(), "BRAND")]/span/text()').extract()
        loader.add_value('brand', brand)

        item = loader.load_item()

        if not item.get('name'):
            # Second attempt with BeautifulSoup.  Every lookup is guarded:
            # this path runs exactly when the page did not parse as
            # expected, and an AttributeError on a missing element would
            # prevent the retry below from ever being requested.
            log.msg('Using BeautifulSoup: ' + response.url)
            loader = ProductLoader(response=response, item=Product())
            soup = BeautifulSoup(response.body)

            loader.add_value('url', response.url)
            identifier = soup.find('input', attrs={'id': 'catentryId'})
            identifier = _soup_el_get_attr(identifier, 'value')
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)

            name_el = soup.find('h1', attrs={'itemprop': 'name'})
            if name_el is not None:
                loader.add_value('name', name_el.text)

            breadcrumbs = soup.find('ul', attrs={'class': 'breadcrumbs'})
            if breadcrumbs is not None:
                # No class filter here, so two leading crumbs are dropped
                # instead of one.  Slice first so that a crumb without a
                # <span> does not shift the positions.
                crumb_spans = [
                    li.a.span for li in breadcrumbs.findAll('li') if li.a
                ][2:]
                categories = [
                    span.text for span in crumb_spans if span is not None
                ]
                loader.add_value('category', categories)

            price_box = soup.find('div', attrs={'itemprop': 'price'})
            price_el = None
            if price_box is not None:
                price_el = price_box.find('span', attrs={'class': 'price'})
            if price_el is not None:
                loader.add_value('price', price_el.text)

            image_url = soup.find('img', attrs={'id': 'productMainImage'})
            if image_url:
                image_url = _soup_el_get_attr(image_url, 'src')
                loader.add_value(
                    'image_url', urljoin_rfc(get_base_url(response),
                                             image_url))

            brand = ''
            for li in soup.findAll('li'):
                # Take the first "BRAND" entry that has a <span> to read.
                if li.span is not None and 'BRAND' in li.text.upper():
                    brand = li.span.text
                    break

            loader.add_value('brand', brand)
            item = loader.load_item()

        # .get(): the field is absent from the item when no identifier was
        # loaded, and item['identifier'] would raise KeyError.
        if item.get('identifier'):
            yield item

        if not item.get('name'):
            request = self.retry(response,
                                 "No name for product: " + response.url)
            if request:
                yield request

Example #16

0

Show file

    def parse_date(self, response):
        """Yield ticket-price items parsed from a JSON response.

        ``response.body`` is decoded as JSON.  Nothing is yielded when its
        ``error`` entry is truthy, when ``html`` is missing or empty, or
        when the HTML cannot be parsed.

        Each ``td.table_desc`` cell of the HTML labels one price row; the
        price is the text of the third ``<td>`` of that row.  The first
        label mentioning an adult keyword and the first mentioning a child
        keyword become the plain "Adult" and "Child" items.  Every other
        label is skipped if it mentions an excluded keyword, and otherwise
        yields an extra item whose identifier and name carry the label,
        provided the loader produces a price.  Rows with fewer than three
        cells have no price and are skipped.

        ``response.meta`` must provide ``product_id``, ``date``, ``url``,
        ``location`` and ``name``.
        """
        res = json.loads(response.body)
        if res.get('error'):
            return
        html = res.get('html')
        if not html:
            return
        try:
            soup = BeautifulSoup(html)
        except Exception:
            return

        adult_ids = ['adult']
        child_ids = ['children', 'child', 'junior']
        # Unicode literals with the n-tilde of "nino" escaped, so matching
        # against the (unicode) cell text needs neither a per-item
        # .decode('utf8') nor a particular source-file encoding.
        excluded_ids = [u'concession', u'student', u'infant', u'ni\xf1o']

        def mentions(label, keywords):
            # True when any keyword occurs in the lower-cased label.
            return any(word in label for word in keywords)

        def row_price(cell):
            # Text of the third <td> in the cell's row, or None when the
            # row has fewer than three cells.
            row_cells = cell.parent.findAll('td')
            if len(row_cells) < 3:
                return None
            return row_cells[2].text

        def build_loader(id_suffix, ticket_type, price, name):
            # Loader pre-filled with the fields shared by every ticket item.
            loader = ProductLoader(item=Product(),
                                   selector=HtmlXPathSelector())
            loader.add_value(
                'identifier', response.meta['product_id'] + ':' +
                response.meta['date'] + ':' + id_suffix)
            loader.add_value('url', response.meta['url'])
            loader.add_value('sku', response.meta['date'])
            loader.add_value('category', response.meta['location'])
            loader.add_value('brand', ticket_type)
            loader.add_value('price', price)
            loader.add_value('name', name)
            return loader

        all_prices = soup.findAll('td', {'class': 'table_desc'})
        adult_price = None
        child_price = None

        remaining_prices = []
        for p in all_prices:
            label = p.text.lower()
            if not adult_price and mentions(label, adult_ids):
                adult_price = row_price(p)
            elif not child_price and mentions(label, child_ids):
                child_price = row_price(p)
            else:
                remaining_prices.append(p)

        if adult_price:
            yield build_loader('Adult', 'Adult', adult_price,
                               response.meta['name']).load_item()
        if child_price:
            yield build_loader('Child', 'Child', child_price,
                               response.meta['name']).load_item()

        for p in remaining_prices:
            label = p.text.lower()
            if mentions(label, excluded_ids):
                continue

            price = row_price(p)
            if price is None:
                continue

            ticket_type = 'Child' if mentions(label, child_ids) else 'Adult'
            loader = build_loader(ticket_type + ':' + label, ticket_type,
                                  price,
                                  response.meta['name'] + ' - ' + p.text)
            if loader.get_output_value('price'):
                yield loader.load_item()