Beispiel #1
0
    def parse_product(self, response):
        """Fill in a product from its detail page and request its reviews.

        Reads the product from ``response.meta['product']`` and sets its
        description, brand, locale and model.

        Returns the review-API ``Request`` (carrying the product in its
        meta), ``None`` when the page exposes no part number, or the product
        itself when no review URL could be built.
        """
        item = response.meta['product']

        # Description: all non-blank child nodes of the disclaimer block.
        desc_nodes = response.xpath(
            '//div[@class="product_disclaimer"]/node()[normalize-space()]'
        ).extract()
        cond_set_value(item, 'description', desc_nodes, ''.join)

        brand_matches = response.css(
            '.at-a-glance span::text').re('Brand (.+)')
        cond_set(item, 'brand', brand_matches)
        cond_set_value(item, 'locale', u'en-GB')

        # The part-number attribute is looked up under two spellings.
        model = (
            response.css('.product-detail::attr(partnumber)').extract()
            or response.css('.product-detail::attr(partNumber)').extract()
        )
        if not model:
            self.log('Could not find partNumber')
            return
        item["model"] = is_empty(model)

        review_url = self.REVIEW_API_URL.format(
            model=model[0], apipass=self.REVIEW_API_PASS)
        if not review_url:
            cond_set_value(item, 'buyer_reviews', ZERO_REVIEWS_VALUE)
            return item

        return Request(
            review_url,
            callback=self.parse_buyer_reviews,
            dont_filter=True,
            meta={"product": item},
        )
Beispiel #2
0
 def _scrape_product_links(self, response):
     """Yield ``(url, product)`` pairs for the product boxes on the page.

     Each ``.product-description`` box contributes its ``h3/a`` href and a
     fresh ``SiteProductItem`` whose brand comes from the box's ``p`` text.
     Boxes without a link are skipped.
     """
     for box in response.css('.product-description'):
         links = box.xpath('h3/a/@href').extract()
         if not links:
             # Indexing an empty list would raise IndexError and abort the
             # generator, losing every remaining box on the page.
             continue
         product = SiteProductItem()
         cond_set(product, 'brand', box.xpath('p/text()').extract())
         yield links[0], product
Beispiel #3
0
 def _populate_from_html(self, response, product):
     """Populate product fields from the detail page markup and URL.

     Sets title, brand, description, image URL, model and reseller id, then
     delegates to ``_populate_related_products``.
     """
     cond_set(
         product, 'title',
         response.xpath(
             '//h1[contains(@class, "search-prod-desc")]/text()').extract())

     brand = response.xpath(
         '//div[@id="dotcombrand"]/../preceding-sibling::li[1]/text()'
     ).extract()
     if not brand:
         brand = response.xpath('//p[@class="brand-name"]/text()').extract()
         if brand:
             # Keep the text after the first colon when there is one,
             # otherwise the whole text.
             pieces = brand[0].split(':')
             brand = [pieces[0] if len(pieces) == 1 else pieces[1]]
     cond_set(product, 'brand', brand)

     desc_xpath = (
         '//h3[text()="Description"]'
         '/following-sibling::p[normalize-space()] |'
         '//div[contains(@class, "product-details-desc")]'
     )
     cond_set(product, 'description', response.xpath(desc_xpath).extract())

     # Image URL and model are embedded in the raw page body.
     cond_set(product, 'image_url',
              re.findall("enlargedImageURL = '([^']*)'", response.body))
     cond_set(product, 'model',
              re.findall('"model" : "([^"]*)"', response.body))

     sku_ids = re.findall(r"currentSKUNbr=(\d+)", response.url)
     cond_set_value(product, "reseller_id",
                    sku_ids[0] if sku_ids else None)

     self._populate_related_products(response, product)
Beispiel #4
0
    def _populate_from_html(self, response, product):
        """Populate product fields from the detail page.

        Open Graph data is applied first, then title, reseller id, price
        (USD) and model are read from the markup and URL. Finally related
        products and hard-coded fields are filled in by the sibling helpers.
        """
        _populate_from_open_graph_product(response, product)

        sidebar = '#productDetailsLeftSidebar .inner-container '
        cond_set(product, 'title',
                 response.css(sidebar + 'h1::text').extract(), unicode.strip)
        if not product.get("title"):
            # Alternative title location.
            alt_title = response.xpath(
                "//h1[contains(@class, 'prod_name')]/text()").extract()
            if alt_title:
                cond_set(product, 'title', alt_title, unicode.strip)

        url_ids = re.findall(r"\/_\/([^?$\s]+)", response.url)
        cond_set_value(product, "reseller_id",
                       url_ids[0] if url_ids else None)

        price_xpath = (
            '//div[@id="productPrice"]'
            '/div[contains(@class, "display_price")]/input/@value |'
            '//div[@id="productPrice"]/span[last()]/text()'
        )
        prices = response.xpath(price_xpath).extract()
        if prices:
            amount = prices[0].replace("$", "").strip()
            product["price"] = Price(priceCurrency='USD', price=amount)

        style_text = response.css('#storeStyleNumber::text').extract()
        if style_text:
            match = re.search(r'Store Style #:\xa0(.+)', style_text[0])
            cond_set_value(product, 'model', match,
                           lambda m: m.group(1))

        self._populate_related_products(response, product)
        self._populate_hardcoded_fields(product)
Beispiel #5
0
 def _populate_from_box(self, response, box, product):
     """Populate title, price and availability flags from a listing box.

     ``is_in_store_only`` is set to whether exactly one ``.available``
     element exists in the box; ``is_out_of_stock`` to whether none does.
     """
     title = box.css('.productTitle a::text').extract()
     cond_set(product, 'title', title)

     price = box.css('.currentPrice ins::text').extract()
     cond_set(product, 'price', price, unicode.strip)

     available = box.css('.availability .available')
     cond_set_value(product, 'is_in_store_only', len(available) == 1)
     cond_set_value(product, 'is_out_of_stock', not available)
Beispiel #6
0
    def parse_product(self, response):
        """Populate a product from its detail page and queue follow-up calls.

        Sets title, availability flag, image URL, URL, locale, description,
        model and reseller id on ``response.meta['product']``. When a product
        id can be read from the URL, requests for the review API and the
        price/variants API are appended to the pending request list.

        Returns the result of ``send_next_request`` when requests are
        pending, otherwise the product itself.
        """
        prod = response.meta['product']
        reqs = response.meta.get('reqs', [])

        title = response.xpath('//*[@id="productName"]/text()'
                               '|//*[@class="wag-prod-title"]/text()').extract()
        title = [x.strip() for x in title if x.strip()]
        cond_set(prod, 'title', title)

        no_longer_available = bool(response.xpath(
            '//*[@role="alert"]/span[contains'
            '(text(),"no longer available")]'))
        cond_set_value(prod, 'no_longer_available', no_longer_available)

        img_url = response.xpath(
            '//img[@id="main-product-image"]/@data-src').extract()
        if img_url:
            prod['image_url'] = urlparse.urljoin(self.site, img_url[0])

        prod['url'] = response.url
        prod['locale'] = 'en-US'

        cond_set_value(
            prod,
            'description',
            ''.join(
                response.xpath('//div[@id="description-content"]').extract()),
        )

        cond_set(
            prod,
            'model',
            response.xpath(
                '//section[@class="panel-body wag-colornone"]/text()'
            ).re(r'Item Code: (\d+)')
        )

        reseller_id = re.findall(r"[Ii][Dd]=prod(\d+)", response.url)
        cond_set_value(prod, "reseller_id",
                       reseller_id[0] if reseller_id else None)

        # The API URLs need the product id from the page URL. Without a
        # match, indexing [0] would raise IndexError and lose the product,
        # so the API requests are skipped instead.
        id_matches = re.findall('ID=(.*)-', response.url)
        if id_matches:
            prod_id = id_matches[0]
            reqs.append(Request(self.REVIEW_API_URL.format(prod_id=prod_id),
                                meta=response.meta,
                                callback=self._parse_review_api))
            reqs.append(
                Request(self.PRICE_VARI_API_URL.format(prod_id=prod_id),
                        meta=response.meta,
                        callback=self._parse_price_and_variants))

        if reqs:
            return self.send_next_request(reqs, response)
        return prod
Beispiel #7
0
    def parse_product(self, response):
        """Populate a product from its detail page and return it.

        Sets title, price (GBP), image URL, description, locale, reseller id
        and brand on ``response.meta['product']``.
        """
        product = response.meta['product']

        title_parts = response.xpath(
            "//h1[@class='productTitle'][1]//text()").extract()
        if len(title_parts) >= 2:
            # Only the last two text nodes make up the title.
            cond_set_value(product, 'title',
                           self.clear_desc(title_parts[-2:]))

        price_values = response.xpath(
            "//div[@id='bopRight']//meta[@itemprop='price']/@content"
        ).extract()
        cond_set(product, 'price', price_values)

        raw_price = product.get('price', None)
        if raw_price:
            if isinstance(raw_price, str):
                raw_price = raw_price.decode('utf8')
                product['price'] = raw_price
            if u'£' in raw_price:
                amount = raw_price.replace(u'£', '').replace(
                    ' ', '').replace(',', '').strip()
                product['price'] = Price(priceCurrency='GBP', price=amount)
            else:
                self.log('Unknown currency at %s' % response.url, level=ERROR)

        gallery = response.xpath(
            "//ul[@id='galleryImages']/li[1]/a/@href").extract()
        if gallery:
            cond_set_value(product, 'image_url',
                           urlparse.urljoin(response.url, gallery[0]))

        desc_xpath = (
            "//div[@id='bopBottom']"
            "//h2[@class='bopSectionHeader' and text()[1]='Product Description'][1]"
            "/following-sibling::*[@class='bopSection']"
            "//text()"
        )
        cond_set_value(product, 'description',
                       self.clear_desc(response.xpath(desc_xpath).extract()))

        cond_set_value(product, 'locale', "en_GB")

        # First run of digits following a slash in the URL.
        url_ids = re.findall(r"\/(\d+)", response.url)
        cond_set_value(product, "reseller_id",
                       url_ids[0] if url_ids else None)

        brand = response.xpath(
            "string(//div[@id='bopBottom']//*[@itemprop='brand'])").extract()
        cond_set(product, 'brand', brand, string.strip)

        return product
Beispiel #8
0
 def _scrape_product_links(self, response):
     """Yield ``(url, product)`` pairs for every ``.product`` box.

     Each product is pre-filled with the title and price shown in the box.
     Boxes without a title link are skipped.
     """
     for box in response.css('.product'):
         links = box.css('.title a::attr(href)').extract()
         if not links:
             # Indexing an empty list would raise IndexError and abort the
             # generator, losing every remaining box on the page.
             continue
         product = SiteProductItem()
         cond_set(product, 'title', box.css('.title a::text').extract())
         cond_set(product, 'price',
                  box.css('.price .main::text').extract(), string.strip)
         yield links[0], product
Beispiel #9
0
 def _populate_from_box(self, response, box, product):
     """Populate stock flag, title and price from a listing box.

     ``is_out_of_stock`` is set to ``False`` when the box has no red notice
     and to ``True`` when the first red notice contains the out-of-stock
     message; any other red notice leaves the flag untouched.
     """
     notices = box.xpath('..//span[@class="red"]/text()').extract()
     if not notices:
         cond_set_value(product, 'is_out_of_stock', False)
     elif 'Currently out of stock' in notices[0]:
         cond_set_value(product, 'is_out_of_stock', True)

     cond_set(product, 'title', _itemprop(box, 'name'), unicode.strip)
     # The title is always (re)written, defaulting to an empty string.
     current_title = product.get('title', '')
     product['title'] = _strip_non_ascii(current_title)

     cond_set(product, 'price', _itemprop(box, 'price'))
Beispiel #10
0
    def parse_product(self, response):
        """Populate a product from its detail page and request its JS data.

        Sets price (USD), title, brand, description and locale on
        ``response.meta['product']``.

        Returns ``None`` for out-of-stock result pages, otherwise a
        ``Request`` for ``PRODUCT_URL_JS`` handled by ``_parse_product_js``.
        """
        product = response.meta['product']
        vid = response.meta['vid'] if "vid" in response.meta else 1

        if 'OutOfStockNoResults' in response.url:
            self.log("Product OutOfStock %s %s" % (response.url, product),
                     DEBUG)
            return

        if not product.get("price"):
            price = is_empty(
                response.xpath(
                    "//span[@id='priceText']/text() |"
                    "//div[@id='tabWindow']/noscript"
                ).extract(),
                ""
            )
            # The decimal matches are reversed before picking one.
            # NOTE(review): is_empty presumably returns the first element,
            # i.e. the last decimal number in the text — confirm.
            price = is_empty(re.findall(r"\d+\.\d+", price)[::-1])
            if price:
                product["price"] = Price(price=price, priceCurrency="USD")

        title = product.get('title')
        if isinstance(title, str):
            product['title'] = title.decode('utf-8', 'ignore')
            title = product.get('title')
        else:
            title = is_empty(
                response.xpath(
                    "//div[@id='productNameText']/h1/text()").extract())
            if title:
                product["title"] = title

        # The title may be missing altogether; calling .find() on it
        # unconditionally would raise AttributeError and lose the product.
        if title:
            brindex = title.find("&#153")
            if brindex > 1:
                # Text in front of the "&#153" entity is used as the brand.
                cond_set_value(product, 'brand', title[:brindex])
        cond_set_value(product, 'brand', self.BRAND)
        cond_set(product, 'description',
                 response.xpath("//div[@id='tabWindow']").extract())
        product['locale'] = "en-US"

        new_meta = response.meta.copy()
        pid = product.get('upc')
        if not pid:
            # NOTE(review): when the URL has no pid either, an empty list is
            # formatted into the URL below — confirm this is acceptable.
            pid = re.findall(r"pid=(\d+)", response.url)
            if pid:
                pid = pid[0]
        url = self.PRODUCT_URL_JS.format(pid=pid, vid=vid)
        return Request(url,
                       callback=self._parse_product_js,
                       meta=new_meta,
                       priority=100)
Beispiel #11
0
    def _populate_from_js(self, response, product):
        """Populate UPC, brand and price (TRY) from the ``utag_data`` script.

        Logs a warning and leaves the product untouched when no matching
        script element is found.
        """
        utag_scripts = response.xpath(
            "//script[contains(text(), 'var utag_data=')]")
        if not utag_scripts:
            self.log("No JS matched in %s." % response.url, WARNING)
            return

        field_patterns = (
            ('upc', "product_sku:'(.+)[']"),
            ('brand', "product_brand:'(.+)[']"),
        )
        for field, pattern in field_patterns:
            cond_set(product, field, utag_scripts.re(pattern))

        price_matches = utag_scripts.re("product_price:'(.+)[']")
        if price_matches:
            product['price'] = Price(price=price_matches[0],
                                     priceCurrency='TRY')
Beispiel #12
0
    def parse_product(self, response):
        """Set the product title from the page and return the product.

        Returns ``None`` without touching anything when the response URL
        equals ``self.product_url``.
        """
        if response.url == self.product_url:
            return None

        product = response.meta['product']
        name_xpath = (
            "//div[contains(@class,'prodTitle')]/h1/span[@itemprop='name']"
            "/text()"
        )
        cond_set(product, 'title', response.xpath(name_xpath).extract())

        # Title key must be present even if it is blank
        cond_set_value(product, 'title', "")
        return product
Beispiel #13
0
 def _populate_from_html(self, response, product):
     """Populate product fields from the detail page.

     Sets reseller id, title, brand, price (IDR), image URL and description,
     then derives the model via ``_get_model_from_title``.
     """
     cond_set(product, 'reseller_id',
              re.findall(r'\/sku(\d+)', response.url))

     text_fields = (
         ('title', '[itemprop=name]::text'),
         ('brand', '#ctl00_content_lnkBrand::text'),
         ('price', '[itemprop=price]::text'),
     )
     for field, selector in text_fields:
         cond_set(product, field, response.css(selector).extract())

     raw_price = product.get('price', '')
     if raw_price and not isinstance(raw_price, Price):
         if 'Rp' in raw_price:
             amount = raw_price.lower().replace(
                 'rp', '').replace(',', '').strip()
             product['price'] = Price(price=amount, priceCurrency='IDR')
         else:
             self.log('Unrecognized currency at %s' % response.url)

     cond_replace(product, 'image_url',
                  response.css('#prodMedia img::attr(src)').extract())

     # Append the specification block to any description already present.
     spec_blocks = response.css('.spesifications').extract()
     spec_html = spec_blocks[0] if spec_blocks else ''
     cond_replace_value(
         product, 'description',
         product.get('description', '') + spec_html.strip())

     self._get_model_from_title(product)
    def _populate_from_html(self, response, product):
        """Populate product fields from the detail page.

        Sets title, price, description, image URL, reseller id and brand,
        normalizes the price via ``_unify_price`` and defaults the locale to
        ``en_GB`` when none is set.
        """
        cond_set(product, 'title',
                 response.css('.productSummary h1::text').extract())
        cond_set(product, 'price',
                 response.css('.pricePerUnit::text').extract(), unicode.strip)
        cond_set(product, 'price',
                 response.css('.pricing [class*=pricePer]').extract(),
                 unicode.strip)
        xpath = '//*[@id="information"]' \
                '/node()[not(@class="access")][normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)
        cond_replace(
            product, 'image_url',
            response.css('#productImageHolder img::attr(src)').extract(),
            lambda url: urlparse.urljoin(response.url, url))

        reseller_id = response.xpath('.//*[@class="skuCode"]/text()').extract()
        cond_set(product, 'reseller_id', reseller_id, string.strip)

        # The title is only set conditionally above, so it may be absent;
        # product['title'] would then raise KeyError. The brand is guessed
        # only when there is a title to guess from.
        title = product.get('title')
        if title:
            brand = guess_brand_from_first_words(title, max_words=15)
            cond_set_value(product, 'brand', brand)
        self._unify_price(product)

        if not product.get("locale"):
            product["locale"] = "en_GB"
Beispiel #15
0
    def _populate_from_html(self, response, product):
        """Populate price, description, UPC, title and image URL from HTML.

        The description is looked up in two places because pages differ; the
        price is normalized through ``_unify_price``.
        """
        cond_set(
            product, 'price',
            response.xpath("//*[@id='priceDivClass']/span/text()").extract())

        # The description is a possible <p> or just the text of the class,
        # each page is different.
        primary_desc = response.xpath("//*[@class='pIdDesContent']").extract()
        cond_set_value(product, 'description', primary_desc, conv=''.join)
        if not primary_desc:
            fallback_desc = response.xpath(
                "//div[@class='descriptContent']").extract()
            if fallback_desc:
                # Drop the current entry before setting the fallback one.
                del product['description']
                cond_set(product, 'description', fallback_desc)

        cond_set(product, 'upc',
                 response.xpath("//*[@class='skuHidden']/@value").extract())

        # Override the title from other sources. This is the one we want.
        cond_set(product, 'title',
                 response.css('.productTitle h1 ::text').extract())
        self._unify_price(product)

        images = response.xpath("//div[contains(@class,'productDetailPic')]"
                                "/div/a/img/@src").extract()
        if images:
            src = images[0]
            # Give protocol-relative URLs an explicit scheme.
            product['image_url'] = (
                'http:' + src if src.startswith("//") else src)
Beispiel #16
0
    def _populate_from_html(self, response, product):
        """Populate product fields from the detail page.

        Sets image URL, Open Graph data, price, brand, title, description and
        reseller id, then normalizes the price through ``_unify_price``. The
        page URL is passed to ``dump_url_to_file`` when no brand was found.
        """
        image_sources = response.css('[itemprop=image]::attr(src)').extract()
        cond_set(product, 'image_url', image_sources,
                 lambda src: urlparse.urljoin(response.url, src))

        _populate_from_open_graph_product(response, product)

        cond_set(product, 'price',
                 response.css('.currentPrice ins::text').extract(),
                 unicode.strip)

        cond_set(product, 'brand',
                 response.css('[itemprop=brand]::text').extract())
        if not product.get('brand', None):
            dump_url_to_file(response.url)

        cond_set(product, 'title',
                 response.css('[itemprop=name]::text').extract())

        articles = response.css('#longDesc article').extract()
        cond_set_value(product, 'description',
                       articles[0] if articles else None)

        # Digits directly in front of "-pdt" in the URL.
        url_ids = re.findall(r"(\d+)-pdt", response.url)
        cond_set_value(product, 'reseller_id',
                       url_ids[0] if url_ids else None)

        self._unify_price(product)
Beispiel #17
0
    def parse_buyer_reviews(self, response):
        """Parse the ratings summary and attach it to the product.

        Reads the average rating and the rating count ("Based on N Ratings")
        from the ratings-summary block. When both are found, a dict with
        ``num_of_reviews``, ``average_rating`` and an empty ``buyer_reviews``
        mapping is set as the product's ``buyer_reviews``.

        Returns the product from ``response.meta`` in every case.
        """
        product = response.meta.get("product")

        summary = "//div[contains(@class,'ratings-summary')]"
        avg = response.xpath(
            summary + "//p[@class='ratingNumber']/span/text()").extract()
        total_data = response.xpath(
            summary + "//p[@itemprop='reviewCount']/text()").extract()
        total = (re.findall(r"Based on ([\d,]+) Ratings", total_data[0])
                 if total_data else [])

        # Any of the lookups may come back empty; indexing [0] blindly would
        # raise IndexError and lose the product.
        if avg and total:
            reviews = {
                'num_of_reviews': total[0].replace(",", ""),
                'average_rating': avg[0],
                'buyer_reviews': {},
            }
            # The parsed summary used to be discarded (None was passed
            # here); hand it over as a one-element list for cond_set.
            cond_set(product, 'buyer_reviews', [reviews])

        return product
Beispiel #18
0
    def _price_from_html(self, response, product):
        """Read the price from the page and wrap it in a ``Price``.

        The currency comes from the ``priceCurrency`` itemprop and defaults
        to ``'EUR'``. The price is only wrapped when, after turning decimal
        commas into dots, it is a plain number.
        """
        main_price = response.css(
            '.product-price-bol [itemprop=price]::attr(content)').extract()
        cond_replace(product, 'price', main_price)

        offer_price = response.xpath(
            "//span[@class='offer_price']/meta[@itemprop='price']/@content"
        ).extract()
        cond_set(product, 'price', offer_price)

        currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
        if currency_sel:
            currency = currency_sel.extract()[0]
        else:
            currency = 'EUR'

        amount = product.get('price', '').replace(',', '.')
        if amount and re.match(r' *\d+\.?\d* *\Z', amount):
            cond_replace_value(product, 'price', Price(currency, amount))
Beispiel #19
0
 def _populate_from_box(self, response, box, product):
     """Populate title and price from a listing box.

     The dollar amount is looked up under three selectors, each offered to
     ``cond_set`` in turn.
     """
     titles = box.css('a[data-item-number]::attr(title)').extract()
     cond_set(product, 'title', titles)

     price_selectors = (
         '.price-point font::text',
         '.red-message.price-point::text',
         '.price-point::text',
     )
     for selector in price_selectors:
         cond_set(product, 'price', box.css(selector).re(r'\$([\d ,.]+)'))
Beispiel #20
0
 def _get_price(self, response, product):
     """Find the product price on the page and store it as a USD ``Price``.

     CSS selectors are tried first, then a set of XPath fallbacks. A price
     text containing ``$`` is reduced to its number; one without ``$`` that
     contains ``FREE`` or a space becomes ``0.00``; anything else is logged
     as an unrecognized currency.

     :param response: Scrapy's Response obj
     :param product: Scrapy's Item (dict, basically)
     :return: None
     """
     css_selectors = (
         '#priceblock_ourprice ::text'
         ', #unqualifiedBuyBox .a-color-price ::text'
         ', #priceblock_saleprice ::text'
         ', #actualPriceValue ::text'
         ', #buyNewSection .offer-price ::text'
     )
     cond_set(product, 'price', response.css(css_selectors).extract())

     if not product.get('price', None):
         xpath_fallbacks = (
             '//td/b[@class="priceLarge"]/text() |'
             '//span[@class="olp-padding-right"]'
             '/span[@class="a-color-price"]/text() |'
             '//div[contains(@data-reftag,"atv_dp_bb_est_hd_movie")]'
             '/button/text() |'
             '//span[@id="priceblock_saleprice"]/text() |'
             '//li[@class="swatchElement selected"]'
             '//span[@class="a-color-price"]/text() |'
             '//div[contains(@data-reftag,"atv_dp_bb_est_sd_movie")]'
             '/button/text() |'
             '//div[@id="mocaBBRegularPrice"]'
             '/div/text()[normalize-space()]'
         )
         cond_set(product, 'price',
                  response.xpath(xpath_fallbacks).extract())

     raw_price = product.get('price', None)
     if not raw_price:
         return

     if '$' in raw_price:
         # Take the first number-like run and drop commas and spaces.
         numbers = re.findall(r'[\d ,.]+\d', raw_price)
         amount = re.sub('[, ]', '', numbers[0])
         amount = amount.replace('$', '').strip().replace(',', '')
         product['price'] = Price(priceCurrency='USD', price=amount)
     elif 'FREE' in raw_price or ' ' in raw_price:
         product['price'] = Price(priceCurrency='USD', price='0.00')
     else:
         self.log('Currency symbol not recognized: %s' % response.url,
                  level=ERROR)
    def parse_product(self, response):
        """Populate a product from its detail page and return it.

        Sets locale, title, image URL, brand, model, reseller id, stock flag,
        description and price, then fills buyer reviews and related products
        through the sibling helpers.
        """
        product = response.meta['product']
        cond_set_value(product, 'locale', 'en-GB')

        cond_set(product, 'title',
                 response.css('.product-name h1').extract())
        cond_set(product, 'image_url',
                 response.css('#zoom1 img::attr(src)').extract())
        cond_set(product, 'brand',
                 response.css('.box-brand a img::attr(alt)').extract())
        cond_set(product, 'model',
                 response.xpath('//div[@itemprop="name"]/p/text()').extract())
        cond_set(product, 'reseller_id',
                 response.xpath('//*[@class="product-sku"]/text()').extract())

        # Out of stock when the availability box carries exactly that text.
        stock_sel = response.xpath(
            '//span[@id="availability-box" and text()="Out of stock"]')
        cond_set_value(product, 'is_out_of_stock', stock_sel, bool)

        # Description: non-blank nodes of the first content wrapper.
        wrappers = response.css('.tabs-panels .std .content-wrapper')
        if wrappers:
            nodes = wrappers[0].xpath('node()[normalize-space()]')
            cond_set_value(product, 'description', nodes.extract(),
                           u''.join)

        # Price: requires a non-zero amount and a currency.
        price_sel = response.css('[itemprop=price]::attr(content)')
        currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
        if price_sel and float(price_sel[0].extract()) and currency_sel:
            price = Price(price=price_sel[0].extract(),
                          priceCurrency=currency_sel[0].extract())
            cond_set_value(product, 'price', price)

        self._populate_buyer_reviews(response, product)
        self._populate_related_products(response, product)

        return product
Beispiel #22
0
    def _populate_from_html(self, response, product):
        """Populate product fields from the detail page.

        Sets image URL, title, brand, availability flags, price (GBP), model,
        reseller id and description, and strips any ``#fragment`` from the
        product URL.
        """
        cond_set(product, 'image_url',
                 response.css('.largeimage::attr(src)').extract())
        cond_set(product, 'title',
                 response.css('.productname::text').extract())
        cond_set(product, 'brand',
                 response.css('.productbrand [itemprop=name]::text').extract())

        # One boolean per delivery option: is it marked available?
        delivery_opts = [bool(do.css('.available'))
                         for do in response.css('.deliverycallout li')]
        opt_len = sum(delivery_opts)
        if opt_len:
            # The second option decides "in store only". Guard the index so
            # a page with a single option does not raise IndexError.
            second_available = len(delivery_opts) > 1 and delivery_opts[1]
            cond_set_value(product, 'is_in_store_only',
                           second_available and opt_len == 1)
        else:
            # NOTE(review): no available delivery option sets
            # is_out_of_stock to False — confirm this is not inverted.
            cond_set_value(product, 'is_out_of_stock', False)

        cond_set(product, 'price',
                 response.css('[itemprop=price]::text').extract(),
                 unicode.strip)
        cond_set(product, 'model',
                 response.css('[itemprop=model]::text').extract())

        regex = r"\/(\d+)"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        # Pound amount, e.g. "£ 1,234.50". The price may be unset (None) and
        # the fallback lookup may be empty; neither may reach re.findall,
        # which raises TypeError on non-strings.
        pound_amount = u'\xa3 *\d[\d, .]*'
        matches = re.findall(pound_amount, product.get("price") or u'')
        if not matches:
            fallback = response.xpath(
                "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
            ).extract()
            if fallback:
                matches = re.findall(pound_amount, fallback[0].strip())
        if matches:
            price = re.sub(u'[\xa3, ]+', '', matches[0])
            cond_replace_value(product, 'price',
                               Price(priceCurrency='GBP', price=price))

        xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)
        product['url'] = product['url'].rsplit('#', 1)[0]
Beispiel #23
0
    def parse_product_old(self, response):
        """Populate a product from Open Graph metadata and page markup.

        Sets description, title, URL, image URL, locale, price (USD) and
        brand on ``response.meta['product']`` and returns the product.
        """
        prod = response.meta['product']
        # populate_from_open_graph is not available because there is no
        # type=product on the page, so the raw metadata is read instead.
        metadata = _extract_open_graph_metadata(response)
        description = response.xpath(
            '//p[@itemprop="description"]//text()').extract()
        if description:
            cond_set_value(prod, 'description', description[0])
        else:
            cond_set_value(prod, 'description', metadata.get('description'))
        cond_set_value(prod, 'title', metadata.get('title'))
        cond_replace_value(prod, 'url', metadata.get('url'))

        img_url = metadata.get('image')
        if img_url:
            # Remove the thumbnail preset as a suffix. str.rstrip() takes a
            # set of characters, so it could also eat legitimate trailing
            # characters of the URL.
            thumbnail_suffix = '?$browse_thumbnail$'
            if img_url.endswith(thumbnail_suffix):
                img_url = img_url[:-len(thumbnail_suffix)]
            cond_set_value(prod, 'image_url', img_url)

        locale = response.xpath(
            '//meta[@name="gwt:property"]/@content'
        ).re(r'locale=\s*(.*)')
        if locale:
            cond_set_value(prod, 'locale', locale[0])

        price = None
        price_text = response.xpath(
            '//span[@itemprop="price"]'
            '//span[contains(@class,"price-sales")]//text()'
        ).extract()
        if price_text:
            # Drop thousands separators before matching; matching first
            # would cut "1,299.00" down to "1".
            numbers = re.findall(r'[\d\.]+', price_text[0].replace(",", ""))
            if numbers:
                price = numbers[0]
        # in case item use usual price, not sale
        if price:
            prod['price'] = Price(
                priceCurrency='USD',
                price=price
            )

        brand = response.xpath(
            '//meta[@itemprop="brand"]/@content'
        ).extract()
        cond_set(prod, 'brand', brand)
        return prod
Beispiel #24
0
    def _populate_from_html(self, response, product):
        """Populate product fields from the detail page.

        schema.org data is applied first. The page title is then used for
        ``title`` and, when it has the form ``<brand> - <rest>``, for
        ``brand``. Buyer reviews, UPC and model follow, and the price is
        normalized through ``_unify_price``.
        """
        self._populate_from_schemaorg(response, product)

        titles = response.css("#sku-title ::text").extract()
        if titles:
            title = titles[0]
            # Split once and reuse the result. Checking one pattern and
            # unpacking the split of another raised ValueError whenever
            # only the first one matched.
            parts = re.split(r'\s+-\s+', title, 1)
            if len(parts) > 1:
                cond_set(product, 'brand', [parts[0]])
            cond_set(product, 'title', [title])

        cond_set_value(product, 'buyer_reviews',
                       self._get_buyer_reviews(response))
        cond_set(product, 'upc', response.css("#sku-value ::text").extract())
        cond_set(product, 'model',
                 response.css("#model-value ::text").extract())
        self._unify_price(product)
Beispiel #25
0
    def _populate_from_html(self, response, product):
        """Populate title, description, image, brand and price from HTML.

        An empty-string title is removed first so that it can be set again.
        The price is always assigned, falling back to ``0.0`` USD when none
        is found on the page.
        """
        if product.get('title', None) == '' and 'title' in product:
            del product['title']

        name_nodes = response.xpath('//h1[@itemprop="name"]/text()').extract()
        cond_set(product, 'title', name_nodes, conv=string.strip)

        desc_nodes = response.xpath(
            '//div[@itemprop="description"]').extract()
        cond_set(product, 'description', desc_nodes, conv=string.strip)

        image_src = is_empty(
            response.xpath('//div[@id="izView"]/noscript/img/@src').extract())
        if image_src:
            cond_set_value(product, 'image_url', 'http:' + image_src)

        # Brand comes from the JSON blob assigned to jcpPPJSON in a script.
        raw_json = is_empty(
            response.xpath('//script').re(r'jcpPPJSON\s?=\s?({.*});'))
        if raw_json:
            payload = json.loads(raw_json)
            first_lot = is_empty(is_empty(payload['products'])['lots'])
            cond_set_value(product, 'brand',
                           first_lot.get('brandName', None))

        price_nodes = response.xpath(
            '//span[@itemprop="price"]/a/text() |'
            '//span[@itemprop="price"]/text() ')
        amount = is_empty(price_nodes.re(r"\d+.?\d{0,2}"))
        product['price'] = Price(price=amount if amount else '0.0',
                                 priceCurrency='USD')
Beispiel #26
0
    def parse_product(self, response):
        """Populate a product from its detail page and return it.

        Sets a ``LimitedStock`` entry when the limited-stock counter shows a
        number, then applies Open Graph and JS data and forces the locale to
        ``tr-TR``.
        """
        product = response.meta['product']

        counter_xpath = (
            '//span[@id="ctl00_ContentPlaceHolder1_ProductControl1_'
            'MainControl1_ProductMain1_spanLimitedStockCount"]/text()'
        )
        counter_text = response.xpath(counter_xpath).extract()
        digits = re.findall(r"(\d+)", counter_text[0]) if counter_text else []
        if digits:
            stock = LimitedStock(is_limited=True, items_left=int(digits[0]))
            cond_set(product, 'limited_stock', [stock])

        self._populate_from_open_graph(response, product)
        self._populate_from_js(response, product)

        product['locale'] = "tr-TR"
        return product
Beispiel #27
0
    def parse_product(self, response):
        """Populate a product from its detail page and queue the reviews call.

        Fills locale, title, image, SKU, reseller id, price, description,
        related products and variants, then appends a request for the first
        page of buyer reviews.

        Returns the result of ``send_next_request`` when requests are
        pending, otherwise the product.
        """
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())
        # The same list object is shared with the meta of follow-up requests.
        reqs = meta['reqs'] = []

        cond_set_value(product, 'locale', 'en_US')
        cond_set(product, 'title', self.parse_title(response))
        cond_set(product, 'image_url', self.parse_image(response))

        # The SKU doubles as the reseller id.
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)
        cond_set_value(product, "reseller_id", sku)

        cond_set_value(product, 'price', self.parse_price(response))
        cond_set(product, 'description', self.parse_description(response))

        product['related_products'] = self.parse_related_product(response)

        variant_parser = OrientaltradingVariants()
        variant_parser.setupSC(response)
        variants = variant_parser._variants()
        if variants:
            product['variants'] = variants

        # Reviews: slashes in the SKU are replaced by underscores in the URL.
        review_url = self.REVIEW_URL.format(
            product_id=product['sku'].replace('/', '_'), index=0)
        reqs.append(Request(url=review_url,
                            dont_filter=True,
                            callback=self.parse_buyer_reviews,
                            meta=meta))

        if not reqs:
            return product
        return self.send_next_request(reqs, response)
Beispiel #28
0
    def _populate_from_html(self, response, product):
        """Fill title, brand, description, image URL and the in-store-only
        flag of ``product`` from the product page HTML.

        Title and brand are looked up inside the ``product-detail-content``
        block first; when the field is still empty afterwards, itemprop-based
        markup is used as a fallback.
        """
        # Remove an empty-string title before the lookups below run.
        if product.get('title') == '':
            del product['title']

        # Title: primary location, then itemprop fallback.
        primary_titles = response.xpath(
            '//div[@class="product-detail-content"]/h3/text()').extract()
        cond_set(product, 'title', primary_titles, conv=string.strip)
        if not product.get('title', ''):
            fallback_titles = response.xpath(
                '//h1[contains(@itemprop, "name")]//text()').extract()
            if fallback_titles:
                product['title'] = fallback_titles[0].strip()

        # Brand: primary location, then itemprop fallback.
        primary_brands = response.xpath(
            '//div[@class="product-detail-content"]/h5/a/text()').extract()
        cond_set(product, 'brand', primary_brands, conv=string.strip)
        if not product.get('brand', ""):
            fallback_brands = response.xpath(
                '//h2[contains(@itemprop, "brand")]/a/text()').extract()
            if fallback_brands:
                product['brand'] = fallback_brands[0].strip()

        # Description: the raw markup of the catalog content block.
        description_blocks = response.xpath(
            '//div[@class="product-catalog-content"]').extract()
        cond_set(product, 'description', description_blocks,
                 conv=string.strip)

        # Image: the Open Graph image, prefixed with the "http:" scheme.
        og_images = response.xpath(
            "//meta[@property='og:image']/@content").extract()
        if og_images:
            product['image_url'] = 'http:' + og_images[0]

        # In-store only: true when the product badge image source contains
        # "instore".
        instore_badges = response.xpath(
            '//div[@id="productBadge"]/img/@data-blzsrc[contains(.,"instore")]')
        product['is_in_store_only'] = bool(instore_badges)
    def _populate_from_box(self, response, box, product):
        """Fill title, price, image URL and brand of ``product`` from one
        listing ``box`` selector.

        The brand is taken from ``response.meta['brands']``: the first entry
        that the product title starts with, or ``None`` when nothing matches.
        """
        title_texts = box.css('.productInfo h3 a::text').extract()
        cond_set(product, 'title', title_texts, unicode.strip)

        # Two price locations are offered to cond_set in turn.
        unit_prices = box.css('.pricePerUnit::text').extract()
        cond_set(product, 'price', unit_prices, unicode.strip)
        generic_prices = box.css('.pricing [class*=pricePer]').extract()
        cond_set(product, 'price', generic_prices, unicode.strip)

        def _absolute(url):
            # Resolve a possibly relative image path against the page URL.
            return urlparse.urljoin(response.url, url)

        image_sources = box.css('.productInfo h3 a img::attr(src)').extract()
        cond_set(product, 'image_url', image_sources, _absolute)

        # Try to find brand name in a title
        matched_brand = None
        for candidate in response.meta.get('brands', []):
            if product.get('title', '').find(candidate) == 0:
                matched_brand = candidate
                break
        cond_set_value(product, 'brand', matched_brand)
Beispiel #30
0
    def _populate_from_html(self, response, prod):
        """Populate ``prod`` from the product page HTML.

        Fills title, price, out-of-stock flag, image URL, description (merged
        with the details list), brand, reseller id (digits taken from the
        page URL) and related products. Fields whose markup is missing from
        the page are left untouched instead of raising.
        """
        # title
        title = response.css('h2[itemprop=name]::text')
        cond_set(prod, 'title', title.extract())

        # price -- a page without a price element must not raise IndexError,
        # so the first match is only dereferenced when one exists.
        price_nodes = response.css('[itemprop=offers] > [itemprop=price]')
        if price_nodes:
            price_node = price_nodes[0]
            currency = price_node.css(
                '[itemprop=priceCurrency]::attr(content)')
            price = price_node.css('[itemprop=price]::attr(content)')
            if currency and price:
                prod['price'] = Price(currency[0].extract(),
                                      price[0].extract())

        # out of stock
        cond_set_value(prod, 'is_out_of_stock',
                       response.css('.out_of_stock_box'), bool)

        # image
        img = response.css('.vip_gallery [itemprop=image] ::attr(src)')
        cond_set(prod, 'image_url', img.extract())

        # description, merged with details
        desc = response.xpath('//div[@itemprop="description"]/p | '
                              '//ul[@class="linear_list"]')
        cond_set_value(prod, 'description', ''.join(desc.extract()))

        # brand
        brand = response.css('input[name=brand_name] ::attr(value)')
        cond_set(prod, 'brand', brand.extract())

        # reseller_id: the digits between a "-" and a "." in the page URL
        # (raw string so the backslashes reach the regex engine unchanged)
        reseller_ids = re.findall(r"-(\d+)\.", response.url)
        reseller_id = reseller_ids[0] if reseller_ids else None
        cond_set_value(prod, "reseller_id", reseller_id)

        # related products, keyed by the "more by brand" heading text
        rel_key = ' '.join(
            response.xpath('//div[@class="moreby_brand"]'
                           '/a/h2//text()').extract())
        related = []
        for rel_item in response.css('#morefrom_slider > ul > li'):
            r_hr = rel_item.css('a::attr(href)')
            r_t = rel_item.css('a > span::text')
            if not r_hr or not r_t:
                # Entries lacking a link or a caption are skipped.
                continue
            related.append(
                RelatedProduct(r_t[0].extract(), r_hr[0].extract()))
        # Testing the list directly replaces ``dict.values()[0]``, which only
        # works on Python 2 where values() returns a list.
        if related:
            cond_set_value(prod, 'related_products', {rel_key: related})
Beispiel #31
0
    def _populate_from_html(self, response, product):
        """Populate brand, price, description, image URL, title, UPC and
        model of ``product`` from the product page HTML.

        Simple fields are read with CSS selectors and handed to ``cond_set``;
        UPC and model are read from the ``<li>`` entries of the
        ``td.bucket`` details list, where each entry is labelled by a
        ``<b>`` key.
        """
        cond_set(product, 'brand', response.css('#brand ::text').extract())
        cond_set(
            product,
            'price',
            response.css('#priceblock_ourprice ::text').extract(),
        )
        cond_set(
            product,
            'description',
            response.css('.productDescriptionWrapper').extract(),
        )
        cond_set(
            product,
            'image_url',
            response.css(
                '#imgTagWrapperId > img ::attr(data-old-hires)').extract()
        )
        cond_set(
            product, 'title', response.css('#productTitle ::text').extract())

        # Some data is in a list (ul element).
        model = None
        for li in response.css('td.bucket > .content > ul > li'):
            raw_keys = li.xpath('b/text()').extract()
            if not raw_keys:
                # This is something else, ignore.
                continue

            # Normalise the label: drop surrounding spaces/colons, upper-case.
            key = raw_keys[0].strip(' :').upper()
            if key == 'UPC':
                # Some products have several UPCs. The first one is used.
                raw_upc = li.xpath('text()').extract()[0]
                cond_set(
                    product,
                    'upc',
                    raw_upc.strip().split(' '),
                    conv=int
                )
            # ``and`` binds tighter than ``or``: an ASIN entry is only used
            # while no model has been found yet, whereas an
            # ITEM MODEL NUMBER entry always overwrites the current value.
            elif key == 'ASIN' and model is None or key == 'ITEM MODEL NUMBER':
                model = li.xpath('text()').extract()
        # NOTE(review): ``model`` is still None here when neither key was
        # present -- confirm cond_set accepts None as its values argument.
        cond_set(product, 'model', model, conv=string.strip)