Exemple #1
0
 def _populate_from_html(self, response, product):
     """Fill ``product`` from the product page held in ``response``.

     Sets reseller id, title, brand, price, image URL and description
     through the ``cond_*`` helpers and finally delegates to
     ``self._get_model_from_title``.  A textual price containing ``Rp``
     is converted into a ``Price`` in IDR; any other textual price is
     left untouched and a message is logged.
     """
     # The SKU digits embedded in the URL serve as the reseller id.
     sku_matches = re.findall('\/sku(\d+)', response.url)
     cond_set(product, 'reseller_id', sku_matches)

     title_texts = response.css('[itemprop=name]::text').extract()
     cond_set(product, 'title', title_texts)

     brand_texts = response.css('#ctl00_content_lnkBrand::text').extract()
     cond_set(product, 'brand', brand_texts)

     price_texts = response.css('[itemprop=price]::text').extract()
     cond_set(product, 'price', price_texts)

     # Only a raw (textual) price still needs converting.
     raw_price = product.get('price', '')
     if raw_price and not isinstance(raw_price, Price):
         if 'Rp' in raw_price:
             amount = raw_price.lower().replace('rp', '')
             amount = amount.replace(',', '').strip()
             product['price'] = Price(price=amount, priceCurrency='IDR')
         else:
             self.log('Unrecognized currency at %s' % response.url)

     image_sources = response.css('#prodMedia img::attr(src)').extract()
     cond_replace(product, 'image_url', image_sources)

     # Append the first specifications block (if any) to the description.
     spec_blocks = response.css('.spesifications').extract()
     spec_html = spec_blocks[0] if spec_blocks else ''
     cond_replace_value(
         product, 'description',
         product.get('description', '') + spec_html.strip())

     self._get_model_from_title(product)
    def _populate_from_html(self, response, product):
        """Fill ``product`` from the product page held in ``response``.

        Populates title, price, description, image URL (made absolute
        against ``response.url``), reseller id and a guessed brand, then
        calls ``self._unify_price`` and defaults the locale to ``en_GB``.

        :param response: response for the product page.
        :param product: item to populate in place.
        """
        cond_set(product, 'title',
                 response.css('.productSummary h1::text').extract())
        cond_set(product, 'price',
                 response.css('.pricePerUnit::text').extract(), unicode.strip)
        # Second selector is tried as well; cond_set is presumably a
        # set-if-missing helper, so this would act as a fallback -- TODO
        # confirm against the helper's definition.
        cond_set(product, 'price',
                 response.css('.pricing [class*=pricePer]').extract(),
                 unicode.strip)
        xpath = '//*[@id="information"]' \
                '/node()[not(@class="access")][normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)
        cond_replace(
            product, 'image_url',
            response.css('#productImageHolder img::attr(src)').extract(),
            lambda url: urlparse.urljoin(response.url, url))

        reseller_id = response.xpath('.//*[@class="skuCode"]/text()').extract()
        cond_set(product, 'reseller_id', reseller_id, string.strip)

        # The title may not have been scraped at all; indexing the item
        # directly would raise KeyError, so only guess a brand when a
        # title is actually available.
        title = product.get('title')
        if title:
            brand = guess_brand_from_first_words(title, max_words=15)
            cond_set_value(product, 'brand', brand)
        self._unify_price(product)

        if not product.get("locale"):
            product["locale"] = "en_GB"
Exemple #3
0
    def _price_from_html(self, response, product):
        """Read the price from ``response`` and store it as a ``Price``.

        The price is taken from the ``.product-price-bol`` markup or the
        ``offer_price`` meta tag.  The currency comes from the page's
        ``priceCurrency`` attribute, falling back to ``EUR``.  The stored
        value is only wrapped in a ``Price`` when it looks like a plain
        decimal number after replacing commas with dots.
        """
        bol_prices = response.css(
            '.product-price-bol [itemprop=price]::attr(content)').extract()
        cond_replace(product, 'price', bol_prices)

        offer_xpath = (
            "//span[@class='offer_price']/meta[@itemprop='price']/@content")
        cond_set(product, 'price', response.xpath(offer_xpath).extract())

        currency_sel = response.css('[itemprop=priceCurrency]::attr(content)')
        if currency_sel:
            currency = currency_sel.extract()[0]
        else:
            currency = 'EUR'

        # Normalise a decimal comma to a dot before validating the number.
        normalized = product.get('price', '').replace(',', '.')
        if normalized and re.match(' *\d+\.?\d* *\Z', normalized):
            cond_replace_value(
                product, 'price', Price(currency, normalized))
Exemple #4
0
    def _populate_from_html(self, response, product):
        """Fill ``product`` from the item page held in ``response``.

        Covers title, price, marketplace seller, image URL, description,
        canonical URL, brand, model and reseller id.  When no brand ends
        up on the item, the page URL is passed to ``dump_url_to_file``.
        """
        self._populate_hardcoded_fields(product)

        title_texts = response.css('#itemTitle::text').extract()
        cond_set(product, 'title', title_texts)

        price_texts = response.css(
            '[itemprop=price]::text , #mm-saleDscPrc::text').extract()
        cond_set(product, 'price', price_texts, self._unify_price)

        # The seller entry carries whatever price is on the item by now.
        seller_names = response.xpath(
            '//div[@class="mbg"]/a/span/text()').extract()
        if seller_names:
            product["marketplace"] = [{
                "name": seller_names[0].strip(),
                "price": product.get("price", None),
            }]

        image_sources = response.css('[itemprop=image]::attr(src)').extract()
        cond_replace(product, 'image_url', image_sources)

        description_nodes = response.xpath(
            '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]').extract()
        cond_set_value(product, 'description', description_nodes, ''.join)

        canonical_links = response.css('[rel=canonical]::attr(href)').extract()
        cond_replace(product, 'url', canonical_links)

        # Brand and model sit in the same attribute table and differ
        # only in the label text.
        attribute_xpath = (
            '//td[@class="attrLabels" and contains(text(), "%s")]'
            '/following-sibling::td/span/text()')

        brand_texts = response.xpath(attribute_xpath % 'Brand:').extract()
        cond_set(product, 'brand', brand_texts)
        if not product.get('brand', None):
            dump_url_to_file(response.url)

        model_texts = response.xpath(attribute_xpath % 'Model:').extract()
        cond_set(product, 'model', model_texts)

        id_matches = re.findall("-\/([^\/&?\.\s]+)", response.url)
        cond_set_value(
            product, 'reseller_id', id_matches[0] if id_matches else None)
    def parse_product(self, response):
        """Build the product item for a single product page.

        The item is taken from ``response.meta['product']`` and filled
        with Open Graph data, title, brand, image URL, description, SKU /
        reseller id, price (GBP), related products, locale and the size
        variants listed in the ``SizeKey`` drop-down.

        :param response: response for the product page; must carry the
            item under ``meta['product']``.
        :returns: the populated product item.
        """
        product = response.meta['product']

        populate_from_open_graph(response, product)

        cond_set(
            product,
            'title',
            response.xpath('//meta[@property="og:title"]/@content').extract(),
            conv=string.strip)

        if not product.get('brand', None):
            # The title can be missing; calling .strip() on None would
            # raise AttributeError, so only guess when a title exists.
            title = product.get('title', None)
            if title:
                brand = guess_brand_from_first_words(title.strip())
                if brand:
                    product['brand'] = brand

        cond_replace(
            product,
            'image_url',
            response.css(
                ".main-image-container>img::attr(src)"
            ).extract(),
            lambda url: urlparse.urljoin(response.url, url)
        )

        prod_description = response.css(
            ".product-details-container .description-copy p::text"
        )
        cond_set_value(product, 'description', "\n".join(
            x.strip() for x in prod_description.extract() if x.strip()))

        # The SKU is the first run of digits in the caption text; it is
        # reused as the reseller id.
        sku_text = response.css(
            ".product-details-container .caption-2::text").extract()
        sku = re.findall(r'\d+', sku_text[0]) if sku_text else None
        cond_set(product, 'sku', sku, string.strip)
        cond_set(product, 'reseller_id', sku, string.strip)

        # Prefer the sale price; otherwise use the regular price span.
        sale_price = response.css(
            ".product-details-container .right-side .price .sale::text"
        ).extract()
        if sale_price:
            price = sale_price
        else:
            price = response.css(
                ".product-details-container .right-side .price span::text"
            ).extract()

        cond_set(
            product,
            'price', price,
            conv=string.strip,
        )
        # Convert only a textual price: the key may be unset or may
        # already hold a Price, in which case .replace() would fail.
        raw_price = product.get('price')
        if price and raw_price and not isinstance(raw_price, Price):
            product['price'] = Price(
                price=raw_price.replace(u'\xa3', '').strip(),
                priceCurrency='GBP')

        related_products = self._parse_related(response)
        cond_set_value(product, 'related_products', related_products)

        cond_set_value(product, 'locale', 'en-GB')

        size_options = response.xpath(
            '//select[@id="SizeKey"]/option/text()').extract()

        # Every variant shares the page price; there may be none at all,
        # so avoid indexing an empty list.
        variant_price = (
            price[0].replace(u'\xa3', '').strip() if price else None)

        variant_list = []
        # The first <option> is skipped (presumably a placeholder entry).
        for option in size_options[1:]:
            label = option.replace('Size ', '').strip()
            variant_list.append({
                'price': variant_price,
                # Key was misspelled 'in_sock' before.
                'in_stock': 'out of stock' not in label,
                'properties': {
                    'size': label.replace(' (out of stock)', ''),
                },
                'selected': False,
            })

        product['variants'] = variant_list

        return product
Exemple #6
0
    def _populate_from_html(self, response, product):
        """Populate the product item from a product page.

        The item is re-read from ``response.meta`` (a fresh
        ``SiteProductItem`` when absent), so the ``product`` argument is
        rebound immediately.  Fills reseller id, variants / UPC, price
        (USD), stock flag, image URL, title, description, locale, brand,
        buyer reviews and related products.  Returns early, after
        flagging ``no_longer_available``, when the page says the product
        is unavailable.

        NOTE(review): reviews and related products are fetched with
        blocking ``requests`` calls that set no timeout; confirm this is
        acceptable for the crawler.

        @returns items 1 1
        @scrapes title description locale
        """

        product = response.meta.get('product', SiteProductItem())
        product['reseller_id'] = self._parse_reseller_id(response.url)

        page_text = response.body_as_unicode().lower()
        if u'>this product is currently unavailable' in page_text:
            product['no_longer_available'] = True
            return

        mv = MacysVariants()
        mv.setupSC(response)
        product['variants'] = mv._variants()
        variants = product.get('variants')
        # One-variation product: reuse that variant's UPC.
        if variants and len(variants) == 1:
            product['upc'] = variants[0]['upc']

        # Price: try progressively looser selectors until one matches.
        if response.xpath('//li[@id="memberItemsTab"]').extract():
            price = response.xpath(
                "//div[@id='memberProductList']/div[1]/"
                "div[@class='productPriceSection']/div/span[last()]/text()"
            ).re(FLOATING_POINT_RGEX)
        else:
            price = response.xpath("//div[@id='priceInfo']/div/span/text()"
                                   ).re(FLOATING_POINT_RGEX)
        if response.css('.priceSale::text'):
            price = response.css('.priceSale::text').re(FLOATING_POINT_RGEX)
        if not price:
            price = response.xpath('//*[contains(@id, "priceInfo")]').re(
                FLOATING_POINT_RGEX)
        if not price:
            price = response.xpath(
                '//*[contains(@class, "singlePrice")][contains(text(), "$")]'
            ).re(FLOATING_POINT_RGEX)

        if not price:
            # TODO Move to another method, populate_from_json
            json_product_data = response.xpath(
                './/script[@id="productMainData"]/text()').extract()
            json_product_data = json.loads(
                json_product_data[0]) if json_product_data else None
            if json_product_data:
                # A missing salePrice must not become [None], which is
                # truthy and would be passed on to Price.
                sale_price = json_product_data.get('salePrice')
                if sale_price is not None:
                    price = [sale_price]
                in_stock = json_product_data.get('inStock', None)
                if in_stock is not None:
                    # Accept both the string "true" and a JSON boolean.
                    product['is_out_of_stock'] = (
                        in_stock not in (True, "true"))
        if price:
            product['price'] = Price(price=price[0], priceCurrency='USD')

        # Image: skip inline "data:image" placeholders.
        if not product.get("image_url") or \
                "data:image" in product.get("image_url"):
            image_url = response.xpath(
                "//img[contains(@id, 'mainView')]/@src").extract()
            if image_url:
                product["image_url"] = image_url[0]
        if not product.get('image_url'):
            cond_set(
                product, 'image_url',
                response.xpath('//*[contains(@class,'
                               ' "productImageSection")]//img/@src').extract())
        if not product.get('image_url'):
            cond_set(
                product, 'image_url',
                response.xpath(
                    '//*[contains(@class, "mainImages")]'
                    '//*[contains(@class, "imageItem")]//img/@src').extract())
        if not product.get("image_url") or \
                "data:image" in product.get("image_url"):
            img_src = response.xpath(
                '//*[contains(@class, "imageItem") '
                'and contains(@class, "selected")]/img/@src').extract()
            if img_src:
                product['image_url'] = img_src[0]

        title = response.css('#productTitle::text').extract()
        if not title:
            title = response.xpath(
                '//*[contains(@class, "productTitle")]'
                '[contains(@itemprop, "name")]/text()').extract()
        if title:
            cond_replace(product, 'title', [''.join(title).strip()])
        if not product.get('title', None):
            title = response.xpath(
                '//h1[contains(@class,"productName")]//text()').extract()
            if title:
                product['title'] = title[0].strip()

        path = '//*[@id="memberProductDetails"]/node()[normalize-space()]'
        desc = response.xpath(path).extract()
        if not desc:
            desc = response.xpath(
                '//*[@id="productDetails"]/node()[normalize-space()]'
            ).extract()
            if desc:
                # Drop the advertisement pool node from the description.
                desc = [d for d in desc if 'id="adPool"' not in d]
        cond_set_value(product, 'description', desc, ''.join)
        if not product.get('description', ''):
            product['description'] = (' '.join(
                response.css('#product-detail-control ::text').extract()))

        if not product.get('description', ''):
            desc = response.xpath(
                ".//*[@id='longDescription']/text()").extract()
            product['description'] = desc[0] if desc else ''

        locale = response.css('#headerCountryFlag::attr(title)').extract()
        if not locale:
            locale = response.xpath(
                '//meta[@property="og:locale"]/@content').extract()
        cond_set(product, 'locale', locale)

        brand = response.css('#brandLogo img::attr(alt)').extract()
        if not brand:
            brand = response.xpath(
                './/*[@class="productTitle"]/a[@class="brandNameLink"]/text()'
            ).extract()
        # Guess from the title only when one was found; indexing the
        # item directly would raise KeyError otherwise.
        if not brand and product.get('title'):
            brand = [guess_brand_from_first_words(
                product['title'].replace(u'®', ''))]
        cond_set(product, 'brand', brand)

        if (product.get('brand') or '').lower() == 'levis':
            product['brand'] = "Levi's"

        product_id = response.css('#productId::attr(value)').extract()
        if not product_id:
            product_id = response.xpath(
                '//*[contains(@class,"productID")]'
                '[contains(text(), "Web ID:")]/text()').extract()
            if product_id:
                # Keep only the digits of the "Web ID: ..." text.
                product_id = [
                    ''.join([c for c in product_id[0] if c.isdigit()])
                ]

        if product_id:  # Reviews
            url = ("http://macys.ugc.bazaarvoice.com/7129aa/%s"
                   "/reviews.djs?format=embeddedhtml") % (product_id[0],)

            reviews_response = requests.get(url)
            materials = re.findall(
                "var materials=(.*)", reviews_response.text)
            if materials:
                # Drop the last character before parsing (presumably the
                # ';' that terminates the JavaScript statement).
                review_data = json.loads(materials[0][0:-1])
                hxs = HtmlXPathSelector(text=review_data["BVRRSourceID"])

                num_of_reviews = hxs.xpath(
                    '//div[@id="BVRRQuickTakeSummaryID"]'
                    '/div/div/div/div/div/div/div/div/span'
                    '/span[contains(@class, "BVRRNumber")]/text()').extract()
                if num_of_reviews:
                    num_of_reviews = int(num_of_reviews[0].replace(',', ''))
                    histogram = hxs.xpath(
                        '//div/span[@class="BVRRHistAbsLabel"]/text()'
                    ).extract()
                    # The labels are reversed so that index 0 maps to
                    # the 1-star bucket (presumably the page lists them
                    # from 5 stars down).
                    star_counts = [
                        label.replace(',', '')
                        for label in reversed(histogram)][:5]
                    total = sum(int(c) for c in star_counts)
                    # Require a full five-bucket histogram with at least
                    # one vote; otherwise the zero-reviews default below
                    # applies instead of an IndexError or
                    # ZeroDivisionError.
                    if len(star_counts) == 5 and total:
                        rating_by_star = {}
                        weighted_sum = 0
                        for star, raw_count in enumerate(star_counts, 1):
                            rating_by_star[star] = raw_count
                            weighted_sum += star * int(raw_count)
                        average_rating = round(
                            float(weighted_sum) / float(total), 2)

                        br = BuyerReviews(num_of_reviews, average_rating,
                                          rating_by_star)

                        cond_set_value(product, 'buyer_reviews', br)
        cond_set_value(product, 'buyer_reviews', ZERO_REVIEWS_VALUE)

        # Related Products
        if product_id:
            aj_url = "http://www1.macys.com/sdp/rto/request/recommendations"
            headers = {
                'Content-type': 'application/x-www-form-urlencoded',
            }
            aj_body = {
                'productId': product_id[0],
                'visitorId': '0',
                'requester': 'MCOM-NAVAPP',
                'context': 'PDP_ZONE_A'
            }

            recommendations_response = requests.post(
                aj_url,
                data=urllib.urlencode(aj_body),
                headers=headers)
            recommendations = json.loads(recommendations_response.text)

            rp = []
            for el in recommendations.get('recommendedItems') or []:
                rel_url, rel_title = "", ""
                link = ("http://www1.macys.com/shop/catalog/"
                        "product/newthumbnail/json?"
                        "productId=%s&source=118") % (el["productId"],)

                thumbnail = json.loads(requests.get(link).text)

                # Best effort: an item lacking either field keeps its
                # empty default.
                try:
                    rel_title = \
                        thumbnail["productThumbnail"]["productDescription"]
                    rel_url = "http://www1.macys.com/" + \
                        thumbnail["productThumbnail"]["semanticURL"]
                except (KeyError, TypeError):
                    pass

                if rel_title or rel_url:
                    rp.append(RelatedProduct(rel_title, rel_url))
            if rp:
                recomm = {'Customers Also Shopped': rp}
                product["related_products"] = recomm
Exemple #7
0
    def _populate_from_html(self, response, product):
        """Populate the product item from a product page.

        The item is re-read from ``response.meta`` (a fresh
        ``SiteProductItem`` when absent), so the ``product`` argument is
        rebound immediately.  Fills variants, price (USD), image URL,
        title, description, locale and brand, delegates reviews to
        ``self._parse_reviews`` and finally fetches related products.

        NOTE(review): related products are fetched with blocking
        ``requests`` calls that set no timeout; confirm this is
        acceptable for the crawler.

        @returns items 1 1
        @scrapes title description locale
        """
        product = response.meta.get('product', SiteProductItem())

        mv = MacysVariants()
        mv.setupSC(response)
        product['variants'] = mv._variants()

        # Price: try progressively looser selectors until one matches.
        if response.xpath('//li[@id="memberItemsTab"]').extract():
            price = response.xpath(
                "//div[@id='memberProductList']/div[1]/"
                "div[@class='productPriceSection']/div/span[last()]/text()"
            ).re(FLOATING_POINT_RGEX)
        else:
            price = response.xpath(
                "//div[@id='priceInfo']/div/span/text()"
            ).re(FLOATING_POINT_RGEX)
        if response.css('.priceSale::text'):
            price = response.css('.priceSale::text').re(FLOATING_POINT_RGEX)
        if not price:
            price = [p.strip() for p in
                     response.xpath('//*[@id="priceInfo"]//text()').extract()
                     if p.strip()]
        if not price:
            price = response.xpath(
                '//*[contains(@id, "priceInfo")]').re(FLOATING_POINT_RGEX)
        if not price:
            # Extract the number as the other fallbacks do; without
            # .re() a Selector object would be handed to Price.
            price = response.xpath(
                '//*[contains(@class, "singlePrice")][contains(text(), "$")]'
            ).re(FLOATING_POINT_RGEX)
        if price:
            product['price'] = Price(price=price[0],
                                     priceCurrency='USD')

        # Image: skip inline "data:image" placeholders.
        if not product.get("image_url") or \
                "data:image" in product.get("image_url"):
            image_url = response.xpath(
                "//img[contains(@id, 'mainView')]/@src").extract()
            if image_url:
                product["image_url"] = image_url[0]
        if not product.get('image_url'):
            cond_set(
                product, 'image_url',
                response.xpath('//*[contains(@class,'
                               ' "productImageSection")]//img/@src').extract()
            )
        if not product.get('image_url'):
            cond_set(
                product, 'image_url',
                response.xpath(
                    '//*[contains(@class, "mainImages")]'
                    '//*[contains(@class, "imageItem")]//img/@src').extract()
            )
        if not product.get("image_url") or \
                "data:image" in product.get("image_url"):
            img_src = response.xpath(
                '//*[contains(@class, "imageItem") '
                'and contains(@class, "selected")]/img/@src').extract()
            if img_src:
                product['image_url'] = img_src[0]

        title = response.css('#productTitle::text').extract()
        if not title:
            title = response.xpath(
                '//*[contains(@class, "productTitle")]'
                '[contains(@itemprop, "name")]/text()').extract()
            title = title[0].strip() if title else ''
        # Use the <h1> only as a last resort.  Previously it replaced a
        # title already found above (possibly with '') whenever the item
        # had no title yet.
        if not title and not product.get('title', None):
            title = response.xpath(
                '//h1[contains(@class,"productName")]//text()').extract()
            title = title[0].strip() if title else ''

        if title:
            # ''.join works for both the list and the string form.
            cond_replace(product, 'title', [''.join(title).strip()])

        path = '//*[@id="memberProductDetails"]/node()[normalize-space()]'
        desc = response.xpath(path).extract()
        if not desc:
            desc = response.xpath(
                '//*[@id="productDetails"]/node()[normalize-space()]'
            ).extract()
            if desc:
                # Drop the advertisement pool node from the description.
                desc = [d for d in desc if 'id="adPool"' not in d]
        cond_set_value(product, 'description',
                       desc, ''.join)

        locale = response.css('#headerCountryFlag::attr(title)').extract()
        if not locale:
            locale = response.xpath(
                '//meta[@property="og:locale"]/@content'
            ).extract()
        cond_set(product, 'locale', locale)

        brand = response.css('#brandLogo img::attr(alt)').extract()
        if not brand:
            brand = response.xpath(
                './/*[@class="productTitle"]/a[@class="brandNameLink"]/text()'
            ).extract()
        # Guess from the title only when one was found; indexing the
        # item directly would raise KeyError otherwise.
        if not brand and product.get('title'):
            brand = [guess_brand_from_first_words(
                product['title'].replace(u'®', ''))]
        cond_set(product, 'brand', brand)

        if (product.get('brand') or '').lower() == 'levis':
            product['brand'] = "Levi's"

        product_id = response.css('#productId::attr(value)').extract()

        self._parse_reviews(response, product)

        # Related Products
        if product_id:
            aj_url = "http://www1.macys.com/sdp/rto/request/recommendations"
            headers = {
                'Content-type': 'application/x-www-form-urlencoded',
            }
            aj_body = {
                'productId': product_id[0],
                'visitorId': '0',
                'requester': 'MCOM-NAVAPP',
                'context': 'PDP_ZONE_A'
            }

            recommendations_response = requests.post(
                aj_url,
                data=urllib.urlencode(aj_body),
                headers=headers
            )
            recommendations = json.loads(recommendations_response.text)

            rp = []
            for el in recommendations.get('recommendedItems') or []:
                rel_url, rel_title = "", ""
                link = ("http://www1.macys.com/shop/catalog/"
                        "product/newthumbnail/json?"
                        "productId=%s&source=118") % (el["productId"],)

                thumbnail = json.loads(requests.get(link).text)

                # Best effort: an item lacking either field keeps its
                # empty default.
                try:
                    rel_title = \
                        thumbnail["productThumbnail"]["productDescription"]
                    rel_url = "http://www1.macys.com/" + \
                        thumbnail["productThumbnail"]["semanticURL"]
                except (KeyError, TypeError):
                    pass

                if rel_title or rel_url:
                    rp.append(RelatedProduct(rel_title, rel_url))
            if rp:
                recomm = {'Customers Also Shopped': rp}
                product["related_products"] = recomm