Ejemplo n.º 1
0
 def _populate_from_js(self, response, product):
     """Set ``image_url`` from the landing image's dynamic-image JSON.

     The ``data-a-dynamic-image`` attribute of ``#landingImage`` is parsed
     as a JSON object mapping each image URL to a size sequence
     (presumably ``[width, height]`` -- TODO confirm).  The URL whose first
     size component is largest is handed to ``cond_set_value``.

     Nothing is set when the attribute is absent or the mapping is empty.

     :param response: page response exposing ``css()``.
     :param product: item to populate.
     """
     # Images are not always on the same spot...
     img_jsons = response.css(
         '#landingImage ::attr(data-a-dynamic-image)').extract()
     if not img_jsons:
         return

     img_data = json.loads(img_jsons[0])
     if not img_data:
         # max() raises ValueError on an empty sequence.
         return

     # Plain indexing instead of tuple-unpacking lambda parameters, which
     # were removed from the language by PEP 3113.
     largest = max(img_data.items(), key=lambda item: item[1][0])
     cond_set_value(
         product,
         'image_url',
         largest,
         conv=lambda item: item[0])
Ejemplo n.º 2
0
    def parse_product(self, response):
        """Populate the product from ``response`` or deal with a captcha page.

        Returns the populated product when the page has no captcha, ``None``
        once ``self.captcha_retries`` attempts have been used up, and
        otherwise whatever ``self._handle_captcha`` returns.
        """
        item = response.meta['product']

        if self._has_captcha(response):
            attempts = response.meta.get('captch_solve_try', 0)
            if attempts >= self.captcha_retries:
                self.log("Giving up on trying to solve the captcha challenge "
                         "after %s tries for: %s"
                         % (self.captcha_retries, item['url']),
                         level=WARNING)
                return None
            # Still within the retry budget: try to solve the captcha.
            return self._handle_captcha(response, self.parse_product)

        self._populate_from_js(response, item)
        self._populate_from_html(response, item)
        cond_set_value(item, 'locale', 'en-US')  # Default locale.
        return item
Ejemplo n.º 3
0
    def _scrape_product_links(self, response):
        """Yield ``(None, product)`` pairs for the products on a listing page.

        Product data is read from ``tesco.productData.push({...})`` calls in
        inline scripts and paired positionally with the product links found
        in the markup.  An error is logged when either source is empty.

        :raises AssertionError: when a product's data has no ``name`` (or a
            ``KeyError`` is raised while setting brand/title).
        """
        # To populate the description, fetching the product page is necessary.
        page_url = response.url

        # This will contain everything except for the URL and description.
        inline_scripts = response.xpath(
            "//script[@type='text/javascript']/text()")
        raw_products = inline_scripts.re(
            r"\s*tesco\.productData\.push\((\{.+?\})\);")
        if not raw_products:
            self.log("Found no product data on: %s" % page_url, ERROR)

        hrefs = response.css(
            ".product > .desc > h2 > a ::attr('href')").extract()
        if not hrefs:
            self.log("Found no product links on: %s" % page_url, ERROR)

        for raw_product, href in zip(raw_products, hrefs):
            item = SiteProductItem()
            cond_set_value(item, 'url', urlparse.urljoin(page_url, href))

            data = json.loads(raw_product)
            cond_set_value(item, 'price', data.get('price'))
            cond_set_value(item, 'image_url', data.get('mediumImage'))

            try:
                brand, title = self.brand_from_title(data['name'])
                cond_set_value(item, 'brand', brand)
                cond_set_value(item, 'title', title)
            except KeyError:
                raise AssertionError(
                    "Did not find title or brand from JS for product: %s"
                    % href
                )

            yield None, item
Ejemplo n.º 4
0
    def _populate_from_html(self, response, product):
        """Populate title, description, image, brand and price from the page.

        ``product['price']`` is always assigned: the scraped amount in USD, or
        ``'0.0'`` when no price text is found.

        NOTE(review): ``is_empty`` is defined elsewhere; this code assumes it
        returns the first element of a sequence, or a falsy default when the
        sequence is empty -- confirm.

        :param response: page response exposing ``xpath()``.
        :param product: item to populate.
        """
        # Drop an empty-string title before the conditional set below.
        if 'title' in product and product['title'] == '':
            del product['title']
        cond_set(product,
                 'title',
                 response.xpath('//h1[@itemprop="name"]/text()').extract(),
                 conv=string.strip)

        cond_set(product,
                 'description',
                 response.xpath('//div[@itemprop="description"]').extract(),
                 conv=string.strip)

        image_url = is_empty(
            response.xpath('//div[@id="izView"]/noscript/img/@src').extract())

        if image_url:
            # Only scheme-less sources need a scheme; avoid producing
            # "http:http://..." for URLs that are already absolute.
            if not image_url.startswith('http'):
                image_url = 'http:' + image_url
            cond_set_value(product, 'image_url', image_url)

        json_data = is_empty(
            response.xpath('//script').re(r'jcpPPJSON\s?=\s?({.*});'))

        if json_data:
            data = json.loads(json_data)
            # Walk products[0] -> lots[0] defensively: a missing or empty
            # level yields no brand instead of a KeyError/TypeError.
            first_product = is_empty(data.get('products') or []) or {}
            first_lot = is_empty(first_product.get('lots') or []) or {}
            cond_set_value(product, 'brand', first_lot.get('brandName', None))

        # The decimal point is escaped so that only a literal "." separates
        # the integer part from the cents.
        price = is_empty(
            response.xpath(
                '//span[@itemprop="price"]/a/text() |'
                '//span[@itemprop="price"]/text() ').re(r"\d+\.?\d{0,2}"))

        if price:
            product['price'] = Price(price=price, priceCurrency='USD')
        else:
            product['price'] = Price(price='0.0', priceCurrency='USD')
Ejemplo n.º 5
0
    def parse_product_mobile(self, response):
        """Populate and return the product from a mobile product page.

        Fills url, reseller_id (taken from the ``id=`` part of the URL),
        locale, title, brand, image_url, price (GBP) and description.

        :raises AssertionError: when no title was set, so no brand can be
            derived from it.
        """
        item = response.meta['product']
        item['url'] = response.url

        id_matches = re.findall(r"id=([A-Z0-9\-]+)", item.get('url', ''))
        cond_set_value(item, "reseller_id",
                       id_matches[0] if id_matches else None)

        cond_set(item, 'locale', ['en-GB'])

        cond_set(item, 'title', response.xpath(
            '//div[contains(@class,"descriptionDetails")]//h1//span[@data-title="true"]//text()'
        ).extract())

        try:
            # Only the brand half of the split is used here.
            brand, _ = self.brand_from_title(item['title'])
            cond_set_value(item, 'brand', brand)
        except KeyError:
            raise AssertionError(
                "Did not find title or brand from JS for product: %s" %
                response.url)

        cond_set(item, 'image_url',
                 response.xpath('//*[@id="pdp_image"]/img/@src').extract())

        amounts = response.xpath('//div[contains(@class,"main_price")]'
                                 '/text()').re(FLOATING_POINT_RGEX)
        if amounts:
            item['price'] = Price(price=amounts[0], priceCurrency='GBP')

        cond_set(item, "description", response.xpath(
            'string(//p[@class="descriptionText"])').extract())
        return item
Ejemplo n.º 6
0
    def _populate_from_html(self, response, product):
        """Fill the product item from the item page's HTML.

        Covers title, price, marketplace (seller), image_url, description,
        canonical url, brand, model and reseller_id.  The page URL is passed
        to ``dump_url_to_file`` when no brand could be determined.
        """
        self._populate_hardcoded_fields(product)
        cond_set(product, 'title', response.css('#itemTitle::text').extract())

        price_texts = response.css('[itemprop=price]::text , '
                                   '#mm-saleDscPrc::text').extract()
        cond_set(product, 'price', price_texts, self._unify_price)

        seller_names = response.xpath(
            '//div[@class="mbg"]/a/span/text()').extract()
        if seller_names:
            # The marketplace entry reuses the price set just above.
            product["marketplace"] = [{
                "name": seller_names[0].strip(),
                "price": product.get("price", None)
            }]

        cond_replace(product, 'image_url',
                     response.css('[itemprop=image]::attr(src)').extract())

        description_xpath = '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(description_xpath).extract(), ''.join)

        cond_replace(product, 'url',
                     response.css('[rel=canonical]::attr(href)').extract())

        brand_xpath = '//td[@class="attrLabels" and contains(text(), "Brand:")]' \
                      '/following-sibling::td/span/text()'
        cond_set(product, 'brand', response.xpath(brand_xpath).extract())
        if not product.get('brand', None):
            dump_url_to_file(response.url)

        model_xpath = '//td[@class="attrLabels" and contains(text(), "Model:")]' \
                      '/following-sibling::td/span/text()'
        cond_set(product, 'model', response.xpath(model_xpath).extract())

        id_matches = re.findall(r"-\/([^\/&?\.\s]+)", response.url)
        cond_set_value(product, 'reseller_id',
                       id_matches[0] if id_matches else None)
Ejemplo n.º 7
0
 def populate_bestseller_rank(self, product, response):
     """Collect best-seller ranks per category and derive the department.

     Each ``.zg_hrsr_item`` contributes one ``category -> rank`` entry; the
     category is its ladder links joined with ``' > '``.  The primary rank
     from ``#SalesRank`` is merged in when present.  The result is stored
     under ``'category'`` as a list of ``{'category', 'rank'}`` dicts and
     passed to ``amazon_parse_department`` to fill ``'department'`` and
     ``'bestseller_rank'``.

     Items without a parsable rank are skipped instead of raising.

     :param product: item to populate.
     :param response: page response exposing ``css()``.
     """
     ranks = {}
     for itm in response.css('.zg_hrsr_item'):
         # Require a leading digit so a run of only spaces/commas is never
         # picked up as the "number" (int('') would raise ValueError).
         rank_text = itm.css('.zg_hrsr_rank::text').re(r'(\d[\d, ]*)')
         if not rank_text:
             continue
         category = ' > '.join(
             text.strip()
             for text in itm.css('.zg_hrsr_ladder a::text').extract())
         ranks[category] = int(re.sub('[ ,]', '', rank_text[0]))

     prim = response.css('#SalesRank::text, #SalesRank .value'
                         '::text').re(r'#([\d ,]+) .*in (.+)\(')
     if prim:
         # prim holds the captured groups: [rank, category, ...].
         digits = re.sub('[ ,]', '', prim[0])
         if digits:
             ranks[prim[1].strip()] = int(digits)

     ranks = [{'category': k, 'rank': v} for k, v in ranks.items()]
     cond_set_value(product, 'category', ranks)
     # parse department
     department = amazon_parse_department(ranks)
     if department is None:
         product['department'] = None
     else:
         # list() keeps this working where items() returns a view.
         product['department'], product['bestseller_rank'] \
             = list(department.items())[0]
Ejemplo n.º 8
0
 def parse_product(self, response):
     """Populate url, locale, HTML-derived fields and variants; return the item."""
     item = response.meta['product']
     for field, value in (('url', response.url), ('locale', 'en-IN')):
         cond_set_value(item, field, value)
     self._populate_from_html(response, item)

     # The variants helper must be primed with the response before use.
     variant_parser = PepperfryVariants()
     variant_parser.setupSC(response)
     cond_set_value(item, 'variants', variant_parser._variants())
     return item
Ejemplo n.º 9
0
    def parse_product(self, response):
        """Populate the product from a product page and return it.

        Sets locale, title, image_url, brand, model, reseller_id,
        is_out_of_stock, description and price, then delegates buyer reviews
        and related products to their dedicated helpers.
        """
        product = response.meta['product']
        cond_set_value(product, 'locale', 'en-GB')

        css_fields = (
            ('title', '.product-name h1'),
            ('image_url', '#zoom1 img::attr(src)'),
            ('brand', '.box-brand a img::attr(alt)'),
        )
        for field, query in css_fields:
            cond_set(product, field, response.css(query).extract())

        xpath_fields = (
            ('model', '//div[@itemprop="name"]/p/text()'),
            ('reseller_id', '//*[@class="product-sku"]/text()'),
        )
        for field, query in xpath_fields:
            cond_set(product, field, response.xpath(query).extract())

        # Is_out_of_stock: true when the availability box says so.
        out_of_stock_nodes = response.xpath(
            '//span[@id="availability-box" and text()="Out of stock"]')
        cond_set_value(product, 'is_out_of_stock', out_of_stock_nodes, bool)

        # Description: non-blank child nodes of the first content wrapper.
        wrappers = response.css('.tabs-panels .std .content-wrapper')
        if wrappers:
            nodes = wrappers[0].xpath('node()[normalize-space()]')
            cond_set_value(product, 'description', nodes.extract(),
                           u''.join)

        # Price: only set for a non-zero amount with a known currency.
        amount = response.css('[itemprop=price]::attr(content)')
        currency = response.css('[itemprop=priceCurrency]::attr(content)')
        if amount and float(amount[0].extract()) and currency:
            cond_set_value(
                product, 'price',
                Price(price=amount[0].extract(),
                      priceCurrency=currency[0].extract()))

        self._populate_buyer_reviews(response, product)
        self._populate_related_products(response, product)

        return product
Ejemplo n.º 10
0
    def _populate_from_html(self, response, product):
        """Populate the product item from the product page's HTML.

        Sets image_url, title, brand, delivery flags, price (GBP), model,
        reseller_id and description, and strips any ``#fragment`` from
        ``product['url']``.

        The price is taken from ``[itemprop=price]`` when it looks like a
        pound amount, otherwise from the last entry of the pricing list.  A
        missing price no longer raises ``TypeError``.

        :param response: page response exposing ``css()``/``xpath()``/``url``.
        :param product: item to populate; must already contain ``'url'``.
        """
        cond_set(product, 'image_url',
                 response.css('.largeimage::attr(src)').extract())
        cond_set(product, 'title',
                 response.css('.productname::text').extract())
        cond_set(product, 'brand',
                 response.css('.productbrand [itemprop=name]::text').extract())

        delivery_opts = [bool(opt.css('.available'))
                         for opt in response.css('.deliverycallout li')]
        available_count = delivery_opts.count(True)
        if available_count:
            # The second entry is used as the in-store option (presumably --
            # confirm against the page); guard against a one-entry list.
            in_store = len(delivery_opts) > 1 and delivery_opts[1]
            cond_set_value(product, 'is_in_store_only',
                           in_store and available_count == 1)
        else:
            # NOTE(review): with no available delivery option the item is
            # flagged as *not* out of stock -- kept as-is, confirm intent.
            cond_set_value(product, 'is_out_of_stock', False)

        cond_set(product, 'price',
                 response.css('[itemprop=price]::text').extract(),
                 unicode.strip)
        cond_set(product, 'model',
                 response.css('[itemprop=model]::text').extract())

        reseller_id = re.findall(r"\/(\d+)", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        pound_amount = u'\xa3 *\\d[\\d, .]*'
        # An absent price is treated as empty text rather than None.
        price = product.get("price") or u''
        if not re.findall(pound_amount, price):
            fallback = response.xpath(
                "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
            ).extract()
            price = fallback[0].strip() if fallback else u''
        price = re.findall(pound_amount, price)
        if price:
            price = re.sub(u'[\xa3, ]+', '', price[0])
            cond_replace_value(product, 'price',
                               Price(priceCurrency='GBP', price=price))

        xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)
        product['url'] = product['url'].rsplit('#', 1)[0]
Ejemplo n.º 11
0
    def _tires_parse_product(self, response):
        """Populate and return the product from a tires product page.

        Fills title, brand, upc, image_url, price (USD), description and
        locale.  A product id that is not numeric after removing ``'P'`` is
        logged as an error and ``upc`` is left unset.
        """
        item = response.meta['product']

        raw_titles = response.xpath(
            "//div[@class='productContent']/h1"
            "/div[@id='productName']/text()").extract()
        cond_set(item, 'title', map(string.strip, raw_titles))

        brand_matches = response.xpath(
            "//script[contains(text(),'dim7')]"
            "/text()").re(r'.*"dim7":"([^"]*)"}.*')
        cond_set(item, 'brand', brand_matches)

        id_nodes = response.xpath(
            "//p[@id='prodNo']/span[@id='metaProductID']/text()")
        if id_nodes:
            raw_id = id_nodes.extract()[0].strip().replace('P', '')
            try:
                item['upc'] = int(raw_id)
            except ValueError:
                self.log("Failed to parse upc number : %r" % raw_id, ERROR)

        image_sources = response.xpath(
            "//div[@class='bigImage']/img[@id='mainProductImage']"
            "/@src").extract()
        cond_set(item, 'image_url', image_sources)

        price_nodes = response.xpath(
            "//div[contains(@class,'bigPrice')]/div[@class='price']"
            "/descendant::*[text()]/text()")
        price_text = "".join(part.strip() for part in price_nodes.extract())
        # Keep only what sits between the "$" and the "*", when both exist.
        matched = re.match(r'\$(.*)\*.*', price_text)
        if matched:
            price_text = matched.group(1)
        cond_set_value(item, 'price',
                       Price('USD', price_text) if price_text else None)

        feature_nodes = response.xpath(
            "//div[@id='features']/div[@class='tabContent']"
            "/descendant::*[text()]/text()")
        if feature_nodes:
            cond_set_value(item, 'description',
                           " ".join(feature_nodes.extract()))

        cond_set_value(item, 'locale', "en-US")
        return item
Ejemplo n.º 12
0
 def _populate_related_products(self, response, product):
     """Collect related-product groups and store them on the product.

     Three groups are gathered: "Customers also viewed" and "Customers also
     bought" (both read from carousels via ``self._carousel_getitems``) and
     "Frequently Bought Together" (read from the ``.bTogether`` form rows).
     A group with no items is passed to ``cond_set_value`` as ``None``.

     :param response: page response exposing ``css()``/``xpath()``/``url``.
     :param product: item whose ``related_products`` is populated.
     """
     related_products = {}
     cond_set_value(
         related_products, 'Customers also viewed',
         list(
             self._carousel_getitems(
                 response.css('.skuRightColInner .carouselInner'))) or None)
     cond_set_value(
         related_products, 'Customers also bought',
         list(
             self._carousel_getitems(
                 response.xpath('//h3[text()="Customers also bought"]/..')))
         or None)

     fbt = []
     for item in response.css('.bTogether .formRow'):
         title = item.css('.desc::text')
         url = item.css('.formLabel.SL_m::attr(href)')
         # Rows lacking either a link or a title are skipped.
         if url and title:
             fbt.append(
                 RelatedProduct(url=urljoin(response.url, url[0].extract()),
                                title=_strip_non_ascii(title[0].extract())))
     # The group label previously carried a stray trailing comma.
     cond_set_value(related_products, 'Frequently Bought Together', fbt
                    or None)
     cond_set_value(product, 'related_products', related_products)
Ejemplo n.º 13
0
    def _populate_from_html(self, response, product):
        """Populate the product item from the product page's HTML.

        Sets title, price (USD; sale price tried before regular price),
        image_url, is_out_of_stock, brand, description, reseller_id and
        model, then delegates to ``self._populate_buyer_reviews``.

        A missing image, or an image URL without a query string, no longer
        raises ``TypeError``; ``image_url`` is simply left untouched or used
        as found.

        NOTE(review): ``is_empty`` is defined elsewhere; this code assumes it
        returns the first element of a sequence, or a falsy default when the
        sequence is empty -- confirm.

        :param response: page response exposing ``css()``/``xpath()``/``url``.
        :param product: item to populate.
        """
        title = response.xpath(
            '//div[contains(@class, "product-title")]/h1/text()').extract()
        if isinstance(title, list):
            title = ''.join(title)
        cond_set(product, 'title', (title.strip(), ))

        cond_set(product, 'price',
                 response.css('.saleprice span::text').re(r'\$([\d ,.]+)'))
        cond_set(product, 'price',
                 response.css('.regprice span::text').re(r'\$([\d ,.]+)'))

        image_url = is_empty(response.css('.jqzoom img::attr(src)').extract())
        if image_url:
            # Drop the query string (everything from the last "?"), if any.
            head, sep, _ = image_url.rpartition('?')
            if sep:
                image_url = head
        if image_url:
            if not image_url.startswith('http'):
                image_url = "http:" + image_url
            cond_set(product, 'image_url', (image_url, ))

        cond_set_value(
            product, 'is_out_of_stock', not (response.css(
                '.stockstatus .info::text').re('In Stock|Low Stock')))
        cond_set(product, 'brand',
                 response.css('.alignBrandImageSpec::attr(alt)').extract(),
                 lambda brand: brand.replace('_', ' '))
        xpath = '//td[@class="detailsText"]/text() | ' \
                '//div[contains(@class, "tab-info")]' \
                '/div[contains(@class, "tab-title")]' \
                '/h2[contains(text(), "details")]/../../div'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), u''.join)

        price = product.get('price', None)
        if price == 0:
            del product['price']
        elif price:
            product['price'] = Price(priceCurrency='USD',
                                     price=re.sub('[ ,]', '', price))

        reseller_id = re.findall(r"i=(\d+)", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        model = response.xpath("//div[@class='item']/text()").re(
            FLOATING_POINT_RGEX)
        cond_set(product, 'model', model)
        self._populate_buyer_reviews(response, product)
Ejemplo n.º 14
0
    def _parse_variants(self, response):
        variants = []
        variants_prop = {}

        variant_search = re.search('Product.Config\((.*)\)', response.body)
        if not variant_search:
            return None

        try:
            variants_json = json.loads(variant_search.group(1))
        except ValueError:
            return None

        for attr_id in response.xpath(
                '//div[not(contains(@class,"hidden"))]/div/'
                'select/@name').re('super_attribute\[(\d+)\]'):

            attribute = variants_json['attributes'][attr_id]
            attribute_name = attribute['label']

            for option in attribute['options']:
                value = option['label']
                for product in option['products']:
                    prop = variants_prop.get(product, {})
                    prop[attribute_name] = value
                    variants_prop[product] = prop

        for variant_id in variants_json['childProducts']:
            vr = {}
            variant = variants_json['childProducts'][variant_id]
            sku = variant.get('productSku')
            cond_set_value(vr, 'skuId', sku)
            final_price = variant.get('finalPrice')
            cond_set_value(vr, 'price', final_price)
            prop = variants_prop[variant_id]
            cond_set_value(vr, 'properties', prop)
            variants.append(vr)

        return variants if variants else None
Ejemplo n.º 15
0
 def _match_walmart_threadsafe(self, response):
     """Record the Walmart match on the product, then continue or finish.

     Stores ``walmart_url``, ``walmart_category`` and ``walmart_exists`` on
     the product carried in ``response.meta``.  When ``self.match_target``
     is set, a Target search request for the product's UPC is yielded;
     otherwise the product itself is yielded.
     """
     product = response.meta.get('product')
     upc = product.get('upc')
     category = response.xpath(
         '//p[@class="dept-head-list-heading"]/a/text()').extract()
     links = response.xpath(
         '//a[@class="js-product-title"][1]/@href').extract()

     found = bool(links)
     # Without a match the (empty) extraction result is stored unchanged.
     walmart_url = (urlparse.urljoin('http://www.walmart.com/', links[0])
                    if found else links)

     cond_set_value(product, 'walmart_url', walmart_url)
     cond_set_value(product, 'walmart_category', category)
     cond_set_value(product, 'walmart_exists', found)

     if not self.match_target:
         yield product
         return

     # This is for case when both flags are true
     target_url = ('http://tws.target.com/searchservice/item/search_results/v2/by_keyword?search_term={}&alt=json&'
                   'pageCount=24&response_group=Items&zone=mobile&offset=0')
     request = Request(target_url.format(upc),
                       callback=self._match_target_threadsafe)
     request.meta['product'] = product
     yield request
Ejemplo n.º 16
0
    def _populate_from_html(self, response, product):
        """Populate the product item from the product page's HTML.

        Sets price (GBP), title, brand, image_url (dropped again when it is
        the ``noImage.gif`` placeholder), is_out_of_stock, reseller_id and
        description, then delegates related products and buyer reviews to
        their dedicated helpers.

        :param response: page response exposing ``css()``/``url``.
        :param product: item to populate.
        """
        cond_set(product, 'price', response.css('.price span::text').re(
            u'\u00a3([\\d, .]+)'))
        cond_set(product, 'title', _itemprop(response, 'model'), unicode.strip)
        cond_set(product, 'brand',
                 _itemprop(_itemprop(response, 'brand', False), 'name'),
                 unicode.strip)
        cond_set(product, 'image_url', _itemprop(response, 'image', False)
                 .css('img::attr(src)').extract())
        image = product.get('image_url')
        if image and image.endswith('noImage.gif'):
            del (product['image_url'])
        cond_set_value(product, 'is_out_of_stock',
                       response.css('.stockMessaging::text').re(
                           'out of stock|Discontinued product'),
                       bool)

        reseller_id = re.findall(r"\/([0-9]+)[\/\?]", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        details = response.css('.prodDetailsContainer').xpath(
            'node()[normalize-space()]')
        # Nodes containing a form are left out of the description.
        details = [d.extract() for d in details if not d.css('form')]
        if details:
            cond_set_value(product, 'description', details, conv=''.join)
        self._populate_related_products(response, product)
        self._populate_buyer_reviews(response, product)

        price = product.get('price', None)
        if price == 0:
            del (product['price'])
        elif price:
            # Strip thousands separators and spaces.  The pattern must be a
            # character class: ', ' only matched a comma followed by a space,
            # so amounts such as "1,299.00" kept their comma.
            price = re.sub('[, ]', '', price)
            cond_replace_value(product, 'price', Price(priceCurrency='GBP',
                                                       price=price))
Ejemplo n.º 17
0
    def _populate_buyer_reviews(self, response, product):
        """Set ``buyer_reviews`` from the review snapshot on the page.

        ``buyer_reviews`` becomes ``0`` when there is no review count, no
        (non-zero) average rating, or a zero count; otherwise a
        ``BuyerReviews`` with the count, the average and a per-star
        histogram.

        The zero check runs *before* the value is set.
        NOTE(review): ``cond_set_value`` presumably does not overwrite an
        existing value (a separate ``cond_replace_value`` exists), which
        made the original trailing check ineffective -- confirm.

        :param response: page response exposing ``css()``.
        :param product: item to populate.
        """
        # A leading digit is required so a whitespace-only match cannot
        # reach int().
        total = response.css(
            '.pr-snapshot-average-based-on-text .count::text').re(
                r'\d[\d ,]*')
        total = int(re.sub('[ ,]', '', total[0])) if total else 0

        avg_text = response.css(
            '.pr-rating.pr-rounded.average::text').extract()
        try:
            avg = float(avg_text[0]) if avg_text else 0.0
        except ValueError:
            # A non-numeric average is treated as "no rating".
            avg = 0.0

        if not total or not avg:
            cond_set_value(product, 'buyer_reviews', 0)
            return

        by_star = response.css('.pr-histogram-count span::text')
        by_star = by_star.re(r'\(([\d, ]+)\)')
        # The counts are reversed before being numbered from 1 star upward.
        by_star = {
            i + 1: int(re.sub('[ ,]', '', c))
            for i, c in enumerate(reversed(by_star))
        }

        cond_set_value(
            product, 'buyer_reviews',
            BuyerReviews(num_of_reviews=total,
                         average_rating=avg,
                         rating_by_star=by_star))
Ejemplo n.º 18
0
    def parse_product(self, response):
        """Populate the product and hand over to the buyer-reviews request.

        Each field is produced by a dedicated ``self._parse_*`` helper.  A
        buyer-reviews request keyed by the UPC is always queued, so the
        method returns the result of ``self.send_next_request``.
        """
        product = response.meta.copy()['product']

        # Locale is assigned unconditionally.
        product['locale'] = 'en_GB'

        cond_set_value(product, 'title', self._parse_title(response),
                       conv=string.strip)
        cond_set_value(product, 'brand', self._parse_brand(response))
        cond_set_value(product, 'department',
                       self._parse_department(response))
        cond_set_value(product, 'categories',
                       self._parse_categories(response))

        amount, currency = self._parse_price(response)
        cond_set_value(product, 'price',
                       Price(price=float(amount), priceCurrency=currency))

        cond_set_value(product, 'special_pricing',
                       self._parse_special_pricing(response), conv=bool)
        cond_set_value(product, 'image_url', self._parse_image_url(response),
                       conv=string.strip)
        cond_set_value(product, 'description',
                       self._parse_description(response), conv=string.strip)
        cond_set_value(product, 'is_out_of_stock',
                       self._parse_stock_status(response))

        # The UPC is also needed for the reviews URL below.
        upc = self._parse_upc(response)
        cond_set_value(product, 'upc', upc)

        cond_set_value(product, 'variants', self._parse_variants(response))

        # Buyer reviews are fetched with a follow-up request.
        pending = [
            Request(url=self.BUYER_REVIEWS_URL.format(upc=upc),
                    dont_filter=True,
                    callback=self.br.parse_buyer_reviews)
        ]

        cond_set_value(product, 'related_products',
                       self._parse_related_products(response))

        if pending:
            return self.send_next_request(pending, response)

        return product
Ejemplo n.º 19
0
    def parse_product(self, response):
        """Populate and return the product from a product page.

        Fills Open Graph fields, title, brand (guessed from the title when
        absent), image_url, description, sku/reseller_id, price (GBP),
        related products, locale and one variant per size option.

        Each variant is a dict with ``price``, ``in_stock``, ``properties``
        (``{'size': ...}``) and ``selected``.  A missing title or price no
        longer raises.

        :param response: page response exposing ``css()``/``xpath()``/``url``.
        :returns: the populated product item.
        """
        product = response.meta['product']

        populate_from_open_graph(response, product)

        cond_set(
            product,
            'title',
            response.xpath('//meta[@property="og:title"]/@content').extract(),
            conv=string.strip)

        if not product.get('brand', None):
            # Without a title there is nothing to guess the brand from.
            title = product.get('title', None)
            brand = (guess_brand_from_first_words(title.strip())
                     if title else None)
            if brand:
                product['brand'] = brand

        cond_replace(
            product,
            'image_url',
            response.css(
                ".main-image-container>img::attr(src)"
            ).extract(),
            lambda url: urlparse.urljoin(response.url, url)
        )

        prod_description = response.css(
            ".product-details-container .description-copy p::text"
        )
        cond_set_value(product, 'description', "\n".join(
            x.strip() for x in prod_description.extract() if x.strip()))

        sku = response.css(
            ".product-details-container .caption-2::text").extract()
        # An empty list (not None) is passed on when no SKU text is found.
        sku = re.findall(r'\d+', sku[0]) if sku else []
        cond_set(product, 'sku', sku, string.strip)

        cond_set(product, 'reseller_id', sku, string.strip)

        # Prefer the sale price; fall back to the regular price.
        price = response.css(
            ".product-details-container .right-side .price .sale::text"
            ).extract()
        if not price:
            price = response.css(
                ".product-details-container .right-side .price span::text"
                ).extract()

        cond_set(
            product,
            'price', price,
            conv=string.strip,
        )
        if price:
            product['price'] = Price(
                price=product['price'].replace(u'\xa3', '').strip(),
                priceCurrency='GBP')

        related_products = self._parse_related(response)
        cond_set_value(product, 'related_products', related_products)

        cond_set_value(product, 'locale', 'en-GB')

        sample = response.xpath('//select[@id="SizeKey"]/option/text()').extract()
        # The first <option> is skipped.
        sizes = [option.replace('Size ', '').strip() for option in sample[1:]]

        # Every variant carries the page price; None when none was found.
        variant_price = (price[0].replace(u'\xa3', '').strip()
                         if price else None)

        variant_list = []
        for size in sizes:
            sold_out = 'out of stock' in size
            variant_list.append({
                'price': variant_price,
                # Key was previously misspelled as 'in_sock'.
                'in_stock': not sold_out,
                'properties': {
                    'size': size.replace(' (out of stock)', '')
                    if sold_out else size,
                },
                'selected': False,
            })

        product['variants'] = variant_list

        return product
Ejemplo n.º 20
0
    def parse_product(self, response):
        """Populate the product; yield the follow-up request and the product.

        Each field is produced by a dedicated ``self._parse_*`` helper.  The
        product id read from the ``catCode`` input (default ``'0'``) is
        stored in ``response.meta`` and used for the buyer-reviews URL.
        """
        product = response.meta.copy()['product']

        # Locale is assigned unconditionally.
        product['locale'] = 'en_GB'

        catcode_values = response.xpath(
            '//input[@name="catCode"]/@value'
        ).extract()
        product_id = is_empty(catcode_values, '0')
        response.meta['product_id'] = product_id

        cond_set_value(product, 'title', self._parse_title(response),
                       conv=string.strip)
        cond_set_value(product, 'price', self._parse_price(response))
        cond_set_value(product, 'special_pricing',
                       self._parse_special_pricing(response), conv=bool)
        cond_set_value(product, 'image_url', self._parse_image_url(response))

        category = self._parse_category(response)
        cond_set_value(product, 'category', category)
        if category:
            # The department is the last entry of the category list.
            cond_set_value(product, 'department', category[-1])

        # Only the variants parsed from this page are set; the separate
        # per-variant image request is currently disabled.
        cond_set_value(product, 'variants', self._parse_variants(response))

        cond_set_value(product, 'is_out_of_stock',
                       self._parse_stock_status(response), conv=bool)
        cond_set_value(product, 'description',
                       self._parse_description(response))
        cond_set_value(product, 'related_products',
                       self._parse_related_products(response))

        # Buyer reviews are fetched with a follow-up request.
        pending = [
            Request(
                url=self.BUYER_REVIEWS_URL.format(product_id=product_id),
                dont_filter=True,
                callback=self.br.parse_buyer_reviews
            )
        ]

        if pending:
            yield self.send_next_request(pending, response)

        yield product
Ejemplo n.º 21
0
    def parse_product(self, response):
        """Populate a product item from a product page and queue review requests.

        When the response is a 404, or its URL contains the site's error-page
        path, the item is flagged as not found and returned immediately.
        Otherwise the item is filled from the page, the buyer-review requests
        are collected in ``reqs`` and the result of ``send_next_request`` is
        returned.
        """
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())

        page_is_gone = (
            response.status == 404
            or "www.dockers.com/US/en_US/error" in response.url
        )
        if page_is_gone:
            product.update({
                "not_found": True,
                "no_longer_available": True,
                "locale": 'en-US',
            })
            return product
        product.update({"no_longer_available": False})

        reqs = []
        meta['reqs'] = reqs

        # Search-context fields.
        product['ranking'] = response.meta.get('_ranking', None)
        product['total_matches'] = self.total_matches
        product['url'] = response.url
        product['site'] = self.allowed_domains[0]
        if self.searchterms:
            product['search_term'] = self.searchterms[0]
        else:
            product['search_term'] = None
        product['scraped_results_per_page'] = self.PAGINATE_BY
        product['results_per_page'] = self.PAGINATE_BY

        # The model value is also used as the product id in the review URLs.
        model_values = response.xpath(
            '//meta[@itemprop="model"]/@content').extract()
        self.product_id = is_empty(model_values)

        # Product data in json
        self.js_data = self.parse_data(response)

        cond_set_value(product, 'locale', 'en_US')
        cond_set_value(product, 'model', self.product_id)

        reseller_id_regex = "p\/([^\/&?\.\s]+)"
        reseller_matches = re.findall(reseller_id_regex, response.url)
        cond_set_value(product, 'reseller_id',
                       reseller_matches[0] if reseller_matches else None)

        # The title goes through cond_set; the remaining parsed fields go
        # through cond_set_value, one parser call per field, in this order.
        cond_set(product, 'title', self.parse_title(response))

        simple_fields = (
            ('image_url', self.parse_image),
            ('brand', self.parse_brand),
            ('upc', self.parse_upc),
            ('sku', self.parse_sku),
            ('description', self.parse_description),
            ('price', self.parse_price),
        )
        for field, parse in simple_fields:
            cond_set_value(product, field, parse(response))

        product['variants'] = self._parse_variants(response)

        self.product_categories = self._extract_categories(
            response.body_as_unicode())

        # Per-star counters stored on the response meta for the review callback.
        response.meta['marks'] = dict.fromkeys(('1', '2', '3', '4', '5'), 0)
        count_matches = re.findall(
            r'<span itemprop="reviewCount">(\d+)<\/span>',
            response.body_as_unicode())
        real_count = is_empty(count_matches)
        response.meta['product'] = product
        meta = response.meta

        # One extra review request per step of 30 reviews beyond the first 8;
        # the page index of those requests starts at 2.
        if real_count and int(real_count) > 8:
            review_total = int(real_count)
            for page_index, _ in enumerate(
                    xrange(9, review_total + 1, 30), 2):
                reqs.append(Request(
                    url=self.REVIEW_URL.format(product_id=self.product_id,
                                               index=page_index),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta,
                ))

        reqs.append(Request(
            url=self.REVIEW_URL.format(product_id=self.product_id, index=0),
            dont_filter=True,
            callback=self.parse_buyer_reviews,
            meta=meta,
        ))

        if not reqs:
            return product
        return self.send_next_request(reqs, response)
Ejemplo n.º 22
0
    def parse_product(self, response):
        """Complete a product item from its product page.

        Fills description, price, title, UPC, brand, reseller id and related
        products. When the stored product URL is an ad redirect
        (``&adurl=...``) the real URL is resolved and, if a review link is
        known, a review request is returned. For Google Shopping product URLs
        the query string is stripped and a request for the stores page is
        returned. In every other case the item itself is returned.
        """
        product = response.meta['product']

        desc = response.xpath(
            '//div[@id="product-description-full"]/text()').extract()
        if desc:
            product['description'] = desc[0]

        if not product.get("price"):
            _prices = response.xpath('.//*[contains(@class, "price")]')
            price = get_price(_prices)
            if price:
                product["price"] = Price(price=price.replace("\xa3", ""),
                                         priceCurrency="GBP")

        if not product.get("title"):
            title = response.xpath("//h1[@itemprop='name']/text()").extract()
            if title:
                product["title"] = title[0]

        cond_set(product, 'upc', get_upc(response))

        cond_set(
            product, 'brand',
            response.xpath(
                '//div[@id="specs"]'
                '//div[@class="specs-row"]'
                '[contains(./*[@class="specs-name"]/text(), "Brand")]'
                '/*[@class="specs-value"]/text()').extract())

        if not product.get('brand', None):
            # The title is only set above when the page provides one, so it
            # must be read with .get() to avoid a KeyError on an unset field.
            known_title = product.get('title')
            brand = (guess_brand_from_first_words(known_title)
                     if known_title else None)
            if brand:
                product['brand'] = brand

        reseller_id_regex = r"/(\d+)\??"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        related = response.css('#related li.rel-item .rel-title a')
        r = []
        for rel in related:
            title = rel.xpath('text()').extract()
            url = rel.xpath('@href').extract()
            if title and url:
                r.append(
                    RelatedProduct(title=title[0],
                                   url=urlparse.urljoin(response.url, url[0])))
        product['related_products'] = {'recommended': r}

        # get right url if it redirect url
        redirect_pattern = r'&adurl=(.*)'
        redirect_target = re.findall(redirect_pattern, product['url'])
        if redirect_target:
            # NOTE(review): urlopen is a blocking call made from inside a
            # spider callback; kept because the original relies on it.
            try:
                handle = urllib.urlopen(urllib.unquote(redirect_target[0]))
                try:
                    product['url'] = handle.geturl()
                finally:
                    # Release the connection instead of leaking it.
                    handle.close()
            except Exception as exc:
                # Best effort: keep the redirect URL, but say why.
                self.log("Could not resolve redirect URL %s: %s"
                         % (product['url'], exc))
            # .get(): the review link may never have been stored on the item.
            review_link = product.get('buyer_reviews')
            if review_link:
                link = 'https://www.google.co.uk' + review_link
                return Request(link,
                               callback=self.handle_reviews_request,
                               meta=response.meta)
            else:
                product['buyer_reviews'] = ZERO_REVIEWS_VALUE

        # strip GET data from only google urls
        if 'google.co.uk/shopping/product' in product['url']:
            self._populate_buyer_reviews(response, product)
            pattern = r'([^\?]*)'
            result = re.findall(pattern, product['url'])
            if result:
                product['url'] = result[0]
                product['google_source_site'] = "{}"
                stores_link = result[0] + '/online'
                return Request(stores_link,
                               callback=self.populate_stores,
                               meta={
                                   'product': product,
                                   'page': 0
                               })
        return product
Ejemplo n.º 23
0
    def _parse_bazaarv(self, response):
        """Parse a review response and store buyer reviews on the product.

        The response body is expected to contain ``var materials=<json>,
        initializers=``; the ``BVRRSourceID`` entry of that JSON is an HTML
        fragment holding the average rating, the review total and a per-star
        histogram. Reviews are only stored when the histogram yields at least
        one usable row.

        Returns the next pending request from ``response.meta['reqs']`` when
        there is one, otherwise the product item.
        """
        reqs = response.meta.get('reqs', [])
        product = response.meta['product']

        html = None
        if response.status == 200:
            text = response.body_as_unicode().encode('utf-8')
            match = re.search(r"var materials=(.*),\sinitializers=", text,
                              re.M | re.S)
            if match:
                try:
                    html = json.loads(match.group(1))['BVRRSourceID']
                except (ValueError, KeyError, TypeError) as exc:
                    # A malformed payload must not lose the product or the
                    # requests still queued behind this one.
                    self.log("Could not decode review payload at %s: %s"
                             % (response.url, exc))

        if html is not None:
            sel = Selector(text=html)

            avrg = sel.xpath("//div[contains(@id,'BVRRRatingOverall')]"
                             "/div[@class='BVRRRatingNormalOutOf']"
                             "/span[contains(@class,'BVRRRatingNumber')]"
                             "/text()").extract()
            try:
                avrg = float(avrg[0]) if avrg else 0.0
            except ValueError:
                avrg = 0.0

            total = sel.xpath(
                "//div[@class='BVRRHistogram']"
                "/div[@class='BVRRHistogramTitle']"
                "/span[contains(@class,'BVRRNonZeroCount')]"
                "/span[@class='BVRRNumber']/text()").extract()
            try:
                total = int(total[0]) if total else 0
            except ValueError:
                total = 0

            hist = sel.xpath(
                "//div[@class='BVRRHistogram']"
                "/div[@class='BVRRHistogramContent']"
                "/div[contains(@class,'BVRRHistogramBarRow')]")
            distribution = {}
            for ih in hist:
                name = ih.xpath("span/span[@class='BVRRHistStarLabelText']"
                                "/text()").re(r"(\d) star")
                value = ih.xpath(
                    "span[@class='BVRRHistAbsLabel']/text()").extract()
                # Skip incomplete rows: an empty match list is neither a
                # hashable key nor a count.
                if not name or not value:
                    continue
                try:
                    distribution[int(name[0])] = int(value[0])
                except ValueError:
                    pass
            if distribution:
                reviews = BuyerReviews(total, avrg, distribution)
                cond_set_value(product, 'buyer_reviews', reviews)

        if reqs:
            return self.send_next_request(reqs, response)

        return product
Ejemplo n.º 24
0
    def parse_product(self, response):
        """Fill a product item from its page and chain the follow-up requests.

        Title, brand, UPC, price, image, reseller id, optional description,
        locale and stock flag are read from the page. When
        ``_extract_rr_parms`` yields a payload, its ``'p'`` entry becomes the
        product's UPC, a review request is stored in
        ``response.meta['reevoo']`` and a request to ``SCRIPT_URL`` built from
        the payload is returned. Without a payload the item is returned
        directly.
        """
        product = response.meta['product']

        cond_set(
            product, 'title',
            response.xpath("//section[@itemscope]/h1"
                           "/span[@itemprop='name']/text()").extract())

        cond_set(
            product, 'brand',
            response.xpath("//section[@itemscope]/h1"
                           "/span[@itemprop='brand']/text()").extract())

        if not product.get('brand', None):
            dump_url_to_file(response.url)

        cond_set(
            product, 'upc',
            response.xpath("//section[@itemscope]/meta[@itemprop='identifier']"
                           "/@content").extract())

        price = response.xpath(
            "//section[@itemscope]/div[contains(@class,'productDetail')]"
            "/section[contains(@class,'description')]"
            "/div/div[contains(@class,'productPrices')]"
            "/span[@itemprop='price']/ins/text()").re(FLOATING_POINT_RGEX)

        if price:
            product['price'] = Price(price=price[0], priceCurrency='GBP')

        cond_set(
            product, 'image_url',
            response.xpath(
                "//section[@itemscope]/descendant::section[@class='productMedias']"
                "/div[@id='currentView']/a/img/@src").extract())

        regex = r"(\d+)-pdt"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        if self.DO_DESCRIPTION:
            cond_set(product, 'description',
                     response.xpath("//section[@id='longDesc']").extract())

        cond_set_value(product, 'locale', "en-GB")

        out_of_stock = response.xpath(
            "//div[contains(@class,'productDetail')]"
            "/section[@class='col3']/div[@class='nested']"
            "/strong/text()").re(r"Out of stock")
        if out_of_stock:
            product['is_out_of_stock'] = True

        payload = self._extract_rr_parms(response)
        if not payload:
            # Check the payload before indexing it: previously payload['p']
            # was read first, so this branch could never be reached and an
            # empty payload crashed the callback.
            self.log("No {rr} payload at %s" % response.url, DEBUG)
            return product

        productid = payload['p']
        product['upc'] = productid

        review_url = (
            'http://mark.reevoo.com/reevoomark/en-GB/product?sku={sku}'
            '&trkref=PCG').format(sku=productid)
        new_meta = response.meta.copy()
        # Let the review callback receive 404 responses as well.
        new_meta['handle_httpstatus_list'] = [404]
        reevoo_request = Request(url=review_url,
                                 callback=self._parse_reevoo,
                                 meta=new_meta)
        # Stored on the meta so a later callback can send it.
        response.meta['reevoo'] = reevoo_request

        new_meta = response.meta.copy()
        rr_url = urlparse.urljoin(self.SCRIPT_URL,
                                  "?" + urllib.urlencode(payload))
        return Request(rr_url, self._parse_rr_json, meta=new_meta)
Ejemplo n.º 25
0
    def parse_product(self, response):
        """Fill a product item from the page and optionally request related items.

        Product data is read from the ``page_products`` object embedded in the
        page when it can be decoded; otherwise the fields are scraped from the
        HTML. When the page has a "You May Also Like" section and the category,
        shop and item ids are all known, a recommendation request is returned;
        otherwise the item is returned.
        """
        product = response.meta['product']
        body = response.body_as_unicode()

        # The regex stops before the closing brace, so it is added back here.
        # Guard the no-match case: concatenating onto a missing match would
        # raise instead of falling back to the HTML scraping branch.
        raw_data = is_empty(re.findall("page_products\'\:\s+([^\}]*)", body))
        data = {}
        if raw_data:
            try:
                data = json.loads(
                    (raw_data + "}").strip().replace("'", "\""))
            except ValueError:
                data = {}

        product["description"] = is_empty(
            response.xpath(
                "//div[contains(@class, 'prd-description')]").extract())

        average = is_empty(
            response.xpath(
                "//span[contains(@class, 'b-rating-average')]/text()").extract(
                ))
        total = is_empty(
            response.xpath("//h2[@class='b-ttl-2']/span/text()").re(
                FLOATING_POINT_RGEX))
        if average and total:
            product["buyer_reviews"] = BuyerReviews(num_of_reviews=total,
                                                    average_rating=average,
                                                    rating_by_star={})

        if data:
            product["price"] = Price(price=data["prod_price"],
                                     priceCurrency=data["currency"])

            product["is_out_of_stock"] = not bool(int(data["stock_available"]))
            product["title"] = data["prod_name"]
            product["image_url"] = data["prod_image_url"]
            product["url"] = data["prod_url"]
            if not product["description"]:
                product["description"] = data["description"]
            product["brand"] = data["brand"]

        else:
            price = is_empty(
                response.xpath("//span[contains(@class, 'price')]/text()").re(
                    FLOATING_POINT_RGEX), None)
            if price:
                product["price"] = Price(price=price, priceCurrency="GBP")
            product["title"] = is_empty(
                response.xpath(
                    "//h1[contains(@class, 'b-ttl-main')]/text()").extract())
            product["image_url"] = is_empty(
                response.xpath(
                    "//*[@id='cart-form']/div[2]/div[1]/div/div/a/@href").
                extract())
            product["url"] = response.url
            product["brand"] = is_empty(
                response.xpath("//span[@itemprop='brand']/text()").extract())

        if not product.get('brand', None):
            dump_url_to_file(response.url)

        cond_set_value(product, 'locale', "en-GB")

        if "You May Also Like" in body:
            catId = is_empty(re.findall("cat_id\'\:\s+(\d+)", body))
            sid = is_empty(re.findall("sid\'\:\s+\"([^\"]*)", body))
            if catId and sid and "item_id" in data:
                url = "http://www.rakuten.co.uk/api/recommendation?" \
                    "category_id=%s" \
                    "&item_id=%s" \
                    "&shop_id=%s" % (catId, data["item_id"], sid)
                return Request(url=url,
                               callback=self._related_parse,
                               meta={"product": product})
        return product
Ejemplo n.º 26
0
    def parse_product(self, response):
        """Fill a product item from its page and queue the ratings request.

        Each field is produced by its own ``_parse_*`` helper and stored with
        ``cond_set_value``. A ratings request for the product id is appended
        to the pending requests, and the next pending request is returned via
        ``send_next_request``; the item is returned when nothing is pending.
        """
        reqs = response.meta.get('reqs', [])
        product = response.meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Fields whose value is passed through string.strip.
        stripped_fields = (
            ('title', self._parse_title),
            ('brand', self._parse_brand),
        )
        for field, parse in stripped_fields:
            cond_set_value(product, field, parse(response), conv=string.strip)

        # Fields stored as parsed, in the same order as before.
        plain_fields = (
            ('model', self._parse_model),
            ('categories', self._parse_categories),
            ('category', self._parse_category),
            ('description', self._parse_description),
            ('price', self._parse_price),
            ('reseller_id', self._parse_reseller_id),
            ('image_url', self._parse_image_url),
            ('variants', self._parse_variants),
            ('is_out_of_stock', self._parse_is_out_of_stock),
            ('no_longer_available', self._parse_no_longer_available),
            ('related_products', self._parse_related_products),
        )
        for field, parse in plain_fields:
            cond_set_value(product, field, parse(response))

        # Reviews: prefer the id embedded in the page, else the last URL
        # path segment.
        page_ids = response.xpath('//*[@id="bvProductId"]/@value').extract()
        bv_product_id = page_ids[0] if page_ids else None
        if not bv_product_id:
            bv_product_id = response.url.split('/')[-1]

        if bv_product_id:
            rating_request = Request(
                self.RATING_URL.format(prodid=bv_product_id),
                dont_filter=True,
                callback=self._parse_bazaarv,
                meta={'product': product, 'reqs': reqs},
            )
            reqs.append(rating_request)

        if not reqs:
            return product
        return self.send_next_request(reqs, response)
Ejemplo n.º 27
0
 def parse_coupon(self, response):
     """Build a DiscountCoupon item from a coupon response.

     Returns None when no description can be parsed. The promo code is
     looked up in the conditions text first and in the description second,
     using the "using promo code: XYZ" wording.
     """
     item = DiscountCoupon()
     d = self._parse_description(response)
     cond_set_value(item, 'description', d)
     if not d:
         return
     cond_set_value(item, 'category', self._parse_category(response))
     cond_set_value(item, 'discount', self._parse_discount(response))
     cond_set_value(item, 'conditions', self._parse_conditions(response))
     cond_set_value(item, 'start_date', self._parse_start_date(response))
     cond_set_value(item, 'end_date', self._parse_end_date(response))
     promo_code = None
     if not item.get('promo_code'):
         promo_regex = r"[Uu]sing\s?[Pp]romo\s?[Cc]ode:\s?([A-Z0-9]+)"
         for field in ('conditions', 'description'):
             # A field may be unset; re.findall raises TypeError on None,
             # so search an empty string instead.
             found = re.findall(promo_regex, item.get(field) or '')
             if found:
                 promo_code = found[0]
                 break
     cond_set_value(item, 'promo_code', promo_code)
     return item
Ejemplo n.º 28
0
 def _parse_popup_promo(self, response):
     """Build a DiscountCoupon from the subscribe pop-up, if it has a header.

     Returns None when the pop-up header text is missing or blank.
     """
     item = DiscountCoupon()
     header = response.xpath('.//*[@class="subscribe_header"]/text()').extract()
     description = header[0].strip() if header else None
     if not description:
         return None

     # Both discount and conditions come from the first paragraph of the form.
     signup_text = response.xpath(".//*[@id='EmailSignupForm']/p[1]/text()")
     field_values = (
         ('description', description),
         ('category', None),
         ('discount', ' '.join(signup_text.re('\d+\%'))),
         ('conditions', ''.join(signup_text.extract())),
         ('start_date', None),
         ('end_date', None),
         ('promo_code', None),
     )
     for field, value in field_values:
         cond_set_value(item, field, value)
     return item
Ejemplo n.º 29
0
 def _parse_special_promo_code(self, response):
     """Build a DiscountCoupon from the sale-campaign block, if present.

     Returns None when the campaign block has no description paragraph.
     The promo code is the last whitespace-separated word of the element
     whose text contains "code ".
     """
     item = DiscountCoupon()
     description = response.xpath(".//*[@id='mdl-jc-sale-campaign']/p[1]/text()").extract()
     if description:
         # NOTE(review): description, discount and conditions are stored as
         # the extracted lists, not joined strings -- confirm that consumers
         # expect lists here.
         cond_set_value(item, 'description', description)
         cond_set_value(item, 'category', None)
         cond_set_value(item, 'discount', response.xpath(".//*[@id='mdl-jc-sale-campaign']/h2/text()").re(r'\d+\%'))
         cond_set_value(item, 'conditions', response.xpath(".//*[@id='mdl-jc-sale-campaign']/h2/text()").extract())
         cond_set_value(item, 'start_date', None)
         cond_set_value(item, 'end_date', None)
         code_text = ''.join(
             response.xpath(".//*[@id='mdl-jc-sale-campaign']/*[contains(text(), 'code ')]/text()").extract())
         # split() with no argument drops empty pieces, so a missing code
         # (or trailing whitespace) no longer yields '' as the promo code.
         words = code_text.split()
         promo_code = words[-1] if words else None
         cond_set_value(item, 'promo_code', promo_code)
         return item
Ejemplo n.º 30
0
    def parse_product(self, response):
        """Generator callback: fill a product item and yield follow-up requests.

        Yields, in order: optionally the first entry popped from the variants
        returned by ``_parse_variants`` (when it reports links), optionally the
        result of ``_get_stock_status``, up to two buyer-review requests, one
        related-products request, and finally the product item itself.
        """
        # Start from the item passed in the meta, or a fresh one.
        prod = response.meta.get('product', SiteProductItem())

        prod['_subitem'] = True

        _ranking = response.meta.get('_ranking', None)
        prod['ranking'] = _ranking
        prod['url'] = response.url

        # Title is only set if not already present; the fields below are
        # always overwritten.
        cond_set(prod, 'title', response.css('h1 ::text').extract())
        prod['price'] = DellProductSpider._parse_price(response)
        prod['image_url'] = DellProductSpider._parse_image(response)

        prod['description'] = DellProductSpider._parse_description(response)
        prod['brand'] = DellProductSpider._parse_brand(response,
                                                       prod.get('title', ''))
        prod['related_products'] = self._related_products(response)
        # Stored on the meta before the variants are parsed.
        response.meta['product'] = prod
        # _parse_variants returns a flag plus the variants; when the flag is
        # set, the first entry is yielded as-is, otherwise the variants are
        # converted and stored on the item.
        is_links, variants = self._parse_variants(response)
        if is_links:
            yield variants.pop(0)
        else:
            cond_set_value(prod, 'variants',
                           self._collect_variants_from_dict(variants))

        if 'This product is currently unavailable.' in response.body_as_unicode(
        ):
            prod['is_out_of_stock'] = True
        else:
            yield self._get_stock_status(response,
                                         prod)  # this should be OOS field

        # The same meta dict is updated and reused for each request below.
        meta = {'product': prod}
        prod_id = self._get_product_id(response)
        if prod_id:  # first page type
            if response.css('#bazaarVoice').extract():
                meta.update({'br_page_type': 1})
                yield Request(  # reviews request
                    url=self.REVIEW_URL.format(product_id=prod_id),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta)
        buyer_reviews_iframe_src = response.xpath(
            '//iframe[contains(@src,"reviews.htm")]/@src').extract()
        if buyer_reviews_iframe_src:  # second page type
            meta.update({'br_page_type': 2})
            yield Request(  # reviews request
                url=buyer_reviews_iframe_src[0].replace('format=noscript', ''),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta)

        # Try the v1 related-products data first; on any exception fall back
        # to the v2 collector and URL template.
        try:
            r_url, related_data = self.RELATED_PROD_URL_V1, self._collect_related_products_data_v1(
                response)
        except Exception:
            r_url, related_data = self.RELATED_PROD_URL_V2, self._collect_related_products_data_v2(
                response)
        yield Request(  # related products request
            r_url.format(**related_data),
            callback=self._parse_related_products,
            meta=meta)

        yield prod
Ejemplo n.º 31
0
    def parse_product(self, response):
        """Fill a product item, chaining review and recommendation requests.

        On the first pass (no ``reviewed`` flag in the meta) a request for the
        full review page is returned when the page links to reviews; otherwise
        zero reviews are recorded. The item is then filled from the page and,
        when a product id is found in the body, a recommendations request is
        returned. Otherwise the item is returned.
        """
        reviewed = response.meta.get('reviewed')
        prod = response.meta['product']

        # if there was no any request for item review try to send it
        if not reviewed:
            revs_a = response.xpath('//a[@class="read_reviews_action"]')
            if revs_a:
                avg = revs_a.xpath(
                    './/span[@itemprop="ratingValue"]/text()').extract()
                total = revs_a.xpath(
                    './/span[@itemprop="ratingCount"]/text()').extract()
                rev_url = response.url + '/reviewhtml/all'
                meta = response.meta.copy()
                meta['avg'] = avg
                meta['total'] = total
                meta['initial_response'] = response
                return Request(rev_url,
                               callback=self.populate_reviews,
                               meta=meta)
            else:
                cond_set_value(prod, 'buyer_reviews', ZERO_REVIEWS_VALUE)
        title = response.xpath(
            '//div[@class="product-summary"]/h1/text()').extract()
        cond_set(prod, 'title', title)

        # Brand: manufacturer from the embedded data, else guessed from the
        # title. The value is kept as a scalar here: wrapping it in a list up
        # front made the "no brand" test always false, so the fallback never
        # ran.
        brand = is_empty(
            re.findall(r'"manufacturer":\s"(.*)",', response.body), None)
        if not brand and prod.get("title"):
            brand = guess_brand_from_first_words(prod['title'])
        if brand:
            # cond_set expects a list of candidate values.
            cond_set(prod, 'brand', [brand])

        if not prod.get('brand', None):
            dump_url_to_file(response.url)
        price = response.xpath(
            '//p[@class="new-price"]/meta[@itemprop="price"]/@content'
        ).extract()
        priceCurrency = response.xpath(
            '//p[@class="new-price"]/meta[@itemprop="priceCurrency"]/@content'
        ).extract()
        if price and priceCurrency:
            if re.match("\d+(.\d+){0,1}", price[0]):
                prod["price"] = Price(priceCurrency=priceCurrency[0],
                                      price=price[0])
            else:
                prod["price"] = Price(priceCurrency="GBP", price=0.00)
        else:
            prod["price"] = Price(priceCurrency="GBP", price=0.00)

        des = response.xpath('//div[@class="productDescription"]').extract()
        cond_set(prod, 'description', des)

        img_url = response.xpath(
            '//div[@class="product-images"]/img/@src').extract()
        cond_set(prod, 'image_url', img_url)

        cond_set(prod, 'locale', ['en-US'])

        if not prod.get("reseller_id"):
            reseller_id = response.xpath(
                './/*[@itemprop="sku"]/text()').extract()
            cond_set(prod, 'reseller_id', reseller_id)

        prod['url'] = response.url

        # The add-to-cart button label carries the stock state.
        available = response.xpath(
            '//form[contains(@id,"addToCartForm")]/input[@type="submit"]/@value'
        ).extract()

        if available and 'Email when back in stock' in available[0]:
            cond_set(prod, 'is_out_of_stock', [True])

        if available and 'Last few in store' in available[0]:
            lim = LimitedStock(is_limited=True, items_left=[1])
            cond_set(prod, 'limited_stock', [lim])

        prod_id = re.findall(r'"id":\s"(.*)",', response.body)
        if prod_id:
            recomm_url = self.RECOMM_URL.format(prod_id=prod_id[0])
            return Request(recomm_url,
                           callback=self.populate_recommendations,
                           meta=response.meta.copy())

        return prod
Ejemplo n.º 32
0
    def _get_products(self, response):
        """Yield items or follow-up requests for the products on a results page.

        Page-level values (products per page, scraped results per page, total
        matches) are taken from ``response.meta`` when present; otherwise they
        are computed here and written back to the meta. At most ``remaining``
        products are processed. Nothing is yielded when total matches are
        reported but the page produced no products.
        """
        meta = response.meta
        remaining = meta['remaining']
        search_term = meta['search_term']
        prods_per_page = meta.get('products_per_page')
        total_matches = meta.get('total_matches')
        scraped_results_per_page = meta.get('scraped_results_per_page')

        def page_should_have_results():
            # True only when the spider can tell that this is not a
            # "nothing found" page.
            return (hasattr(self, 'is_nothing_found')
                    and not self.is_nothing_found(response))

        link_source = (self._generate_goldbox_links_from_deals
                       if self.deal_product_url_list
                       else self._scrape_product_links)
        prods = link_source(response)

        if prods_per_page is None:
            # Materialize prods to get its size.
            prods = list(prods)
            prods_per_page = len(prods)
            meta['products_per_page'] = prods_per_page

        if scraped_results_per_page is None:
            scraped_results_per_page = self._scrape_results_per_page(response)
            if not scraped_results_per_page:
                # Fall back to the number of products actually found.
                scraped_results_per_page = prods_per_page
                if page_should_have_results():
                    self.log(
                        "Failed to scrape number of products per page", WARNING)
            else:
                self.log(
                    "Found %s products at the first page" %scraped_results_per_page
                    , INFO)
            meta['scraped_results_per_page'] = scraped_results_per_page

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is None:
                if page_should_have_results():
                    self.log(
                        "Failed to parse total matches for %s" % response.url,ERROR)
            else:
                meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)

        if total_matches and not prods_per_page:
            # Parsing the page failed. Give up.
            self.log("Failed to get products for %s" % response.url, ERROR)
            return

        already_ranked = self.quantity - remaining
        mobile_agent = self.user_agent_key not in ["desktop", "default"]
        page_slice = islice(prods, 0, remaining)
        for position, (link, item) in enumerate(page_slice, 1):
            # Initialize the product as much as possible.
            item['site'] = self.site_name
            item['search_term'] = search_term
            item['total_matches'] = total_matches
            item['results_per_page'] = prods_per_page
            item['scraped_results_per_page'] = scraped_results_per_page
            # The ranking is the position in this page plus the number of
            # products from other pages.
            item['ranking'] = position + already_ranked
            if mobile_agent:
                item['is_mobile_agent'] = True

            if link is None:
                # The product is complete, no need for another request.
                yield item
                continue

            if isinstance(link, Request):
                cond_set_value(item, 'url', link.url)  # Tentative.
                yield link
                continue

            # Another request is necessary to complete the product.
            url = urlparse.urljoin(response.url, link)
            cond_set_value(item, 'url', url)  # Tentative.
            yield Request(
                url,
                callback=self.parse_product,
                meta={'product': item},
            )
Ejemplo n.º 33
0
    def parse_product(self, response):
        """Fill a product item from the page's embedded product JSON.

        Title, image and brand come from the ``id_json`` section; categories,
        price and original price come from ``style_data``. The description and
        availability are read from the HTML. Returns the product item.
        """
        product = response.meta['product']

        # locale
        product['locale'] = 'en_US'

        # Tolerate a page without product JSON instead of failing on .get().
        product_json = self.extract_product_json(response) or {}
        id_json = product_json.get("id_json", {})
        style_data = product_json.get("style_data", {})

        # title
        cond_set_value(product, 'title', id_json.get("name", None))

        # categories
        categories = [category_info["value"]
                      for category_info in style_data.get("categories", [])]
        if categories:
            cond_set_value(product, 'categories', categories)

        if product.get('categories'):
            product['category'] = product['categories'][-1]

        # description -- not every page has one, so do not index blindly.
        description = response.xpath(
            "//div[@class='description']").extract()
        if description:
            cond_set_value(product, 'description', description[0])

        # price
        cond_set_value(product, 'price', style_data.get("price", None))

        # image
        image = id_json.get("image", None)
        if image:
            cond_set_value(product, 'image_url', image)

        # brand
        brand = id_json.get("brand", {}).get("name", None)
        cond_set_value(product, "brand", brand)

        # original price
        cond_set_value(product, 'price_original',
                       style_data.get("originalPrice", None))

        # no longer available -- only decided when the page states its
        # availability; previously a missing meta tag left the flag unbound
        # and raised.
        availability = response.xpath(
            "//meta[@property='og:availability']/@content").extract()
        if availability:
            no_longer_avail = availability[0] != 'instock'
            cond_set_value(product, 'no_longer_available', no_longer_avail)
        if product.get('no_longer_available'):
            product['is_out_of_stock'] = True

        return product