Example #1
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(OrientaltradingProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
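All of these examples follow the same pattern: the spider instantiates BuyerReviewsBazaarApi in __init__ (usually handing it a back-reference via called_class=self), stores it on self.br, and delegates to the base spider with site_name=self.allowed_domains[0]. A minimal sketch of how self.br is consumed later, mirroring the review callbacks in the fuller examples below (see Examples #12, #13 and #15):

    # Sketch only: parse_buyer_reviews is assumed to be wired up as the
    # callback of a Bazaarvoice REVIEW_URL request, as in Example #13.
    def parse_buyer_reviews(self, response):
        product = response.meta['product']
        product['buyer_reviews'] = self.br.parse_buyer_reviews_per_page(response)
        yield product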
Example #2
    def __init__(self, *args, **kwargs):
        # All this is to set the site_name since we have several
        # allowed_domains.
        self.br = BuyerReviewsBazaarApi()
        super(HomedepotProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
Example #3
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.referer = None
        self.first_time_products = None
        self.current_page = 1
        self.products_per_page = 20
        super(CvsProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
        settings.overrides['CRAWLERA_ENABLED'] = True
Example #4
    def __init__(self, sort_mode=None, *args, **kwargs):
        from scrapy.conf import settings
        settings.overrides['DEPTH_PRIORITY'] = 1
        settings.overrides[
            'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
        settings.overrides[
            'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'

        self.quantity = kwargs.get('quantity', 1000)  # default is 1000

        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(DellProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
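A side note on the scheduler settings in this example: DEPTH_PRIORITY = 1 combined with the FIFO disk and memory queues switches Scrapy from its default LIFO (depth-first) crawl order to breadth-first. settings.overrides is the pre-1.0 settings API; on current Scrapy versions the rough equivalent is declarative, and the queue module is now spelled scrapy.squeues (with an "s"):

    # Rough modern equivalent (Scrapy >= 1.0), where settings.overrides no
    # longer exists; the spider name is hypothetical.
    import scrapy

    class MySpider(scrapy.Spider):
        name = 'my_spider'
        custom_settings = {
            'DEPTH_PRIORITY': 1,
            'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
            'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
        }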
Example #5
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.index = 1
        self.error_pagin = 0
        self.pages_pagin = []
        self.count_pagin_page = 0
        self.count_pagin_links = 0
        super(NeweggProductSpider, self).__init__(*args, **kwargs)
Example #6
    def __init__(self, search_sort='recommended', *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(HalfordsProductSpider, self).__init__(
            site_name=self.allowed_domains[0],
            url_formatter=FormatterWithDefaults(
                sort=self._SORT_MODES[search_sort]
            ),
            *args, **kwargs)
Example #7
    def __init__(self, search_sort='NEWEST', *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(HouseoffraserProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             url_formatter=FormatterWithDefaults(
                                 sort_mode=self._SORT_MODES[search_sort]),
                             *args,
                             **kwargs)
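Examples #6 and #7 additionally pass a url_formatter to the base class. FormatterWithDefaults pre-binds values for named placeholders, so when the base spider later formats SEARCH_URL with only search_term (as in Example #13's start_requests), the sort placeholder is already filled in. An illustrative sketch, assuming that behaviour and a made-up URL template:

    # Illustrative only: the URL template is made up, and FormatterWithDefaults
    # is assumed to supply defaults for placeholders not passed to format().
    SEARCH_URL = "http://www.example.com/search?q={search_term}&sort={sort}"
    formatter = FormatterWithDefaults(sort='recommended')
    print formatter.format(SEARCH_URL, search_term='bike')
    # -> http://www.example.com/search?q=bike&sort=recommended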
Example #8
    def __init__(self, sort_mode=None, *args, **kwargs):
        if sort_mode not in self.SORT_MODES:
            sort_mode = 'default'
        self.SORT = self.SORT_MODES[sort_mode]
        self.pages = dict()
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(HomebaseProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
Example #9
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        # officedepot seems to have a bot protection, so we first get the cookies
        # and parse the site with them after that
        self.proxy = None
        self.timeout = 60
        self.width = 1024
        self.height = 768
        self.selenium_cookies = {}
        self.user_agent = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
            ' (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')
        socket.setdefaulttimeout(60)
        self._get_selenium_cookies_for_main_page()
        if kwargs.get('scrape_variants_with_extra_requests'):
            self._extra_requests = True
        super(OfficedepotProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
Example #10
    def __init__(self, sort_mode=None, *args, **kwargs):
        from scrapy.conf import settings
        settings.overrides['DEPTH_PRIORITY'] = 1
        settings.overrides[
            'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
        settings.overrides[
            'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'
        settings.overrides['CRAWLERA_ENABLED'] = True

        self.quantity = kwargs.get('quantity', 1000)  # default is 1000

        self.proxy = 'content.crawlera.com:8010'
        self.proxy_type = 'http'
        #self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'

        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(NikeProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
Example #11
    def __init__(self, sort_mode=None, *args, **kwargs):
        self.buyer_reviews = BuyerReviewsBazaarApi(called_class=self)
        if sort_mode:
            if sort_mode.lower() not in self.SORT_MODES:
                self.log('"%s" not in SORT_MODES')
            else:
                self.SORTING = self.SORT_MODES[sort_mode.lower()]

        super(JcpenneyProductsSpider,
              self).__init__(url_formatter=FormatterWithDefaults(
                  sort_mode=self.SORTING or self.SORT_MODES['default']),
                             site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
        settings.overrides['CONCURRENT_REQUESTS'] = 1
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
Example #12
class DockersProductsSpider(BaseValidator, BaseProductsSpider):
    name = 'dockers_products'
    allowed_domains = ["dockers.com", "www.dockers.com"]
    start_urls = []

    settings = DockersValidatorSettings

    SEARCH_URL = "http://www.dockers.com/US/en_US/search?Ntt={search_term}"  # TODO: ordering

    PAGINATE_URL = ('http://www.dockers.com/US/en_US/includes/searchResultsScroll/?nao={nao}'
                    '&url=%2FUS%2Fen_US%2Fsearch%3FD%3D{search_term}%26Dx'
                    '%3Dmode%2Bmatchall%26N%3D4294961104%2B4294961101%2B4294965619%26Ntk'
                    '%3DAll%26Ntt%3D{search_term}%26Ntx%3Dmode%2Bmatchall')

    CURRENT_NAO = 0
    PAGINATE_BY = 12  # 12 products
    TOTAL_MATCHES = None  # for pagination

    total_matches = None

    REVIEW_URL = "http://dockers.ugc.bazaarvoice.com/2080-en_us/{product_id}" \
                 "/reviews.djs?format=embeddedhtml&page={index}&"

    RELATED_PRODUCT = "https://levis.tt.omtrdc.net/m2/levis/mbox/ajax?" \
                      "mboxHost=www.dockers.com" \
                      "&mboxSession=1481449902450-970396" \
                      "&mboxCount=1" \
                      "&entity.id={product_id}" \
                      "&entity.categoryId={product_categories}" \
                      "&mbox=target-global-mbox" \
                      "&mboxId=0" \
                      "&mboxURL={product_url}" \
                      "&mboxReferrer=http://www.dockers.com/" \
                      "&mboxVersion=60"

    use_proxies = True
    handle_httpstatus_list = [404]

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(DockersProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _init_firefox(self, proxy):
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        profile = webdriver.FirefoxProfile()
        #profile.set_preference("general.useragent.override", self.user_agent)
        profile.set_preference('intl.accept_languages', 'en-US')
        profile.set_preference("network.proxy.type", 1)  # manual proxy configuration
        if proxy:  # we assume only http proxies are accepted, format: http://host:port
            proxy, port = proxy.replace('http://', '').split(':')
            profile.set_preference("network.proxy.http", proxy)
            profile.set_preference("network.proxy.http_port", int(port))
        profile.update_preferences()
        driver = webdriver.Firefox(profile)
        driver.set_window_size(1280, 1024)
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        return driver

    def _is_product_page(self, response):
        return 'is_product_page' in response.meta

    def _get_product_links_from_serp(self, driver):
        result = []
        for l in driver.find_elements_by_xpath(
            '//li[contains(@class, "product-tile")]'
            '//a[contains(@rel, "product")]'
        ):
            href = l.get_attribute('href')
            if href:
                if not href.startswith('http'):
                    href = urlparse.urljoin('http://' + self.allowed_domains[0], href)
                result.append(href)
        return result

    @staticmethod
    def last_six_digits_the_same(lst):
        print lst
        if len(lst) < 7:
            return
        return len(set(lst[-6:])) == 1  # if all elements are the same, set's length will be 1

    def parse(self, response):
        proxy = response.request.meta.get('proxy', None)

        if not self._is_product_page(response):
            self.total_matches = self._scrape_total_matches(response)

            display = Display(visible=0, size=(1280, 1024))
            display.start()

            product_links = []
            # scrape "quantity" products
            driver = self._init_firefox(proxy=proxy)
            try:
                driver.get('http://www.dockers.com/US/en_US/')
            except Exception as e:
                print(str(e))
                self.log(str(e))
            driver.find_element_by_name('Ntt').send_keys(self.searchterms[0] + '\n')
            time.sleep(10)  # let AJAX finish
            new_meta = response.meta.copy()
            # get all products we need (scroll down)
            collected_products_len = []
            num_of_errors = 0
            while True:
                try:
                    driver.execute_script("scrollTo(0,50000)")
                    time.sleep(10)
                    product_links = self._get_product_links_from_serp(driver)
                    collected_products_len.append(len(product_links))
                    if self.last_six_digits_the_same(collected_products_len):
                        break  # last six iterations collected equal num of products
                    if len(product_links) > self.quantity:
                        break
                    print 'Collected %i product links' % len(product_links)
                    self.log('Collected %i product links' % len(product_links))
                    self.log('Statistics: %s' % report_statistics())
                except Exception as e:
                    print str(e)
                    self.log('Error while doing pagination %s' % str(e), WARNING)
                    num_of_errors += 1
                    if num_of_errors > 10:
                        self.log('Too many webdriver errors', ERROR)
                        driver.quit()
                        display.stop()
                        return

            #driver.save_screenshot('/tmp/1.png')
            new_meta['is_product_page'] = True
            for i, product_link in enumerate(product_links):
                new_meta['_ranking'] = i+1
                yield Request(product_link, meta=new_meta, callback=self.parse_product)

            driver.quit()
            display.stop()

    def parse_product(self, response):
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())
        if response.status == 404 or "www.dockers.com/US/en_US/error" in response.url:
            product.update({"not_found": True})
            product.update({"no_longer_available": True})
            product.update({"locale": 'en-US'})
            return product
        else:
            product.update({"no_longer_available": False})

        reqs = []
        meta['reqs'] = reqs

        product['ranking'] = response.meta.get('_ranking', None)
        product['total_matches'] = self.total_matches
        product['url'] = response.url
        product['site'] = self.allowed_domains[0]
        product['search_term'] = self.searchterms[0] if self.searchterms else None
        product['scraped_results_per_page'] = product['results_per_page'] = self.PAGINATE_BY

        # product id
        self.product_id = is_empty(response.xpath('//meta[@itemprop="model"]/@content').extract())

        # product data in json
        self.js_data = self.parse_data(response)

        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse model
        cond_set_value(product, 'model', self.product_id)

        reseller_id_regex = "p\/([^\/&?\.\s]+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title)

        # Parse image
        image = self.parse_image(response)
        cond_set_value(product, 'image_url', image)

        # Parse brand
        brand = self.parse_brand(response)
        cond_set_value(product, 'brand', brand)

        # Parse upc
        upc = self.parse_upc(response)
        cond_set_value(product, 'upc', upc)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse description
        description = self.parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse variants
        variants = self._parse_variants(response)
        product['variants'] = variants

        # Parse product_categories
        self.product_categories = self._extract_categories(response.body_as_unicode())

        response.meta['marks'] = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}
        real_count = is_empty(re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                                response.body_as_unicode()))
        response.meta['product'] = product
        meta = response.meta
        if real_count:
            # Parse buyer reviews
            if int(real_count) > 8:
                for index, i in enumerate(xrange(9, int(real_count) + 1, 30)):
                    reqs.append(
                        Request(
                            url=self.REVIEW_URL.format(product_id=self.product_id, index=index+2),
                            dont_filter=True,
                            callback=self.parse_buyer_reviews,
                            meta=meta
                        )
                    )

        reqs.append(
            Request(
                url=self.REVIEW_URL.format(product_id=self.product_id, index=0),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta
            ))

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _parse_variants(self, response):
        """
        Parses product variants.
        """
        dk = DockersVariants()
        dk.setupSC(response)
        variants = dk._variants()

        return variants

    def parse_buyer_reviews(self, response):
        meta = response.meta.copy()
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)

        for k, v in buyer_reviews_per_page['rating_by_star'].iteritems():
            response.meta['marks'][k] += v

        product = response.meta['product']
        reqs = meta.get('reqs', [])

        product['buyer_reviews'] = BuyerReviews(
            num_of_reviews=buyer_reviews_per_page['num_of_reviews'],
            average_rating=buyer_reviews_per_page['average_rating'],
            rating_by_star=response.meta['marks']
            )

        # Updated related product url, previous res-x doesn't work
        product_id = self.product_id + 'US'
        url = self.RELATED_PRODUCT.format(product_id=product_id,
                                          product_categories=self.product_categories,
                                          product_url=product.get('url'))
        reqs.append(
            Request(
                url=url,
                dont_filter=True,
                callback=self.parse_related_product,
                meta=meta
            ))

        return self.send_next_request(reqs, response)

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """

        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)

    def parse_brand(self, response):
        brand = is_empty(response.xpath(
            '//meta[@itemprop="brand"]/@content').extract())

        return brand

    def parse_title(self, response):
        title = response.xpath(
            '//meta[contains(@property,"og:title")]/@content').extract()
        if not title:
            title = response.xpath(
                '//meta[contains(@name,"og:title")]/@content').extract()
        return title

    def parse_data(self, response):
        data = re.findall(r'var buyStackJSON = \'(.+)\'; ', response.body_as_unicode())
        if data:
            data = re.sub(r'\\(.)', r'\g<1>', data[0])
            try:
                js_data = json.loads(data)
            except:
                return
            return js_data

    def parse_image(self, response):
        if self.js_data:
            try:
                image = self.js_data['colorid'][self.product_id]['gridUrl']
            except:
                return

            return image

    def parse_related_product(self, response):
        product = response.meta['product']
        text = self._extract_related_products_json(response.body_as_unicode())
        related_products = self._build_related_products_array(text, product)
        if related_products:
            product['related_products'] = {}
            product['related_products']['buyers_also_bought'] = related_products
        return product

    def parse_description(self, response):
        if self.js_data:
            try:
                description = self.js_data['colorid'][self.product_id]['name']
            except:
                return
            return description

    def parse_upc(self, response):
        if self.js_data:
            for v in self.js_data['sku'].values():
                upc = v['upc']
            upc = upc[-12:]
            if len(upc) < 12:
                count = 12-len(upc)
                upc = '0'*count+upc
            return upc

    def parse_sku(self, response):
        if self.js_data:
            for v in self.js_data['sku'].values():
                skuid = v['skuid']
            return skuid

    def parse_price(self, response):
        if self.js_data:
            price = self.js_data['colorid'][self.product_id]['price']
            for price_data in price:
                if price_data['il8n'] == 'now':
                    price = price_data['amount']
            currency = is_empty(re.findall(r'currency":"(\w+)"', response.body_as_unicode()))

            if price and currency:
                price = Price(price=price, priceCurrency=currency)
            else:
                price = Price(price=0.00, priceCurrency="USD")

            return price

    def _scrape_total_matches(self, response):
        totals = response.css('.productCount ::text').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        for link in response.xpath(
            '//li[contains(@class, "product-tile")]'
            '//a[contains(@rel, "product")]/@href'
        ).extract():
            yield link, SiteProductItem()

    def _get_nao(self, url):
        nao = re.search(r'nao=(\d+)', url)
        if not nao:
            return
        return int(nao.group(1))

    def _replace_nao(self, url, new_nao):
        current_nao = self._get_nao(url)
        if current_nao:
            return re.sub(r'nao=\d+', 'nao='+str(new_nao), url)
        else:
            return url+'&nao='+str(new_nao)

    def _scrape_next_results_page_link(self, response):
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            return
        if self.CURRENT_NAO > self.TOTAL_MATCHES+self.PAGINATE_BY:
            return  # it's over
        self.CURRENT_NAO += self.PAGINATE_BY

        return Request(
            self.PAGINATE_URL.format(
                search_term=response.meta['search_term'],
                nao=str(self.CURRENT_NAO)),
            callback=self.parse, meta=response.meta
        )

    @staticmethod
    def _extract_categories(body):
        pattern = re.compile('var\s+categoryIds\s*=\s*\'(.+?)\;')
        categories = pattern.search(body)
        return categories.group(1) if categories else None

    def _extract_related_products_json(self, body):
        pattern = re.compile('\_AT\.applyWhenReady\(\s*\[\s*({.+?})\s*\]\s*\)\s*;', re.DOTALL)
        related_products_json = pattern.search(body)
        data = related_products_json.group(1) if related_products_json else None
        try:
            data = json.loads(data).get('content')
            return data
        except Exception as e:
            self.log('{}'.format(e.message))
            return None

    @staticmethod
    def _build_related_products_array(text, product):
        s = Selector(text=text)
        related_products = []
        product_url = product.get('url')
        for element in s.xpath('//li[contains(@class, "imagegrid")]'):
            url = element.xpath('.//a/@href').extract()
            title = element.xpath('.//p[@class="name"]/text()').extract()
            if url and title:
                url = urlparse.urljoin(product_url, url[0])
                title = title[0]
                related_products.append(RelatedProduct(url=url, title=title))
        return related_products
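Two details of this spider are easy to miss. First, the follow-up requests (reviews, then related products) are not yielded all at once: send_next_request pops them one by one from a queue carried in response.meta['reqs'], so they execute sequentially against the same product item. Second, the review requests are created with dont_filter=True so they are not discarded by Scrapy's filtering (the Bazaarvoice host is not in this spider's allowed_domains). A minimal sketch of the chaining idiom (URLs and callback names are illustrative):

    # Sketch of the send_next_request() chaining idiom; names are illustrative.
    def parse_product(self, response):
        reqs = [
            Request('http://example.com/reviews', callback=self.parse_step,
                    dont_filter=True, meta=response.meta),
            Request('http://example.com/related', callback=self.parse_step,
                    dont_filter=True, meta=response.meta),
        ]
        response.meta['reqs'] = reqs
        return self.send_next_request(reqs, response)

    def parse_step(self, response):
        reqs = response.meta.get('reqs', [])
        # ... update response.meta['product'] from this response ...
        if reqs:
            return self.send_next_request(reqs, response)
        return response.meta['product']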
Example #13
class DellProductSpider(BaseProductsSpider):
    name = 'dell_products'
    allowed_domains = ["dell.com", "recs.richrelevance.com"]

    handle_httpstatus_list = [404, 403, 502, 520]

    SEARCH_URL = "http://pilot.search.dell.com/{search_term}"
    REVIEW_URL = "http://reviews.dell.com/2341_mg/{product_id}/reviews.htm?format=embedded"
    VARIANTS_URL = "http://www.dell.com/api/configService.svc/postmoduleoverrides/json"
    VARIANTS_DATA = {
        'c': 'us',
        'l': 'en',
        's': 'dhs',
        'cs': '19',
        'moduleTemplate': 'products/ProductDetails/mag/config_popup_mag',
        'modErrorTemplate': 'products/module_option_validation',
        'resultType': 'SingleModule',
        'productCode': 'undefined'
    }
    # there are two types of product pages, each of them requires different related products processing
    RELATED_PROD_URL_V1 = (
        "http://recs.richrelevance.com/rrserver/p13n_generated.js?"
        "pt=|item_page.mag_syspdpoc1|item_page.mag_syspdpoc2|item_page.mag_syspdpoc3|item_page.mag_syspdpoc4|item_page.mag_syspdpoc5&"
        "a=usdhsa5d5af7012d61fd1&rid=us_19_en_dhs&sgs=|us_19_en_dhs:us_19_en_dhs&flv=15.0.0&"
        "s=undefined{date}&n={n}&chi={chi}&ts={ts}&p={p}")
    RELATED_PROD_URL_V2 = (
        "http://recs.richrelevance.com/rrserver/p13n_generated.js?"
        "pt=|item_page.storm_snp_pdp1|item_page.storm_snp_pdp2|item_page.storm_snp_pdp3|item_page.storm_snp_pdp4|item_page.storm_snp_pdp5&"
        "sgs=|us_04_en_bsd:us_04_en_bsd&rid=us_04_en_bsd&flv=11.2.999&l=1&"
        "u=ykOA15fokzi417dpJeveUF65A0NwWJeGhQ6pvWEfbCuYOurQKpNgzVVXCdsYKqf4&"
        "s=ykOA15fokzi417dpJeveUF65A0NwWJeGhQ6pvWEfbCuYOurQKpNgzVVXCdsYKqf4{date}&"
        "a=usbsda5d5af7012d61fd1&ts={ts}&p={p}")

    def __init__(self, sort_mode=None, *args, **kwargs):
        from scrapy.conf import settings
        settings.overrides['DEPTH_PRIORITY'] = 1
        settings.overrides[
            'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
        settings.overrides[
            'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'

        self.quantity = kwargs.get('quantity', 1000)  # default is 1000

        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(DellProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def start_requests(self):
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote(st.encode('utf-8')),
                ),
                meta={
                    'search_term': st,
                    'remaining': self.quantity
                },
            )

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod})

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _get_product_links_from_serp(self, driver):
        results = []
        links = driver.find_elements_by_xpath(
            '//h4/../../a[contains(@href, "/")]')
        for l in links:
            href = l.get_attribute('href')
            if href:
                if not href.startswith('http'):
                    href = urlparse.urljoin(
                        'http://' + self.allowed_domains[0], href)
                results.append(href)
        return results

    def _is_product_page(self, response):
        return 'is_product_page' in response.meta

    def _init_webdriver(self):
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        driver = webdriver.Firefox()
        driver.set_window_size(1280, 1024)
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        return driver

    def parse(self, response):

        if not self._is_product_page(response):
            product_links = []
            # scrape "quantity" products
            display = Display(visible=0, size=(1280, 1024))
            display.start()
            driver = self._init_webdriver()
            driver.get(response.url)
            time.sleep(6)  # let AJAX finish
            new_meta = response.meta.copy()
            # get all products we need (or till the "show more products" button exists)
            paging_button = '//button[contains(@id, "paging-button")]'
            num_of_errors = 0
            while driver.find_elements_by_xpath(paging_button):
                try:
                    button = driver.find_elements_by_xpath(paging_button)
                    button[0].click()
                    time.sleep(4)
                    product_links = self._get_product_links_from_serp(driver)
                    if len(product_links) > self.quantity:
                        break
                    print 'Collected %i product links' % len(product_links)
                    self.log('Collected %i product links' % len(product_links))
                except Exception as e:
                    print str(e)
                    self.log('Error while doing pagination: %s' % str(e),
                             WARNING)
                    num_of_errors += 1
                    if num_of_errors > 10:
                        self.log('Too many webdriver errors', ERROR)
                        driver.quit()
                        display.stop()
                        return

            #driver.save_screenshot('/tmp/1.png')
            new_meta['is_product_page'] = True
            for i, product_link in enumerate(product_links):
                new_meta['_ranking'] = i + 1
                yield Request(product_link,
                              meta=new_meta,
                              callback=self.parse_product)

            driver.quit()
            try:
                display.stop()
            except Exception as e:
                self.log('Exception on display.stop(): [%s]' % str(e))

    @staticmethod
    def _parse_price(response):
        dell_price = response.xpath('//*[contains(text(), "Dell Price")]')
        dell_price = re.search(
            '\$([\d,]+\.\d+)',
            ''.join(dell_price.xpath('./..//text()').extract()))
        if dell_price:
            dell_price = dell_price.group(1)
            price = Price(price=dell_price, priceCurrency='USD')
            return price
        price = response.xpath('//*[contains(@name, "pricing_sale_price")]'
                               '[contains(text(), "$")]//text()').extract()
        if not price:
            price = response.xpath(
                '//*[contains(@name, "pricing_retail_price")]'
                '[contains(text(), "$")]//text()').extract()
        if price:
            price = Price(price=price[0].strip().replace('$', ''),
                          priceCurrency='USD')
            return price

    @staticmethod
    def _parse_image(response):
        img_src = response.xpath(
            '//*[contains(@id, "product_main_image")]'
            '//img[contains(@src, ".jp")]/@src').extract()
        if not img_src:
            img_src = response.xpath(
                '//*[contains(@class, "oneImageUp")]'
                '//img[contains(@src, ".jp")]/@src').extract()
        if not img_src:
            img_src = response.xpath(
                '//*[contains(@class, "leftRightMainImg")]'
                '//img[contains(@src, ".jp")]/@src').extract()
        if not img_src:
            img_src = response.xpath(
                '//*[contains(@class, "oneImageUp")]'
                '//img[contains(@data-original, ".jp")]/@data-original'
            ).extract()
        if img_src:
            return img_src[0]

    @staticmethod
    def _parse_brand(response, prod_title):
        # <meta itemprop="brand" content = "DELL"/>
        brand = response.xpath(
            '//meta[contains(@itemprop, "brand")]/@content').extract()
        if not brand:
            brand = response.xpath(
                '//a[contains(@href, "/brand.aspx")]/img/@alt').extract()
        if brand:
            return brand[0].title()
        if prod_title:
            brand = guess_brand_from_first_words(prod_title)
            if not brand:
                prod_title = prod_title.replace('New ', '').strip()
                brand = guess_brand_from_first_words(prod_title)
            if brand:
                return brand

    @staticmethod
    def _parse_description(response):
        desc = response.xpath('//*[@id="cntTabsCnt"]').extract()
        if not desc:
            desc = response.xpath(
                './/*[@id="AnchorZone3"]'
                '//div[not(contains(@class, "anchored_returntotop"))]'
            ).extract()
        if desc:
            return desc[0]

    def _related_products(self, response):
        results = []
        rps = response.xpath(
            '//*[contains(@class, "psItemDescription")]//'
            'div[contains(@class, "psTeaser")]//a[contains(@href, "productdetail.aspx")]'
        )
        for rp in rps:
            results.append(
                RelatedProduct(
                    rp.xpath('text()').extract()[0].strip(),
                    rp.xpath('@href').extract()
                    [0].strip()))  # TODO: check if it's a valid format
        # TODO: scrape dynamic related products
        return results

    def parse_buyer_reviews(self, response):
        product = response.meta['product']
        buyer_reviews = self.br.parse_buyer_reviews_per_page(response)
        product['buyer_reviews'] = buyer_reviews
        yield product

    def _get_stock_status(self, response, product):
        oos_element = response.xpath(
            '//a[contains(@class, "smallBlueBodyText")]'
            '[contains(@href, "makeWin")]//text()').extract()
        if oos_element:
            oos_element = oos_element[0].lower()
            if ('temporarily out of stock' in oos_element
                    or 'pre-order' in oos_element):
                product['is_out_of_stock'] = True
                return product
            if 'limited supply available' in oos_element:
                product['is_out_of_stock'] = False
                product['limited_stock'] = LimitedStock(is_limited=True,
                                                        items_left=-1)
                return product

    @staticmethod
    def _get_product_id(response):
        prod_id = re.findall(':productdetails:([\da-zA-Z\-\.]{1,50})\",',
                             response.body_as_unicode())
        if prod_id:
            return prod_id[0]

    def parse_product(self, response):
        prod = response.meta.get('product', SiteProductItem())

        prod['_subitem'] = True

        _ranking = response.meta.get('_ranking', None)
        prod['ranking'] = _ranking
        prod['url'] = response.url

        cond_set(prod, 'title', response.css('h1 ::text').extract())
        prod['price'] = DellProductSpider._parse_price(response)
        prod['image_url'] = DellProductSpider._parse_image(response)

        prod['description'] = DellProductSpider._parse_description(response)
        prod['brand'] = DellProductSpider._parse_brand(response,
                                                       prod.get('title', ''))
        prod['related_products'] = self._related_products(response)
        response.meta['product'] = prod
        is_links, variants = self._parse_variants(response)
        if is_links:
            yield variants.pop(0)
        else:
            cond_set_value(prod, 'variants',
                           self._collect_variants_from_dict(variants))

        if 'This product is currently unavailable.' in response.body_as_unicode():
            prod['is_out_of_stock'] = True
        else:
            yield self._get_stock_status(response,
                                         prod)  # this should be OOS field

        meta = {'product': prod}
        prod_id = self._get_product_id(response)
        if prod_id:  # first page type
            if response.css('#bazaarVoice').extract():
                meta.update({'br_page_type': 1})
                yield Request(  # reviews request
                    url=self.REVIEW_URL.format(product_id=prod_id),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta)
        buyer_reviews_iframe_src = response.xpath(
            '//iframe[contains(@src,"reviews.htm")]/@src').extract()
        if buyer_reviews_iframe_src:  # second page type
            meta.update({'br_page_type': 2})
            yield Request(  # reviews request
                url=buyer_reviews_iframe_src[0].replace('format=noscript', ''),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta)

        try:
            r_url = self.RELATED_PROD_URL_V1
            related_data = self._collect_related_products_data_v1(response)
        except Exception:
            r_url = self.RELATED_PROD_URL_V2
            related_data = self._collect_related_products_data_v2(response)
        yield Request(  # related products request
            r_url.format(**related_data),
            callback=self._parse_related_products,
            meta=meta)

        yield prod

    def _collect_common_variants_data(self, response):
        data = self.VARIANTS_DATA.copy()
        _ = is_empty(
            response.xpath('//meta[@name="Country"]/@content').extract())
        if _:
            data['c'] = _
        _ = is_empty(
            response.xpath('//meta[@name="Language"]/@content').extract())
        if _:
            data['l'] = _
        _ = is_empty(
            response.xpath('//meta[@name="Segment"]/@content').extract())
        if _:
            data['s'] = _
        _ = is_empty(
            response.xpath('//meta[@name="CustomerSet"]/@content').extract())
        if _:
            data['cs'] = _
        _ = is_empty(
            response.xpath('//meta[@name="currentOcId"]/@content').extract())
        if _:
            data['oc'] = _
        else:
            self.log('No "OC" and/or "modId data found" <%s>' % response.url,
                     WARNING)
            return None
        return data

    def _collect_specific_variants_data(self, variant, common_data):
        data = common_data.copy()
        oc = data.get('oc')
        if not oc:
            self.log('No OC data', ERROR)
        uniq_id = is_empty(
            variant.xpath(
                '//input[@value="%s"][contains(@id, "OrderCode")]/@id' %
                oc).extract())
        uniq_id = uniq_id.replace('OrderCode', '')
        mod_id = is_empty(
            variant.xpath('.//span[contains(@class,"spec~%s~")]/@class' %
                          uniq_id).extract())
        mod_id = mod_id.split('~')[-1]
        data['modId'] = mod_id
        data['uiParameters'] = 'mainModuleId=%s&uniqueId=%s' % (mod_id,
                                                                uniq_id)
        return data

    def _collect_variants_from_dict(self, variants):
        if not variants:
            return None
        max_options = 4
        _variants = OrderedDict()
        keys = sorted(variants.keys()[:max_options])
        for tmp in keys:
            _variants[tmp] = variants[tmp]
        options = product(*_variants.values()[:max_options])
        data = []
        for option in options:
            tmp = {}
            for i, key in enumerate(keys):
                tmp[key] = option[i]
            data.append(
                dict(in_stock=None, price=None, selected=None, properties=tmp))
        return data

    def _parse_variant_data(self, response):
        json_resp = hjson.loads(response.body_as_unicode())
        html = json_resp['ModulesHtml']
        html = Selector(text=html)
        add_requests = response.meta.get('additional_requests')
        variants = response.meta['variants']
        cur_var = response.meta['cur_variant']
        choices = html.css('.catContent .optDescription::text').extract()
        variants[cur_var] = choices
        if add_requests:
            next_request = add_requests.pop(0)
            return next_request
        vs = self._collect_variants_from_dict(variants)
        prod = response.meta['product']
        prod['variants'] = vs
        return prod

    def _parse_variants(self, response):
        variants_exist = bool(response.css('#Configurations').extract())
        if variants_exist:
            common_req_params = self._collect_common_variants_data(response)
            variants_names = response.xpath(
                '//div[contains(@class, "specContent")]')
            data = {}
            additional_requests = []
            for v_n in variants_names:
                k = is_empty(
                    v_n.xpath(
                        'normalize-space(preceding-sibling::div[@class="specTitle"][1]/h5/text())'
                    ).extract())
                v = ' '.join(v_n.xpath('span/text()').extract())
                is_ajax = bool(v_n.xpath('div[@class="dropdown"]').extract())
                if is_ajax:
                    form_data = self._collect_specific_variants_data(
                        v_n, common_req_params)
                    meta = response.meta.copy()
                    meta['variants'] = data
                    meta['cur_variant'] = k
                    meta['additional_requests'] = additional_requests
                    meta['product'] = response.meta['product']
                    additional_requests.append(
                        FormRequest(self.VARIANTS_URL,
                                    callback=self._parse_variant_data,
                                    formdata=form_data,
                                    meta=meta))
                else:
                    data[k] = [v]
            if additional_requests:
                return True, additional_requests
            else:
                return False, data
        return None, None

    def _collect_related_products_data_v1(self, response):
        data = dict()
        cur_date = datetime.now()
        js_node = response.xpath(
            '//div[@id="mbox_default"]/following-sibling::script[1]')
        js_data = js_node.xpath('following-sibling::script[1]/text()').re(
            'profile = (\{.*\})')
        js_data = hjson.loads(js_data[0])
        data['p'] = is_empty(
            response.css('meta[name=currentOcId]::attr(content)').extract())
        data['date'] = cur_date.today().strftime('%Y%m%d')
        data['ts'] = '%s000' % int(time.mktime(cur_date.timetuple()))
        data['n'] = js_data['catid']
        data['chi'] = is_empty(
            js_node.xpath('text()').re("'profile.catid=(.*?)'"))
        return data

    def _collect_related_products_data_v2(self, response):
        data = dict()
        js_data = response.xpath(
            'normalize-space(/html/head/script[@type="text/javascript"][1]/text())'
        ).re('\{.*\}')
        js_data = hjson.loads(js_data[0])
        cur_date = datetime.now()
        data['date'] = cur_date.today().strftime('%Y%m%d')
        data['ts'] = '%s000' % int(time.mktime(cur_date.timetuple()))
        data['p'] = js_data['CJ']['ORDERCODE'].lower()
        return data

    def _parse_related_products(self, response):
        prod = response.meta['product']
        html = re.search(r"html:'(.+?)'\}\]\},", response.body_as_unicode())
        if not html:
            return prod
        html = Selector(text=html.group(1))
        key_name = is_empty(html.css('.rrStrat::text').extract())
        items = html.css('.rrRecs > ul > li')
        rel_prods = []
        for item in items:
            title = is_empty(item.css('.rrItemName > a ::text').extract())
            url = is_empty(item.css('a.rrLinkUrl::attr(href)').extract())
            url = urlparse.urlparse(url)
            qs = urlparse.parse_qs(url.query)
            url = is_empty(qs['ct'])
            rel_prods.append(RelatedProduct(title=title, url=url))
        prod['related_products'] = {key_name: rel_prods}
        return prod
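One helper in this example is worth unpacking: _collect_variants_from_dict builds the cartesian product of up to four option lists (via itertools.product) and emits one variant dict per combination. A small worked example with illustrative values:

    # Worked example (illustrative values) of _collect_variants_from_dict():
    variants = {'Color': ['Black', 'Silver'], 'RAM': ['8GB', '16GB']}
    # With keys sorted to ['Color', 'RAM'], itertools.product() yields
    # (Black, 8GB), (Black, 16GB), (Silver, 8GB), (Silver, 16GB), producing:
    # [{'in_stock': None, 'price': None, 'selected': None,
    #   'properties': {'Color': 'Black', 'RAM': '8GB'}},
    #  {'in_stock': None, 'price': None, 'selected': None,
    #   'properties': {'Color': 'Black', 'RAM': '16GB'}},
    #  ...]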
Example #14
    def __init__(self, *args, **kwargs):
        super(ToysrusProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)
Example #15
class CvsProductsSpider(BaseProductsSpider):
    name = 'cvs_products'
    allowed_domains = ["cvs.com", "api.bazaarvoice.com"]
    start_urls = []

    SEARCH_URL = "https://www.cvs.com/search/N-0?searchTerm={search_term}"

    SEARCH_URL_AJAX = "https://www.cvs.com/" \
                      "retail/frontstore/OnlineShopService?" \
                      "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                      "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                      "appName=CVS_WEB&" \
                      "channelName=WEB&" \
                      "contentZone=resultListZone&" \
                      "deviceToken=7780&" \
                      "deviceType=DESKTOP&" \
                      "lineOfBusiness=RETAIL&" \
                      "navNum=20&" \
                      "operationName=getProductResultList&" \
                      "pageNum={page_num}&" \
                      "referer={referer}&" \
                      "serviceCORS=False&" \
                      "serviceName=OnlineShopService&" \
                      "sortBy=relevance&" \
                      "version=1.0" \


    REVIEW_URL = "http://api.bazaarvoice.com/data/products.json?" \
                 "passkey=ll0p381luv8c3ler72m8irrwo&apiversion=5.5&" \
                 "filter=id:{product_id}&stats=reviews"

    PRICE_URL = "https://www.cvs.com/retail/frontstore/productDetails?" \
                "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                "appName=CVS_WEB&" \
                "channelName=WEB&" \
                "code={sku}&" \
                "codeType=sku&" \
                "deviceToken=2695&" \
                "deviceType=DESKTOP&" \
                "lineOfBusiness=RETAIL&" \
                "operationName=getSkuPricePromotions&" \
                "serviceCORS=True&" \
                "serviceName=productDetails&" \
                "storeId=2294&" \
                "version=1.0" \

    PRODUCT_DETAILS = "https://www.cvs.com/retail/frontstore/productDetails?" \
                      "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                      "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                      "appName=CVS_WEB&" \
                      "channelName=WEB&" \
                      "code={sku}&" \
                      "codeType=sku&" \
                      "deviceToken=2695&" \
                      "deviceType=DESKTOP&" \
                      "lineOfBusiness=RETAIL&" \
                      "operationName=getSkuDetails&" \
                      "serviceCORS=True&" \
                      "serviceName=productDetails&" \
                      "version=1.0"

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.referer = None
        self.first_time_products = None
        self.current_page = 1
        self.products_per_page = 20
        super(CvsProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)
        settings.overrides['CRAWLERA_ENABLED'] = True

    def _set_brand(self, product, phrase, brands):
        phrase = _normalize(phrase)
        for brand in sorted(brands, key=len, reverse=True):
            if _normalize(brand) in phrase:
                cond_set_value(product, 'brand', brand)
                break

    def parse(self, response):
        print response.url
        if self.searchterms and not self.referer:
            self.referer = response.url
        return super(CvsProductsSpider, self).parse(response)

    def parse_product(self, response):
        brands = response.meta.get('brands', frozenset())
        product = response.meta['product']
        reqs = []

        if 'brand' not in product:
            descs = response.css('.brandBanner > a ::attr(title)')
            if descs:
                desc, = descs.extract()
                self._set_brand(product, desc, brands)
        product['locale'] = "en-US"

        reseller_id_regex = "prodid-(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        ld_json = is_empty(
            response.xpath('//*[@type="application/ld+json" '
                           'and contains(text(),"product")]/text()').extract())
        if ld_json:
            try:
                clean_json = re.sub('([^"])\n|\t|\r', '',
                                    ld_json.replace('@', ''))
                product_data = json.loads(clean_json)

                cond_set_value(product, 'title', product_data.get('name'))
                cond_set_value(product, 'brand', product_data.get('brand'))
                ########  variants ########
                variants = product_data.get('offers')
                if len(variants) > 1:
                    for variant in variants:
                        try:
                            sku = variant['itemOffered']['sku']
                            price_url = self.PRICE_URL.format(sku=sku)
                            reqs.append(
                                Request(price_url,
                                        self._parse_variant_new,
                                        meta=response.meta))
                        except:
                            pass

                main_variant = variants[0]
                description = main_variant.get(
                    'itemOffered',
                    {}).get('description') or product_data.get('description')
                cond_set_value(product, 'description', description)

                main_skuID_search = re.search("skuId=(\d+)", response.url)
                if main_skuID_search:
                    main_skuID = main_skuID_search.group(1)
                else:
                    main_skuID = variants[0].get('itemOffered',
                                                 {}).get('sku', None)

                cond_set_value(product, 'image_url',
                               main_variant.get('itemOffered').get('image'))
                response.meta['main_skuID'] = main_skuID
                response.meta['offers_variants'] = variants

                if main_variant.get('price'):
                    cond_set_value(
                        product, 'price',
                        Price(price=main_variant.get('price'),
                              priceCurrency='USD'))

                # elif product_data.get('productId'):
                #     price_url = self.PRICE_URL.format(
                #         price_id=product_data.get('productId'))
                #     reqs.append(Request(price_url,
                #                         self._parse_price,
                #                         meta=response.meta))

                # cond_set_value(product, 'variants',
                #                self._parse_variants(variants, main_skuID))

                ##############################
                if main_skuID:
                    review_url = self.REVIEW_URL.format(product_id=main_skuID)
                    reqs.append(
                        Request(review_url,
                                self._parse_review,
                                meta=response.meta))

            except:
                import traceback
                traceback.print_exc()

        size = response.xpath(
            "//form[@id='addCart']/table/tr/td[@class='col1']/"
            "text()[.='Size:']/../../td[2]/text()").extract()
        cond_set(product, 'model', size, conv=string.strip)

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def _parse_variant_new(self, response):
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        data = json.loads(response.body)

        sku_price_promotions = data.get('response',
                                        {}).get('getSkuPricePromotions', [])

        sku_details = []
        if sku_price_promotions:
            sku_details = sku_price_promotions[0].get('skuDetails', [])

        if sku_details:
            variants = product.get('variants', [])
            variant = {}
            skuID = sku_details[0].get('skuId', '')
            variant['url'] = product.get('url', '') + "?skuId=%s" % skuID

            price = sku_details[0].get('priceInfo', {}).get('listPrice', None)
            if price:
                cond_set_value(product, 'price',
                               Price(price=price, priceCurrency='USD'))

            variant['price'] = price
            main_skuID = response.meta['main_skuID']
            variant['selected'] = main_skuID == skuID
            bohInventory = sku_details[0].get('statusInfo',
                                              {}).get('bohInventory', 0)
            bohStockStatus = sku_details[0].get('statusInfo', {}).get(
                'bohStockStatus', 'NOTAVAILABLE')
            onlineOnly = sku_details[0].get('statusInfo',
                                            {}).get('onlineOnly', False)
            onlineStockStatus = sku_details[0].get('statusInfo', {}).get(
                'onlineStockStatus', None)
            in_stock = False
            if bohInventory and bohStockStatus != 'NOTAVAILABLE':
                in_stock = True
            if onlineStockStatus == 'INSTOCK':
                in_stock = True
            variant['in_stock'] = in_stock
            variant['sku'] = skuID
            # del product['main_skuID']
            variant['properties'] = {}
            offers_variants = response.meta['offers_variants']
            for offers_variant in offers_variants:
                # Check that the variant is not duplicated
                item_offered = offers_variant.get('itemOffered', {})
                this_sku = item_offered.get('sku', None)
                if item_offered and this_sku == skuID:
                    attr = {}
                    details_url = self.PRODUCT_DETAILS.format(sku=this_sku)
                    variant['properties'] = attr
                    reqs.append(
                        Request(details_url,
                                self._parse_properties,
                                meta=response.meta))
                    break

            variants.append(variant)
            product['variants'] = variants

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _parse_properties(self, response):
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        data = json.loads(response.body)

        getSkuDetails = data.get('response', {}).get('getSkuDetails', [])

        sku_details = []
        if getSkuDetails:
            sku_details = getSkuDetails[0].get('skuDetails', [])

        if len(sku_details) > 0:
            detail = sku_details[0]['detail']
            skuSize = detail['skuSize']
            weight = detail['weight']
            flavor = detail['flavor']
            upcNumber = detail['upcNumber']

            variants = product.get('variants', [])
            skuID = sku_details[0].get('skuId', '')

            for idx, variant in enumerate(variants):
                # Check that the variant is not duplicated
                this_sku = variant.get('sku', None)
                if this_sku == skuID:
                    attr = {}
                    attr['Size'] = skuSize
                    attr['Flavor'] = flavor
                    attr['Weight'] = weight
                    attr['UPCNumber'] = upcNumber
                    variant['properties'] = attr
                    variants[idx] = variant
                    break

            product['variants'] = variants

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _parse_variants(self, variants, main_skuID):
        if not variants:
            return None

        parsed_variants = []
        variants_visit = set()
        for variant in variants:
            # Check that the variant is not duplicated
            item_offered = variant.get('itemOffered', {})
            this_sku = item_offered.get('sku', None)
            if this_sku in variants_visit:
                continue
            variants_visit.add(this_sku)

            # Fill the Variant data
            vr = {}
            if variant['price']:
                vr['price'] = variant['price']
            availability = variant.get('availability', None)
            vr['in_stock'] = availability == "http://schema.org/InStock"
            vr['selected'] = main_skuID == this_sku
            if item_offered:
                attr = {}
                if item_offered.get('color'):
                    attr['Color'] = item_offered.get('color')
                if item_offered.get('weight'):
                    attr['Weight'] = item_offered.get('weight', {}).get('value')
                vr['properties'] = attr
                vr['url'] = item_offered.get('url')
            parsed_variants.append(vr)

        # Fall back to the first variant only when none matched the main SKU.
        if not any(v.get('selected') for v in parsed_variants):
            parsed_variants[0]['selected'] = True
        return parsed_variants

    def _parse_review(self, response):
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(
            response)
        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _scrape_total_matches(self, response):
        totals = response.xpath(
            '//*[@id="resultsTabs"]//'
            'a[@title="View Products"]/text()').re(r'\((\d+)\)')
        if len(totals) > 1:
            self.log(
                "Found more than one 'total matches' for %s" % response.url,
                ERROR)
        elif totals:
            total = totals[0].strip()
            self.total_matches_int = int(total)
            return int(total)
        else:
            self.log("Failed to find 'total matches' for %s" % response.url,
                     WARNING)
        return None

    def _scrape_product_links(self, response):
        all_links_iter = re.finditer(
            r'detailsLink"\s*:\s*"(.*?)(\?skuId=\d+)?",', response.body)

        # Deduplicate the links produced by a product's variants while
        # preserving their order of appearance.
        links_without_dup = []
        for match in all_links_iter:
            link = match.group(1)
            if link not in links_without_dup:
                links_without_dup.append(link)
        for link in links_without_dup:
            yield link, SiteProductItem()

    def _scrape_results_per_page(self, response):
        return 20

    def _scrape_next_results_page_link(self, response):
        ajax_search_url = self.SEARCH_URL_AJAX.format(
            referer=urllib.quote_plus(self.referer, ':'),
            page_num=self.current_page)
        self.current_page += 1

        # Stop paginating once the next page would start beyond the total
        # match count (with a small buffer of extra results).
        if self.current_page * self.products_per_page > self.total_matches_int + 30:
            return

        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.cvs.com',
            'Pragma': 'no-cache',
            'Referer': self.referer,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                          ' AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/49.0.2623.110 Safari/537.36'
        }

        return Request(ajax_search_url,
                       self.parse,
                       headers=headers,
                       meta=response.meta,
                       priority=1)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        prods_per_page = response.meta.get('products_per_page')
        total_matches = response.meta.get('total_matches')
        scraped_results_per_page = response.meta.get(
            'scraped_results_per_page')

        prods = self._scrape_product_links(response)

        if not prods_per_page:
            # Materialize prods to get its size.
            prods = list(prods)
            prods_per_page = len(prods)
            response.meta['products_per_page'] = prods_per_page

        if scraped_results_per_page is None:
            scraped_results_per_page = self._scrape_results_per_page(response)
            if scraped_results_per_page:
                self.log(
                    "Found %s products at the first page" %
                    scraped_results_per_page, INFO)
            else:
                scraped_results_per_page = prods_per_page
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to scrape number of products per page",
                            ERROR)
            response.meta[
                'scraped_results_per_page'] = scraped_results_per_page

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" %
                            response.url, ERROR)

        if total_matches and not prods_per_page:
            # Parsing the page failed. Give up.
            self.log("Failed to get products for %s" % response.url, ERROR)
            return

        for i, (prod_url, prod_item) in enumerate(islice(prods, 0, remaining)):
            # Initialize the product as much as possible.
            prod_item['site'] = self.site_name
            prod_item['search_term'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['results_per_page'] = prods_per_page
            prod_item['scraped_results_per_page'] = scraped_results_per_page
            # The ranking is the position in this page plus the number of
            # products from other pages.
            prod_item['ranking'] = (i + 1) + (self.quantity - remaining)
            if self.user_agent_key not in ["desktop", "default"]:
                prod_item['is_mobile_agent'] = True

            if prod_url is None:
                # The product is complete, no need for another request.
                yield prod_item
            elif isinstance(prod_url, Request):
                cond_set_value(prod_item, 'url', prod_url.url)  # Tentative.
                yield prod_url
            else:
                # Another request is necessary to complete the product.
                url = urlparse.urljoin(response.url, prod_url)
                cond_set_value(prod_item, 'url', url)  # Tentative.
                yield Request(url,
                              callback=self.parse_product,
                              meta={'product': prod_item})
Beispiel #16
0
class LeviProductsSpider(BaseValidator, BaseProductsSpider):
    name = 'levi_products'
    allowed_domains = ["levi.com", "www.levi.com"]
    start_urls = []

    settings = LeviValidatorSettings

    SEARCH_URL = "http://www.levi.com/US/en_US/search?Ntt={search_term}"  # TODO: ordering

    PAGINATE_URL = (
        'http://www.levi.com/US/en_US/includes/searchResultsScroll/?nao={nao}'
        '&url=%2FUS%2Fen_US%2Fsearch%3FD%3D{search_term}%26Dx'
        '%3Dmode%2Bmatchall%26N%3D4294960840%2B4294961101%2B4294965619%26Ntk'
        '%3DAll%26Ntt%3Ddress%26Ntx%3Dmode%2Bmatchall')

    CURRENT_NAO = 0
    PAGINATE_BY = 12  # 12 products
    TOTAL_MATCHES = None  # for pagination

    REVIEW_URL = "http://levistrauss.ugc.bazaarvoice.com/9090-en_us/" \
                 "{product_id}/reviews.djs?format=embeddedhtml&page={index}&"

    RELATED_PRODUCT = "http://www.res-x.com/ws/r2/Resonance.aspx?" \
                      "appid=levi01&tk=811541814822703" \
                      "&ss=544367773691192" \
                      "&sg=1&" \
                      "&vr=5.3x&bx=true" \
                      "&sc=product4_rr" \
                      "&sc=product3_rr" \
                      "&sc=product1_r" \
                      "r&sc=product2_rr" \
                      "&ev=product&ei={product_id}" \
                      "&no=20" \
                      "&language=en_US" \
                      "&cb=certonaResx.showResponse" \
                      "&ur=http%3A%2F%2Fwww.levi.com%2FUS%2Fen_US%" \
                      "2Fwomens-jeans%2Fp%2F095450043&plk=&"

    handle_httpstatus_list = [404]

    use_proxies = True

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(LeviProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def parse_product(self, response):
        product = response.meta.get('product', SiteProductItem())

        if response.status == 404 or 'This product is no longer available' in response.body_as_unicode() \
                or "www.levi.com/US/en_US/error" in response.url:
            product.update({"not_found": True})
            product.update({"no_longer_available": True})
            return product

        reqs = []

        # product id
        self.product_id = is_empty(
            response.xpath('//meta[@itemprop="model"]/@content').extract())

        # product data in json
        self.js_data = self.parse_data(response)

        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse model
        cond_set_value(product, 'model', self.product_id)

        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title)

        # Parse image
        image = self.parse_image(response)
        cond_set_value(product, 'image_url', image)

        # Parse brand
        brand = self.parse_brand(response)
        cond_set_value(product, 'brand', brand)

        # Parse upc
        upc = self.parse_upc(response)
        cond_set_value(product, 'upc', upc)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse description
        description = self.parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)

        reseller_id_regex = r"p\/(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        try:
            variants = self._parse_variants(response)
        except KeyError:
            product['not_found'] = True
            return product

        # set reseller_id for variants as well
        for variant in variants:
            v_url = variant.get('url')
            if v_url:
                reseller_id = re.findall(reseller_id_regex, v_url)
                reseller_id = reseller_id[0] if reseller_id else None
            else:
                reseller_id = None
            variant['reseller_id'] = reseller_id

        product['variants'] = variants

        response.meta['marks'] = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}
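        # Accumulator for per-star review counts; parse_buyer_reviews folds
        # the tallies from each paginated Bazaarvoice response into it.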
        real_count = is_empty(
            re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                       response.body_as_unicode()))
        if real_count:
            # Parse buyer reviews
            if int(real_count) > 8:
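                # The first .djs page covers the initial reviews; the rest are
                # fetched page by page (one request per further chunk of 30
                # reviews here, starting from page index 2).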
                for index, i in enumerate(xrange(9, int(real_count) + 1, 30)):
                    reqs.append(
                        Request(url=self.REVIEW_URL.format(
                            product_id=self.product_id, index=index + 2),
                                dont_filter=True,
                                callback=self.parse_buyer_reviews))

        reqs.append(
            Request(url=self.REVIEW_URL.format(product_id=self.product_id,
                                               index=0),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews))

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _parse_variants(self, response):
        """
        Parses product variants.
        """
        lv = LeviVariants()
        lv.setupSC(response)
        variants = lv._variants()

        return variants

    def parse_buyer_reviews(self, response):
        meta = response.meta.copy()
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)

        for k, v in buyer_reviews_per_page['rating_by_star'].iteritems():
            response.meta['marks'][k] += v

        product = response.meta['product']
        reqs = meta.get('reqs')

        product['buyer_reviews'] = BuyerReviews(
            num_of_reviews=buyer_reviews_per_page['num_of_reviews'],
            average_rating=buyer_reviews_per_page['average_rating'],
            rating_by_star=response.meta['marks'])
        if reqs:
            reqs.append(
                Request(url=self.RELATED_PRODUCT.format(
                    product_id=self.product_id, index=0),
                        dont_filter=True,
                        callback=self.parse_related_product))

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """

        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)

    def parse_brand(self, response):
        brand = is_empty(
            response.xpath('//meta[@itemprop="brand"]/@content').extract())

        return brand

    def parse_title(self, response):
        title = response.xpath(
            '//meta[contains(@property, "og:title")]/@content').extract()
        if title:
            title = [title[0].replace('&trade;', '').replace(u'\u2122', '')]
        else:
            title = response.xpath(
                '//h1[contains(@class, "title")]/text()').extract()
        return title

    def parse_data(self, response):
        data = re.findall(r'var buyStackJSON = \'(.+)\'; ',
                          response.body_as_unicode())
        if data:
            data = re.sub(r'\\(.)', r'\g<1>', data[0])
            try:
                js_data = json.loads(data)
            except ValueError:
                return None
            return js_data

    def parse_image(self, response):
        if self.js_data:
            try:
                image = self.js_data['colorid'][self.product_id]['gridUrl']
            except (KeyError, TypeError):
                return None

            return image

    def parse_related_product(self, response):
        related_prods = []
        product = response.meta['product']
        sample = response.body_as_unicode()
        try:
            sample = sample.replace(u'certonaResx.showResponse(', '')
            sample = sample[:-2]
            data = json.loads(sample)
            html = data['Resonance']['Response'][2]['output']
        except Exception as e:
            self.log(
                'Error during parsing related products page: {}'.format(e))
            return product
        else:
            s = Selector(text=html)
            titles = s.xpath('//h4/text()').extract()  # Title
            urls = s.xpath('//img/@src').extract()  # Img url
            for title, url in zip(titles, urls):
                if url and title:
                    related_prods.append(RelatedProduct(title=title, url=url))
            product['related_products'] = {}
            if related_prods:
                product['related_products'][
                    'buyers_also_bought'] = related_prods
            return product

    def parse_description(self, response):
        if self.js_data:
            try:
                description = self.js_data['colorid'][self.product_id]['name']
            except (KeyError, TypeError):
                return None
            return description

    def parse_upc(self, response):
        if self.js_data:
            upc = None
            # Note: this keeps the UPC of the last SKU iterated over.
            for v in self.js_data.get('sku', {}).values():
                upc = v['upc']
            if upc:
                return upc[-12:]

    def parse_sku(self, response):
        if self.js_data:
            skuid = None
            # As above, this keeps the id of the last SKU iterated over.
            for v in self.js_data.get('sku', {}).values():
                skuid = v['skuid']
            return skuid

    def parse_price(self, response):
        if self.js_data:
            price = None
            for price_data in self.js_data['colorid'][self.product_id]['price']:
                if price_data['il8n'] == 'now':
                    price = price_data['amount']
            currency = is_empty(
                re.findall(r'currency":"(\w+)"', response.body_as_unicode()))

            if price and currency:
                price = Price(price=price, priceCurrency=currency)
            else:
                price = Price(price=0.00, priceCurrency="USD")

            return price

    def _scrape_total_matches(self, response):
        totals = response.css('.productCount ::text').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        for link in response.xpath(
                '//li[contains(@class, "product-tile")]'
                '//a[contains(@rel, "product")]/@href').extract():
            yield link, SiteProductItem()

    def _get_nao(self, url):
        nao = re.search(r'nao=(\d+)', url)
        if not nao:
            return
        return int(nao.group(1))

    def _replace_nao(self, url, new_nao):
        current_nao = self._get_nao(url)
        # compare against None so an existing 'nao=0' is still replaced
        if current_nao is not None:
            return re.sub(r'nao=\d+', 'nao=' + str(new_nao), url)
        else:
            return url + '&nao=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            return
        if self.CURRENT_NAO > self.TOTAL_MATCHES + self.PAGINATE_BY:
            return  # it's over
        self.CURRENT_NAO += self.PAGINATE_BY
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
                       callback=self.parse,
                       meta=response.meta)
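
The send_next_request helper above threads a FIFO queue of follow-up requests through response.meta so a partially built item survives every hop. A minimal, self-contained sketch of the same pattern (spider name and URLs are illustrative only, not taken from the spiders in this listing):

from scrapy import Request, Spider

class ChainedSpider(Spider):
    name = 'chained_example'  # hypothetical spider, for illustration
    start_urls = ['http://example.com/product']

    def parse(self, response):
        item = {'url': response.url}
        reqs = [
            Request('http://example.com/reviews', callback=self.parse_extra),
            Request('http://example.com/variants', callback=self.parse_extra),
        ]
        return self.send_next_request(reqs, response, item)

    def parse_extra(self, response):
        item = response.meta['item']
        reqs = response.meta.get('reqs', [])
        if reqs:
            return self.send_next_request(reqs, response, item)
        return item  # queue drained: emit the finished item

    def send_next_request(self, reqs, response, item):
        # pop the next queued request and carry the shared state in meta
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        new_meta['item'] = item
        new_meta['reqs'] = reqs
        return req.replace(meta=new_meta)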
Beispiel #17
0
class CostcoProductsSpider(BaseProductsSpider):
    name = "costco_products"
    allowed_domains = ["costco.com"]
    start_urls = []

    SEARCH_URL = "http://www.costco.com/CatalogSearch?pageSize=96" \
        "&catalogId=10701&langId=-1&storeId=10301" \
        "&currentPage=1&keyword={search_term}"
    selenium_retries = 5
    DEFAULT_CURRENCY = u'USD'

    REVIEW_URL = 'http://api.bazaarvoice.com/data/products.json?passkey=bai25xto36hkl5erybga10t99&apiversion=5.5&filter=id:{product_id}&stats=reviews'

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(CostcoProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def parse_product(self, response):
        prod = response.meta['product']

        meta = response.meta.copy()
        reqs = []
        meta['reqs'] = reqs

        # TODO since response.body is already downloaded by scrapy
        # may try to run it in selenium instead of downloading the page again
        selenium_html = self._get_page_html_selenium(response.url)
        # TODO might as well use that html to extract other data

        for x in range(self.selenium_retries - 1):
            if not selenium_html:
                selenium_html = self._get_page_html_selenium(response.url)
            else:
                break

        if selenium_html:
            price = Selector(text=selenium_html).xpath(
                './/*[contains(@class, "your-price")]/span[@class="value"]/text()').extract()
            cond_set_value(prod, 'price', Price(priceCurrency=self.DEFAULT_CURRENCY,
                                                price=price))

        # no longer available
        no_longer_available = response.xpath(
            '//*[@class="server-error" and contains(text(),'
            '"out of stock and cannot be added to your cart at this time")]')
        cond_set_value(prod, 'no_longer_available', 1 if no_longer_available else 0)

        if not no_longer_available and response.xpath('//h1[text()="Product Not Found"]'):
            prod['not_found'] = True
            return prod

        model = response.xpath('//div[@id="product-tab1"]//text()').re(
            'Model[\W\w\s]*')
        if len(model) > 0:
            cond_set(prod, 'model', model)
            if 'model' in prod:
                prod['model'] = re.sub(r'Model\W*', '', prod['model'].strip())

        title = response.xpath('//h1[@itemprop="name"]/text()').extract()
        cond_set(prod, 'title', title)

        # Title key must be present even if it is blank
        cond_set_value(prod, 'title', "")

        tab2 = ''.join(
            response.xpath('//div[@id="product-tab2"]//text()').extract()
        ).strip()
        brand = ''
        for i in tab2.split('\n'):
            if 'Brand' in i.strip():
                brand = i.strip()
        brand = re.sub(r'Brand\W*', '', brand)
        if brand:
            prod['brand'] = brand
        if not prod.get("brand"):
            brand = response.xpath(
                    './/*[contains(text(), "Brand:")]/following-sibling::text()[1]').extract()
            brand = brand[0].strip() if brand else None
            cond_set_value(prod, 'brand', brand)

        des = response.xpath('//div[@id="product-tab1"]//text()').extract()
        des = ' '.join(i.strip() for i in des)
        if '[ProductDetailsESpot_Tab1]' in des.strip():
            des = response.xpath("//div[@id='product-tab1']/*[position()>1]//text()").extract()
            des = ' '.join(i.strip() for i in des)
            if des.strip():
                prod['description'] = des.strip()

        elif des:
            prod['description'] = des.strip()

        img_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        cond_set(prod, 'image_url', img_url)

        cond_set_value(prod, 'locale', 'en-US')
        prod['url'] = response.url

        # Categories
        category_filters = ['home']
        # Clean and filter category names taken from the breadcrumb
        categories = list(filter(
            (lambda x: x.lower() not in category_filters),
            map((lambda x: x.strip()),
                response.xpath('//*[@itemprop="breadcrumb"]//a/text()').extract())))

        category = categories[-1] if categories else None

        cond_set_value(prod, 'categories', categories)
        cond_set_value(prod, 'category', category)

        # Minimum Order Quantity
        try:
            minimum_order_quantity = re.search(
                r'Minimum Order Quantity: (\d+)',
                response.body_as_unicode()).group(1)
            cond_set_value(prod, 'minimum_order_quantity', minimum_order_quantity)
        except AttributeError:
            pass

        shipping = ''.join(response.xpath(
            '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
            ' "abcdefghijklmnopqrstuvwxyz"), "shipping & handling:")]'
        ).re('[\d\.\,]+')).strip().replace(',', '')
        if not shipping:
            shipping = ''.join(response.xpath(
                '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
                ' "abcdefghijklmnopqrstuvwxyz"), "shipping and handling:")]'
            ).re('[\d\.\,]+')).strip().replace(',', '')

        if shipping:
            cond_set_value(prod, 'shipping_cost', Price(priceCurrency=self.DEFAULT_CURRENCY,
                                                        price=shipping))

        shipping_included = ''.join(response.xpath(
            '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
            ' "abcdefghijklmnopqrstuvwxyz"),"shipping & handling included")]'
        ).extract()).strip().replace(',', '') or \
            response.xpath(
                '//*[@class="merchandisingText" and '
                'contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", '
                '"abcdefghijklmnopqrstuvwxyz"), "free shipping")]') or \
            ''.join(response.xpath(
                '//p[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
                ' "abcdefghijklmnopqrstuvwxyz"),"shipping and handling included")]'
            ).extract()).strip().replace(',', '')

        cond_set_value(prod, 'shipping_included', 1 if shipping_included or shipping == "0.00" else 0)

        available_store = re.search('Item may be available in your local warehouse', response.body_as_unicode())
        cond_set_value(prod, 'available_store', 1 if available_store else 0)

        not_available_store = re.search('Not available for purchase on Costco.com', response.body_as_unicode())
        cond_set_value(prod, 'available_online', 0 if not_available_store else 1)

        if str(prod.get('available_online', None)) == '0' and str(prod.get('available_store', None)) == '0':
            prod['is_out_of_stock'] = True

        count_review = response.xpath('//meta[contains(@itemprop, "reviewCount")]/@content').extract()
        product_id = re.findall(r'\.(\d+)\.', response.url)
        cond_set_value(prod, 'reseller_id', product_id[0] if product_id else None)

        if product_id and count_review:
            reqs.append(
                Request(
                    url=self.REVIEW_URL.format(product_id=product_id[0], index=0),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta
                ))

        if reqs:
            return self.send_next_request(reqs, response)

        return prod

    def parse_buyer_reviews(self, response):
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)

    def _search_page_error(self, response):
        if not self._scrape_total_matches(response):
            self.log("Costco: unable to find a match", ERROR)
            return True
        return False

    def _scrape_total_matches(self, response):
        count = response.xpath(
            '//*[@id="secondary_content_wrapper"]/div/p/span/text()'
        ).re('(\d+)')
        count = int(count[-1]) if count else None
        if not count:
            count = response.xpath(
                '//*[@id="secondary_content_wrapper"]'
                '//span[contains(text(), "Showing results")]/text()'
            ).extract()
            count = int(count[0].split(' of ')[1].replace('.', '').strip()) if count else None
        if not count:
            count = response.css(".table-cell.results.hidden-xs.hidden-sm.hidden-md>span").re(
                r"Showing\s\d+-\d+\s?of\s?([\d.,]+)")
            count = int(count[0].replace('.', '').replace(',', '')) if count else None
        return count

    def _scrape_product_links(self, response):
        links = response.xpath(
            '//div[contains(@class,"product-list grid")]//a[contains(@class,"thumbnail")]/@href'
        ).extract()
        for link in links:
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        links = response.xpath(
            "//*[@class='pagination']"
            "/ul[2]"  # [1] is for the Items Per Page section which has .active.
            "/li[@class='active']"
            "/following-sibling::li[1]"  # [1] is to get just the next sibling.
            "/a/@href"
        ).extract()
        if links:
            link = links[0]
        else:
            link = None

        return link

    def _get_page_html_selenium(self, url):
        driver = None
        display = Display(visible=False)
        display.start()
        try:
            driver = self._init_chromium()
            driver.set_page_load_timeout(120)
            driver.set_script_timeout(120)
            socket.setdefaulttimeout(120)
            driver.set_window_size(1280, 768)
            driver.get(url)
            time.sleep(5)  # crude wait for JS-rendered content; no explicit wait condition
            page_html = driver.page_source
        except Exception as e:
            self.log('Exception while getting page html with selenium: {}'.format(e), WARNING)
            self.log('### Traceback: {}'.format(traceback.format_exc()), WARNING)
        else:
            return page_html
        finally:
            # always release the browser and the virtual display
            if driver is not None:
                driver.quit()
            display.stop()

    def _init_chromium(self, proxy=None, proxy_type=None):
        # TODO use random useragent script here?
        # UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0"
        chrome_flags = webdriver.DesiredCapabilities.CHROME  # this is for Chrome?
        chrome_options = webdriver.ChromeOptions()  # this is for Chromium
        if proxy:
            chrome_options.add_argument(
                '--proxy-server=%s' % proxy_type+'://'+proxy)
        # chrome_flags["chrome.switches"] = ['--user-agent=%s' % UA]
        # chrome_options.add_argument('--user-agent=%s' % UA)
        executable_path = '/usr/sbin/chromedriver'
        if not os.path.exists(executable_path):
            executable_path = '/usr/local/bin/chromedriver'
        # initialize webdriver
        driver = webdriver.Chrome(desired_capabilities=chrome_flags,
                                  chrome_options=chrome_options,
                                  executable_path=executable_path)
        return driver
Beispiel #18
0
class OfficedepotProductsSpider(BaseProductsSpider):
    name = 'officedepot_products'
    allowed_domains = [
        "officedepot.com", "www.officedepot.com", 'bazaarvoice.com'
    ]
    start_urls = []
    _extra_requests = False
    # settings = DockersValidatorSettings

    SEARCH_URL = "http://www.officedepot.com/catalog/search.do?Ntt={search_term}&searchSuggestion=true&akamai-feo=off"

    PAGINATE_URL = (
        'http://www.officedepot.com/catalog/search.do?Ntx=mode+matchpartialmax&Nty=1&Ntk=all'
        '&Ntt={search_term}&N=5&recordsPerPageNumber=24&No={nao}')

    CURRENT_NAO = 0
    PAGINATE_BY = 24  # 24 products
    TOTAL_MATCHES = None  # for pagination

    REVIEW_URL = "http://officedepot.ugc.bazaarvoice.com/2563" \
                 "/{product_id}/reviews.djs?format=embeddedhtml"

    VARIANTS_URL = 'http://www.officedepot.com/mobile/getSkuAvailable' \
            'Options.do?familyDescription={name}&sku={sku}&noLogin=true'
    QA_URL = "http://officedepot.ugc.bazaarvoice.com/answers/2563/product/{product_id}/questions.djs?format=embeddedhtml"

    #
    # RELATED_PRODUCT = "http://www.res-x.com/ws/r2/Resonance.aspx?" \
    #                   "appid=dockers01&tk=187015646137297" \
    #                   "&ss=182724939426407" \
    #                   "&sg=1&" \
    #                   "&vr=5.3x&bx=true" \
    #                   "&sc=product4_rr" \
    #                   "&sc=product3_rr" \
    #                   "&sc=product1_r" \
    #                   "r&sc=product2_rr" \
    #                   "&ev=product&ei={product_id}" \
    #                   "&no=20" \
    #                   "&language=en_US" \
    #                   "&cb=certonaResx.showResponse" \
    #                   "&ur=http%3A%2F%2Fwww.levi.com%2FUS%2Fen_US%" \
    #                   "2Fwomens-jeans%2Fp%2F095450043&plk=&"

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        # officedepot seems to have a bot protection, so we first get the cookies
        # and parse the site with them after that
        self.proxy = None
        self.timeout = 60
        self.width = 1024
        self.height = 768
        self.selenium_cookies = {}
        self.user_agent = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
            ' (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')
        socket.setdefaulttimeout(60)
        self._get_selenium_cookies_for_main_page()
        if kwargs.get('scrape_variants_with_extra_requests'):
            self._extra_requests = True
        super(OfficedepotProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def _prepare_driver(self, driver):
        driver.set_page_load_timeout(int(self.timeout))
        driver.set_script_timeout(int(self.timeout))
        driver.set_window_size(int(self.width), int(self.height))

    def _get_selenium_cookies_for_main_page(self):
        from pyvirtualdisplay import Display
        display = Display(visible=False)
        display.start()
        driver = self._init_chromium()
        self._prepare_driver(driver)
        try:
            driver.get('http://' + self.allowed_domains[0])
            time.sleep(10)
            for cookie in driver.get_cookies():
                self.selenium_cookies[cookie['name']] = cookie['value']
            driver.quit()
        except Exception as e:
            driver.quit()
            time.sleep(10)
            self.log(
                'Error getting cookies from homepage, trying one more time: %s'
                % str(e))
            driver.get('http://' + self.allowed_domains[0])
            time.sleep(10)
            for cookie in driver.get_cookies():
                self.selenium_cookies[cookie['name']] = cookie['value']
        try:
            driver.quit()
            display.stop()
        except Exception as e:
            self.log('Error on driver & display destruction: %s' % str(e))

    def _init_chromium(self):
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        chrome_flags = webdriver.DesiredCapabilities.CHROME  # this is for Chrome?
        chrome_options = webdriver.ChromeOptions()  # this is for Chromium
        if self.proxy:
            chrome_options.add_argument('--proxy-server=%s' % self.proxy_type +
                                        '://' + self.proxy)
        chrome_flags["chrome.switches"] = ['--user-agent=%s' % self.user_agent]
        chrome_options.add_argument('--user-agent=%s' % self.user_agent)
        executable_path = '/usr/sbin/chromedriver'
        if not os.path.exists(executable_path):
            executable_path = '/usr/local/bin/chromedriver'
        # initialize webdriver, open the page and make a screenshot
        driver = webdriver.Chrome(desired_capabilities=chrome_flags,
                                  chrome_options=chrome_options,
                                  executable_path=executable_path)
        return driver

    def _init_firefox(self):
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        profile.set_preference("network.proxy.type",
                               1)  # manual proxy configuration
        if self.proxy:
            if 'socks' in self.proxy_type:
                profile.set_preference("network.proxy.socks",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.socks_port",
                                       int(self.proxy.split(':')[1]))
            else:
                profile.set_preference("network.proxy.http",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.http_port",
                                       int(self.proxy.split(':')[1]))
        profile.update_preferences()
        driver = webdriver.Firefox(profile)
        return driver

    def _parse_single_product(self, response):
        return self.parse_product(response)

    @staticmethod
    def _get_product_id(url):
        match = re.search(r'/products/(\d{2,20})/', url)
        if match:
            return match.group(1)

    def parse_product(self, response):
        meta = response.meta
        product = meta.get('product', SiteProductItem())
        reqs = []
        meta['reqs'] = reqs

        product['_subitem'] = True

        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title, conv=string.strip)

        # Parse image
        image = self.parse_image(response)
        cond_set(product, 'image_url', image)

        # Parse brand
        brand = self.parse_brand(response)
        cond_set_value(product, 'brand', brand)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse description
        description = self.parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse model
        model = self._parse_model(response)
        cond_set_value(product, 'model', model)

        # Parse reseller_id
        reseller_id = self.parse_reseller_id(response)
        cond_set_value(product, "reseller_id", reseller_id)

        # Parse is out of stock
        oos = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', oos)

        # Parse categories and category
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)
        if categories:
            cond_set_value(product, 'category', categories[-1])

        # Parse related products
        related_product = self._parse_related_product(response)
        cond_set_value(product, 'related_products', related_product)

        br_count = is_empty(
            re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                       response.body_as_unicode()))
        meta['_br_count'] = br_count
        meta['product'] = product

        reqs.append(
            Request(url=self.REVIEW_URL.format(
                product_id=self._get_product_id(response.url)),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta))

        sku = is_empty(response.xpath('//input[@name="id"]/@value').extract())
        name = is_empty(
            response.xpath('//h1[@itemprop="name"]/text()').re('(.*?),'))

        if sku and name and self.scrape_variants_with_extra_requests:
            name = urllib.quote_plus(name.strip().encode('utf-8'))
            reqs.append(
                Request(url=self.VARIANTS_URL.format(name=name, sku=sku),
                        callback=self._parse_variants,
                        meta=meta))
        # parse questions & answers
        reqs.append(
            Request(url=self.QA_URL.format(
                product_id=self._get_product_id(response.url)),
                    callback=self._parse_questions,
                    meta=meta,
                    dont_filter=True))

        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def parse_reseller_id(self, response):
        regex = r"\/(\d+)"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        return reseller_id

    def _parse_questions(self, response):
        meta = response.meta
        reqs = response.meta['reqs']
        product = response.meta['product']
        qa = []
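        # The regex below pulls (summary, question id, optional details,
        # elapsed-time string) tuples out of the Bazaarvoice-rendered Q&A markup.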
        questions_ids_regex = """BVQAQuestionSummary.+?javascript:void.+?>([^<]+)[^"']+["']BVQAQuestionMain(\d+)(?:.+?BVQAQuestionDetails.+?div>([^<]+)?).+?BVQAElapsedTime.+?>([^<]+)"""
        questions_ids = re.findall(questions_ids_regex,
                                   response.body_as_unicode())
        for (question_summary, question_id, question_details,
             question_date) in questions_ids:
            # Convert date format
            if question_date:
                try:
                    from dateutil.relativedelta import relativedelta
                    years = re.findall(r"(\d+?)\s+?years", question_date)
                    years = int(years[0]) if years and years[0].isdigit() else 0
                    months = re.findall(r"(\d+?)\s+?months", question_date)
                    months = int(months[0]) if months and months[0].isdigit() else 0
                    if not months and not years:
                        converted_date = None
                    else:
                        converted_date = datetime.now() - relativedelta(
                            years=years, months=months)
                        converted_date = converted_date.strftime("%Y-%m-%d")
                except Exception as e:
                    converted_date = None
                    self.log(
                        'Failed to parse date, setting date to None {}'.format(
                            e))
            else:
                converted_date = None
            # regex to get part of response that contain all answers to question with given id
            text_r = "BVQAQuestion{}Answers(.+?)BVQAQuestionDivider".format(
                question_id)
            all_a_text = re.findall(text_r, response.body_as_unicode())
            all_a_text = ''.join(all_a_text[0]) if all_a_text else ''
            answers_regex = r"Answer:.+?>([^<]+)"
            answers = re.findall(answers_regex, all_a_text)
            answers = [{'answerText': a} for a in answers]
            question = {
                'questionDate': converted_date,
                'questionId': question_id,
                'questionDetail':
                    question_details.strip() if question_details else '',
                'questionSummary':
                    question_summary.strip() if question_summary else '',
                'answers': answers,
                'totalAnswersCount': len(answers)
            }
            qa.append(question)
        product['all_questions'] = qa
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def clear_text(self, str_result):
        return str_result.replace("\t", "").replace("\n", "").replace(
            "\r", "").replace(u'\xa0', ' ').strip()

    def _parse_is_out_of_stock(self, response):
        oos = response.xpath('//*[@itemprop="availability"'
                             ' and @content="http://schema.org/OutOfStock"]')
        return bool(oos)

    def _parse_model(self, response):
        model = response.xpath(
            '//*[@id="attributemodel_namekey"]/text()').extract()
        if model:
            return model[0].strip()

    def _parse_categories(self, response):
        categories = response.xpath('//*[@id="siteBreadcrumb"]//'
                                    'span[@itemprop="name"]/text()').extract()
        return categories

    def _parse_related_product(self, response):
        results = []
        base_url = response.url
        for related_product in response.xpath(
                '//*[@id="relatedItems"]'
                '//tr[contains(@class,"hproduct")]'
                '/td[@class="description"]/a'):
            name = is_empty(related_product.xpath('text()').extract())
            url = is_empty(related_product.xpath('@href').extract())
            if name and url:
                results.append(
                    RelatedProduct(title=name,
                                   url=urlparse.urljoin(base_url, url)))
        return results

    def _parse_variants(self, response):
        """
        Parses product variants.
        """
        reqs = response.meta['reqs']
        product = response.meta['product']
        data = json.loads(response.body)
        variants = []

        if data.get('success'):
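            # Assumed payload shape: {'success': True, 'skus': [{'sku': ...,
            # 'url': ..., 'description': ..., 'attributesDescription': ...,
            # 'thumbnailImageUrl': ...}, ...]}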
            for sku in data.get('skus', []):
                vr = {}
                vr['url'] = urlparse.urljoin(response.url, sku.get('url'))
                vr['skuId'] = sku.get('sku')
                price = is_empty(
                    re.findall('\$([\d\.]+)',
                               sku.get('attributesDescription', '')))
                if price:
                    vr['price'] = price

                name = sku.get('description', '')
                if name:
                    vr['properties'] = {'title': name}

                vr['image_url'] = (sku.get('thumbnailImageUrl') or '').split('?')[0]
                variants.append(vr)

            product['variants'] = variants
        if product.get('variants') and self._extra_requests:
            variants_urls = [p.get('url') for p in product['variants']]
            for var_url in variants_urls:
                req = Request(url=var_url,
                              callback=self._parse_in_stock_for_variants)
                req.meta['product'] = product
                reqs.append(req)
        if reqs:
            return self.send_next_request(reqs, response)

        return product

    # parse variants one by one and set out of stock status for each variant
    def _parse_in_stock_for_variants(self, response):
        reqs = response.meta['reqs']
        product = response.meta['product']
        oos = self._parse_is_out_of_stock(response)
        for variant in product['variants']:
            if variant['url'] == response.url:
                variant['in_stock'] = not oos
                break
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def parse_buyer_reviews(self, response):
        meta = response.meta.copy()
        reqs = meta['reqs']

        self.br.br_count = meta['_br_count']
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)

        product = response.meta['product']
        product['buyer_reviews'] = BuyerReviews(**buyer_reviews_per_page)

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)

    def parse_brand(self, response):
        brand = is_empty(
            response.xpath('//td[@itemprop="brand"]/@content').extract())
        if not brand:
            brand = is_empty(
                response.xpath('//td[@itemprop="brand"]/text()').extract())
        if brand:
            brand = brand.strip()
        return brand

    def parse_title(self, response):
        title = response.xpath(
            '//h1[contains(@itemprop, "name")]/text()').extract()
        return title

    def parse_data(self, response):
        data = re.findall(r'var MasterTmsUdo \'(.+)\'; ',
                          response.body_as_unicode())
        if data:
            data = re.sub(r'\\(.)', r'\g<1>', data[0])
            try:
                js_data = json.loads(data)
            except ValueError:
                return None
            return js_data

    def parse_image(self, response):
        img = response.xpath(
            '//img[contains(@id, "mainSkuProductImage")]/@src').extract()
        return img

    def parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class, "sku_desc")]').extract()
        if description:
            return self.clear_text(description[0])
        else:
            return ''

    def parse_sku(self, response):
        sku = response.xpath(
            '//td[contains(@id, "basicInfoManufacturerSku")]/text()').extract()
        # sku = response.xpath('//div[contains(@id, "skuValue")]/text()').extract()
        if sku:
            return self.clear_text(sku[0])

    def parse_price(self, response):
        price = response.xpath(
            '//meta[contains(@itemprop, "price")]/@content').extract()
        currency = response.xpath(
            '//meta[contains(@itemprop, "priceCurrency")]/@content').extract()

        if price and currency:
            price = Price(price=price[0], priceCurrency=currency[0])
        else:
            price = Price(price=0.00, priceCurrency="USD")

        return price

    def parse_paginate_link(self, response, nao):
        check_page = '&No=%s' % nao
        for link in response.xpath(
                '//a[contains(@class, "paging")]/@href').extract():
            if check_page in link:
                u = urlparse.urlparse(link)
                return urlparse.urljoin('http://www.officedepot.com', u.path)

    def parse_category_link(self, response):
        categories_links = []
        for link in response.xpath(
                '//div[contains(@class, "category_wrapper")]/a[contains(@class, "link")]/@href'
        ).extract():
            categories_links.append(link)

    def _scrape_total_matches(self, response):
        totals = response.xpath(
            '//div[contains(@id, "resultCnt")]/text()').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        items = response.xpath(
            '//div[contains(@class, "descriptionFull")]/'
            'a[contains(@class, "med_txt")]/@href').extract() or response.css(
                '.desc_text a::attr("href")').extract()
        # Scraper was redirected to product page instead of search results page
        if not items and "officedepot.com/a/products" in response.url:
            prod = SiteProductItem(search_redirected_to_product=True)
            # TODO we may not need any data for product aside from "search_redirected_to_product" flag.
            # Rework if that's the case - CON-28287
            req = Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True)
            req.meta["remaining"] = 0
            req.meta['product'] = prod
            yield req, prod
        else:
            for link in items:
                yield link, SiteProductItem()

    def _get_nao(self, url):
        nao = re.search(r'nao=(\d+)', url)
        if not nao:
            return
        return int(nao.group(1))

    def _replace_nao(self, url, new_nao):
        current_nao = self._get_nao(url)
        # compare against None so an existing 'nao=0' is still replaced
        if current_nao is not None:
            return re.sub(r'nao=\d+', 'nao=' + str(new_nao), url)
        else:
            return url + '&nao=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            # # TODO: check result by categories
            # return self.parse_category_link(response)
            return
        #if self.CURRENT_NAO > self.TOTAL_MATCHES+self.PAGINATE_BY:
        #    return  # all the products have been collected
        if self.CURRENT_NAO > self.quantity + self.PAGINATE_BY:
            return  # num_products > quantity
        self.CURRENT_NAO += self.PAGINATE_BY
        if '/a/browse/' in response.url:  # paginate in category or subcategory
            new_paginate_url = self.parse_paginate_link(
                response, self.CURRENT_NAO)
            if new_paginate_url:
                return Request(new_paginate_url,
                               callback=self.parse,
                               meta=response.meta,
                               cookies=self.selenium_cookies)
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
                       callback=self.parse,
                       meta=response.meta,
                       cookies=self.selenium_cookies)
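
The Levi and Office Depot spiders above both paginate by advancing a numeric offset ('nao'/'No') in the URL by PAGINATE_BY per page. A minimal sketch of that offset arithmetic, with assumed example values:

import re

def replace_nao(url, new_nao):
    # swap the offset if one is present, otherwise append it
    if re.search(r'nao=\d+', url):
        return re.sub(r'nao=\d+', 'nao=%d' % new_nao, url)
    return url + '&nao=%d' % new_nao

total_matches = 100  # assumed total for the example
paginate_by = 24     # matches the Office Depot PAGINATE_BY above
urls = [replace_nao('http://example.com/search?q=pens', nao)
        for nao in range(0, total_matches, paginate_by)]
# offsets generated: 0, 24, 48, 72, 96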
Beispiel #19
0
class HomedepotProductsSpider(BaseValidator, BaseProductsSpider):
    name = 'homedepot_products'
    allowed_domains = ["homedepot.com", "origin.api-beta.homedepot.com"]
    start_urls = []

    settings = HomedepotValidatorSettings

    SEARCH_URL = "http://www.homedepot.com/s/{search_term}?NCNI-5"
    DETAILS_URL = "http://www.homedepot.com/p/getSkuDetails?itemId=%s"
    REVIEWS_URL = "http://homedepot.ugc.bazaarvoice.com/1999m/%s/" \
        "reviews.djs?format=embeddedhtml"
    RECOMMENDED_URL = "http://origin.api-beta.homedepot.com/ProductServices/v2/products/" \
        "recommendation?type=json&key=tRXWvUBGuAwEzFHScjLw9ktZ0Bw7a335"

    product_filter = []

    def __init__(self, *args, **kwargs):
        # All this is to set the site_name since we have several
        # allowed_domains.
        self.br = BuyerReviewsBazaarApi()
        super(HomedepotProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    @staticmethod
    def _parse_no_longer_available(response):
        message = response.xpath(
            '//div[@class="error" and '
            'contains(., "The product you are trying to view is not currently available.")]'
        )
        return bool(message)

    def parse_product(self, response):
        product = response.meta['product']
        product['_subitem'] = True

        if self._parse_no_longer_available(response):
            product['no_longer_available'] = True
            return product
        else:
            product['no_longer_available'] = False

        cond_set(
            product, 'title',
            response.xpath(
                "//h1[contains(@class, 'product-title')]/text()").extract())
        brand = response.xpath("//h2[@itemprop='brand']/text()").extract()
        brand = ["".join(brand).strip()]
        cond_set(product, 'brand', brand)

        cond_set(
            product, 'image_url',
            response.xpath("//div[@class='product_mainimg']/img/@src |"
                           "//img[@id='mainImage']/@src").extract())

        cond_set(
            product, 'price',
            response.xpath("//div[@class='pricingReg']"
                           "/span[@itemprop='price']/text()").extract())

        reseller_id_regex = r"\/(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        if product.get('price', None):
            if '$' not in product['price']:
                self.log('Unknown currency at %s' % response.url)
            else:
                product['price'] = Price(
                    price=product['price'].replace(',', '').replace('$', '').strip(),
                    priceCurrency='USD')

        if not product.get('price'):
            price = response.xpath(
                "//div[@class='pricingReg']"
                "/span[@itemprop='price']/text() |"
                "//div[contains(@class, 'pricingReg')]/span[@itemprop='price']"
            ).re(FLOATING_POINT_RGEX)
            if price:
                product["price"] = Price(priceCurrency="USD", price=price[0])

        try:
            product['model'] = response.css(
                '.product_details.modelNo ::text').extract()[0].replace(
                    'Model', '').replace('#', '').strip()
        except IndexError:
            pass

        internet_no = response.css('#product_internet_number ::text').extract()
        if internet_no:
            internet_no = internet_no[0]

        upc = is_empty(re.findall("ItemUPC=\'(\d+)\'", response.body))
        if upc:
            product["upc"] = upc

        upc = response.xpath("//upc/text()").re('\d+')
        if upc:
            product["upc"] = upc[0]

        desc = response.xpath("//div[@id='product_description']"
                              "/div[contains(@class,'main_description')]"
                              "/descendant::*[text()]/text() |"
                              "//div[contains(@class, 'main_description')] |"
                              "//div[@id='product_description']").extract()
        desc = " ".join(l.strip() for l in desc if len(l.strip()) > 0)
        product['description'] = desc

        product['locale'] = "en-US"

        metadata = response.xpath(
            "//script[contains(text(),'PRODUCT_METADATA_JSON')]"
            "/text()").re('var PRODUCT_METADATA_JSON = (.*);')
        skus = []
        if metadata:
            metadata = metadata[0]
            jsmeta = hjson.loads(metadata)
            try:
                skus = [jsmeta["attributeDefinition"]["defaultSku"]]
                response.meta['skus'] = skus
                metaname = jsmeta['attributeDefinition']['attributeListing'][
                    0]['label']
                response.meta['attributelabel'] = metaname
            except (KeyError, IndexError):
                self.log("Incomplete data from Javascript.", DEBUG)

        certona_payload = self._gen_payload(response)

        if certona_payload:
            new_meta = response.meta.copy()
            new_meta['product'] = product
            new_meta['handle_httpstatus_list'] = [404, 415]
            new_meta['internet_no'] = internet_no
            headers = {
                'Proxy-Connection': 'keep-alive',
                'Content-Type': 'application/json'
            }
            return Request(
                self.RECOMMENDED_URL,
                callback=self._parse_related_products,
                headers=headers,
                body=json.dumps(certona_payload),
                method="POST",
                meta=new_meta,
                priority=1000,
            )

        if internet_no:
            return Request(
                url=self.REVIEWS_URL % internet_no,
                callback=self.parse_buyer_reviews,
                meta={"product": product},
                dont_filter=True,
            )

        return self._gen_variants_requests(response, product, skus,
                                           internet_no)

    def _gen_variants_requests(self, response, product, skus, internet_no):
        reqs = []

        for sku in skus:
            # if sku:
            #     sku = sku[len(sku)-1]
            new_product = product.copy()

            new_meta = response.meta.copy()
            new_meta['product'] = new_product
            new_meta['handle_httpstatus_list'] = [404]
            new_meta['internet_no'] = internet_no
            url = self.DETAILS_URL % sku
            reqs.append(
                Request(url,
                        self._parse_skudetails,
                        meta=new_meta,
                        priority=1000))
        if not reqs:
            return product
        return reqs

    def _gen_payload(self, response):
        """Generates request body. Also maxProducts value can be changed for +\- number of values"""

        # changed version 4.2x -> 5.3x
        # appid = response.xpath("//input[@id='certona_appId']/@value").extract()
        # if not appid:
        #     print "no appid"
        #     return
        appid = 'homedepot01'
        critemid = response.xpath(
            "//input[@id='certona_critemId']/@value").extract()
        if not critemid:
            critemid = is_empty(
                re.findall("\"itemId\"\:\"(\d+)\"", response.body))
        if not critemid:
            return

        payload = {
            "appId": appid,
            "products": critemid,
            "maxProducts": "16",
            "certonaSchema": "PIPHorizontal1_rr",
            "sessionId": "41020192309266",
            "trackingId": "252187705102752",
            "storeId": "123",
        }
        return payload

    def _parse_related_products(self, response):
        product = response.meta['product']
        internet_no = response.meta.get('internet_no', None)

        if response.status in response.meta['handle_httpstatus_list']:
            # No further pages were found. Check the request payload.
            return product

        data = json.loads(response.body_as_unicode())
        related_prods = []
        for prod in data['schemas'][0]['products']:
            name = prod['productName']
            href = prod['canonicalURL']
            related_prods.append(
                RelatedProduct(name, urlparse.urljoin(product['url'], href)))
        if related_prods:
            if 'THE HOME DEPOT RECOMMENDS' in data['schemas'][0]['title']:
                product['related_products'] = {'recommended': related_prods}
            else:
                product['related_products'] = {
                    'buyers_also_bought': related_prods
                }

        skus = response.meta.get('skus', None)

        if not skus:
            if internet_no:
                return Request(
                    url=self.REVIEWS_URL % internet_no,
                    callback=self.parse_buyer_reviews,
                    meta={"product": product},
                    dont_filter=True,
                )
            return product
        return self._gen_variants_requests(response, product, skus,
                                           internet_no)
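
    # Shape sketch of the Certona JSON consumed above (keys taken from
    # the code, values hypothetical):
    #
    #   {"schemas": [{"title": "THE HOME DEPOT RECOMMENDS",
    #                 "products": [{"productName": "Cordless Drill",
    #                               "canonicalURL": "/p/123456"}]}]}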

    def _parse_skudetails(self, response):
        product = response.meta['product']

        try:
            jsdata = json.loads(response.body_as_unicode())
            storeskus = jsdata['storeSkus']
            price = storeskus['storeSku']['pricing']['originalPrice']
            product['price'] = price

            if product.get('price', None):
                if '$' not in product['price']:
                    self.log('Unknown currency at %s' % response.url)
                else:
                    product['price'] = Price(price=product['price'].replace(
                        ',', '').replace('$', '').strip(),
                                             priceCurrency='USD')

            desc = jsdata['info']['description']
            product['description'] = desc

            url = jsdata['canonicalURL']
            url = urlparse.urljoin(product['url'], url)
            product['url'] = url

            image = jsdata['inlinePlayerJSON']['IMAGE'][1]['mediaUrl']
            product['image_url'] = image

            attrname = response.meta.get('attributelabel', 'Color/Finish')
            colornames = jsdata['attributeGroups']['group'][0]['entries'][
                'attribute']
            colornames = [
                el['value'] for el in colornames if el['name'] == attrname
            ]
            if colornames:
                product['model'] = str(colornames[0])
        except (ValueError, KeyError, IndexError):
            self.log("Failed to parse SKU details.", DEBUG)

        internet_no = response.meta.get('internet_no', None)
        if internet_no:
            return Request(
                url=self.REVIEWS_URL % internet_no,
                callback=self.parse_buyer_reviews,
                meta={"product": product},
                dont_filter=True,
            )

        return product

    def parse_buyer_reviews(self, response):
        product = response.meta.get("product")
        brs = self.br.parse_buyer_reviews_per_page(response)
        self.br.br_count = brs.get('num_of_reviews', None)
        brs['rating_by_star'] = self.br.get_rating_by_star(response)
        product['buyer_reviews'] = brs
        return product
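
    # Shape sketch of the dict the Bazaarvoice helper returns above
    # (field names from the code, values hypothetical):
    #
    #   {'num_of_reviews': 12, 'average_rating': 4.5,
    #    'rating_by_star': {1: 0, 2: 1, 3: 1, 4: 3, 5: 7}}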

    def _scrape_total_matches(self, response):
        totals = response.xpath("//a[@id='all_products']/label"
                                "/text()").re(r'All Products \(([\d,]+)\)')
        if totals:
            totals = totals[0]
            totals = totals.replace(",", "")
            if is_num(totals):
                return int(totals)
        no_matches = response.xpath(
            "//h1[@class='page-title']/text()").extract()
        if no_matches:
            if 'we could not find any' in no_matches[0] or \
               'we found 0 matches for' in no_matches[0]:
                return 0
        total_matches = response.xpath(
            '//*[contains(@id, "all_products")]//text()').extract()
        if total_matches:
            total_matches = ''.join(total_matches)
            total_matches = ''.join(c for c in total_matches if c.isdigit())
            if total_matches and total_matches.isdigit():
                return int(total_matches)
        total_matches = response.xpath('//div[@id="allProdCount"]/text()').re(
            FLOATING_POINT_RGEX)
        if total_matches:
            total_matches = total_matches[0]
            total_matches = total_matches.replace(',', '')
            if total_matches.isdigit():
                return int(total_matches)
        return

    def _scrape_product_links(self, response):
        links = response.xpath(
            "//div[contains(@class,'product') "
            "and contains(@class,'plp-grid')]"
            "//descendant::a[contains(@class, 'item_description')]/@href | "
            "//div[contains(@class, 'description')]/a[@data-pod-type='pr']/@href"
        ).extract()

        if not links:
            self.log("Found no product links.", DEBUG)

        for link in links:
            if link in self.product_filter:
                continue
            self.product_filter.append(link)
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        next_page = response.xpath(
            "//div[@class='pagination-wrapper']/ul/li/span"
            "/a[@title='Next']/@href |"
            "//div[contains(@class, 'pagination')]/ul/li/span"
            "/a[@class='icon-next']/@href |"
            "//li[contains(@class, 'hd-pagination__item')]"
            "/a[contains(@class, 'pagination__link') and @title='Next']/@href"
        ).extract()
        if next_page:
            return urlparse.urljoin(response.url, next_page[0])
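
The '$'-price normalization above appears twice in this spider; a minimal
standalone sketch of it (not part of the original class, assuming the same
Price type these spiders use):

def to_usd_price(raw):
    # e.g. '$1,299.00' -> Price(price='1299.00', priceCurrency='USD')
    return Price(price=raw.replace(',', '').replace('$', '').strip(),
                 priceCurrency='USD')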
Beispiel #20
0
class PepboysProductsSpider(ProductsSpider):
    name = 'pepboys_products'
    allowed_domains = ['pepboys.com']

    SEARCH_URL = "http://www.pepboys.com/s?query={search_term}"

    BUYER_REVIEWS_URL = ("https://pepboys.ugc.bazaarvoice.com/8514-en_us"
                         "/{product_id}/reviews.djs?format=embeddedhtml")

    def __init__(self, *args, **kwargs):
        super(PepboysProductsSpider, self).__init__(*args, **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)

    def _total_matches_from_html(self, response):
        total = response.css('.resultCount::text').re('of (\d+) Result')
        return int(total[0].replace(',', '')) if total else 0

    def _scrape_results_per_page(self, response):
        return 39

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@class="next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product"]/a[1]/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath(
            '//h4[contains(@class,"margin-top-none")]//text()').extract()
        title = [r.strip() for r in title if len(r.strip()) > 0]
        title = "".join(title)
        return title.strip() if title else None

    def _parse_categories(self, response):
        categories = response.xpath(
            '//*[@class="breadcrumb"]//li/a/text()').extract()
        return categories if categories else None

    def _parse_category(self, response):
        categories = self._parse_categories(response)
        return categories[-1] if categories else None

    def _parse_price(self, response):
        try:
            price = response.xpath(
                '//div[contains(@class,"subtotal")]//span[@class="price"]//text()'
            ).extract()[0].strip()
            price = re.findall(r'[\d\.]+', price)
        except IndexError:
            return None
        if not price:
            return None
        return Price(price=price[0], priceCurrency='USD')

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//img[contains(@class,"tdTireDetailImg")]/@src').extract()
        return image_url[0] if image_url else None

    def _parse_brand(self, response):
        brand = 'Pepboys'
        return brand.strip() if brand else None

    def _parse_sku(self, response):
        sku = response.xpath(
            '//div[contains(@class,"j-results-item-container")]/@data-sku'
        ).extract()
        return sku[0] if sku else None

    def _parse_variants(self, response):
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath(
            '//*[@id="availability"]/span[text()="In Stock"]')

        return not bool(status)

    def _parse_shipping_included(self, response):
        shipping_text = ''.join(
            response.xpath('//span[@class="free-shipping"]//text()').extract())

        return shipping_text == ' & FREE Shipping'

    def _parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class,"tdContentDesc")]').extract()

        return ''.join(description).strip() if description else None

    def _parse_buyer_reviews(self, response):
        str_review = ''.join(response.xpath(
            '//div[@class="tsrSeeReviews"]//text()').extract())
        num_of_reviews = re.findall(r'\d+', str_review)
        num_of_reviews = int(num_of_reviews[0]) if num_of_reviews else 0

        product = response.meta['product']
        buyer_reviews = self.br.parse_buyer_reviews_per_page(response)
        # Prefer the Bazaarvoice data; fall back to the on-page count
        if not buyer_reviews.get('num_of_reviews'):
            buyer_reviews['num_of_reviews'] = num_of_reviews
        product['buyer_reviews'] = buyer_reviews
        yield product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """

        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)
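
    # Usage sketch for the chaining helper above (URLs and callbacks
    # hypothetical):
    #
    #   reqs = [Request('http://example.com/reviews', callback=self.parse_a),
    #           Request('http://example.com/price', callback=self.parse_b)]
    #   return self.send_next_request(reqs, response)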

    def parse_product(self, response):
        reqs = []
        product = response.meta['product']
        response.meta['product_response'] = response
        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse categories
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Brand
        brand = self._parse_brand(response)
        cond_set_value(product, 'brand', brand)

        # Shipping included
        shipping_included = self._parse_shipping_included(response)
        cond_set_value(product, 'shipping_included', shipping_included)

        # Custom reviews
        if sku:
            # Parse buyer reviews
            reqs.append(
                Request(url=self.BUYER_REVIEWS_URL.format(product_id=sku),
                        dont_filter=True,
                        callback=self.br.parse_buyer_reviews))

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        prods_per_page = response.meta.get('products_per_page')
        total_matches = response.meta.get('total_matches')
        scraped_results_per_page = response.meta.get(
            'scraped_results_per_page')

        prods = self._scrape_product_links(response)

        if prods_per_page is None:
            # Materialize prods to get its size.
            prods = list(prods)
            prods_per_page = len(prods)
            response.meta['products_per_page'] = prods_per_page

        if scraped_results_per_page is None:
            scraped_results_per_page = self._scrape_results_per_page(response)
            if scraped_results_per_page:
                self.log(
                    "Found %s products at the first page" %
                    scraped_results_per_page, INFO)
            else:
                scraped_results_per_page = prods_per_page
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to scrape number of products per page",
                            ERROR)
            response.meta[
                'scraped_results_per_page'] = scraped_results_per_page

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" %
                            response.url, ERROR)

        if total_matches and not prods_per_page:
            # Parsing the page failed. Give up.
            self.log("Failed to get products for %s" % response.url, ERROR)
            return

        for i, (prod_url, prod_item) in enumerate(islice(prods, 0, remaining)):
            # Initialize the product as much as possible.
            prod_item['site'] = self.site_name
            prod_item['search_term'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['results_per_page'] = prods_per_page
            prod_item['scraped_results_per_page'] = scraped_results_per_page
            # The ranking is the position in this page plus the number of
            # products from other pages.
            prod_item['ranking'] = (i + 1) + (self.quantity - remaining)
            if self.user_agent_key not in ["desktop", "default"]:
                prod_item['is_mobile_agent'] = True

            if prod_url is None:
                # The product is complete, no need for another request.
                yield prod_item
            elif isinstance(prod_url, Request):
                cond_set_value(prod_item, 'url', prod_url.url)  # Tentative.
                yield prod_url
            else:
                # Another request is necessary to complete the product.
                url = urlparse.urljoin(response.url, prod_url)
                cond_set_value(prod_item, 'url', url)  # Tentative.
                yield Request(
                    url,
                    callback=self.parse_product,
                    meta={'product': prod_item},
                    # Remove the Referer field on searches to make the
                    # website display the breadcrumbs
                    headers={'referer': ''},
                )
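
A worked example of the ranking formula above: with quantity=100 and
remaining=52 when a results page is processed, the first product on that
page gets ranking = (0 + 1) + (100 - 52) = 49.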
Beispiel #21
0
class PetcoProductsSpider(ProductsSpider):
    name = 'petco_products'
    allowed_domains = ['petco.com']

    SEARCH_URL = ("http://www.petco.com/shop/SearchDisplay?categoryId=&storeId"
                  "=10151&catalogId=10051&langId=-1&sType=SimpleSearch&"
                  "resultCatEntryType=2&showResultsPage=true&searchSource=Q&"
                  "pageView=&beginIndex=0&pageSize=48&fromPageValue=search"
                  "&searchTerm={search_term}")

    SEARCH_URL_2 = ("http://www.petco.com/shop/ProductListingView?searchType="
                    "12&filterTerm=&langId=-1&advancedSearch=&sType=Simple"
                    "Search&resultCatEntryType=2&catalogId=10051&searchTerm="
                    "{search_term}&resultsPerPage=48&emsName=&facet=&category"
                    "Id=&storeId=10151&beginIndex={begin_index}")

    REVIEW_URL = ("http://api.bazaarvoice.com/data/products.json?"
                  "passkey=dpaqzblnfzrludzy2s7v27ehz&apiversion=5.5"
                  "&filter=id:{product_id}&stats=reviews")

    PRICE_URL = "http://www.petco.com/shop/GetCatalogEntryDetailsByIDView"

    def __init__(self, *args, **kwargs):
        super(PetcoProductsSpider, self).__init__(*args, **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.product_last_page = 0

    def parse_buyer_reviews(self, response):
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(
            response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def _total_matches_from_html(self, response):
        total = response.xpath(
            '//*[contains(@id,"searchTotalCount")]/text()').re('\d+')
        return int(total[0].replace(',', '')) if total else 0

    def _scrape_results_per_page(self, response):
        return 48

    def _scrape_next_results_page_link(self, response):
        # End of pagination
        if not self.product_last_page:
            return None

        begin_index = int(re.search('beginIndex=(\d+)', response.url).group(1))
        num_products_page = self._scrape_results_per_page(response)
        st = response.meta['search_term']
        url = self.url_formatter.format(
            self.SEARCH_URL_2,
            search_term=urllib.quote_plus(st.encode('utf-8')),
            begin_index=str(begin_index + num_products_page))
        return url
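
    # Example: if the current URL carries beginIndex=0 and the page size
    # is 48, the next request is SEARCH_URL_2 with begin_index=48.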

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product-display-grid"]'
            '//*[@class="product-name"]/a/@href').extract()

        self.product_last_page = len(item_urls)
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//h1/text()').extract()
        return title[0].strip() if title else None

    def _parse_categories(self, response):
        categories = response.css('.breadcrumb a::text').extract()
        return categories if categories else None

    def _parse_category(self, response):
        categories = self._parse_categories(response)
        return categories[-1] if categories else None

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//*[@property="og:image"]/@content').extract()
        return image_url[0] if image_url else None

    def _parse_brand(self, response):
        brand = response.xpath('//*[@class="product-brand"]/a/text()').re(
            'by.(.*)')

        return brand[0].strip() if brand else None

    def _parse_sku(self, response):
        sku = response.xpath("//input[@id='primarySku']/@value").extract()
        if len(sku[0]) < 1:
            sku = response.css('.product-sku::text').re(u'SKU:.(\d+)')

        return sku[0] if sku else None

    def _parse_variants(self, response):
        variants = []

        try:
            variants_info = json.loads(
                response.xpath(
                    '//*[contains(@id,"entitledItem_")]/text()').extract()[0])
        except (IndexError, ValueError):
            variants_info = {}

        for attr_value in variants_info:
            attributes = {}
            variant_attribute = attr_value["Attributes"]
            attributes['price'] = attr_value["RepeatDeliveryPrice"]["price"]
            attributes['image_url'] = attr_value["ItemImage"]
            if variant_attribute:
                attr_text = attr_value["Attributes"].keys()[0].split('_')
                attributes[attr_text[0]] = attr_text[1]

            variants.append(attributes)

        return variants if variants else None
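
    # Example: a variant Attributes key like 'Size_Small' (hypothetical)
    # splits on '_' into attributes['Size'] = 'Small'.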

    def _parse_is_out_of_stock(self, response):
        status = response.xpath(
            '//*[@itemprop="availability" and @content="in_stock"]')
        return not bool(status)

    def _parse_shipping_included(self, response):
        pass

    def _parse_description(self, response):
        description = response.xpath('//*[@id="description"]').extract()

        return ''.join(description).strip() if description else None

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """

        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_product(self, response):
        reqs = []
        product = response.meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse categories
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Reseller_id
        cond_set_value(product, 'reseller_id', sku)

        # Brand
        brand = self._parse_brand(response)
        cond_set_value(product, 'brand', brand)

        product_id = response.xpath(
            '//*[@id="productPartNo"]/@value').extract()

        if product_id:
            reqs.append(
                Request(url=self.REVIEW_URL.format(product_id=product_id[0],
                                                   index=0),
                        dont_filter=True,
                        callback=self.parse_buyer_reviews,
                        meta=response.meta))

        price_id = response.xpath('//*[contains(@id,"entitledItem_")]/@id').re(
            'entitledItem_(\d+)')

        cat_id = response.xpath('//script/text()').re(
            'productDisplayJS.displayAttributeInfo\("(\d+)","(\d+)"')

        if not cat_id:
            cat_id = response.xpath(
                '//*[@name="firstAvailableSkuCatentryId_avl"]/@value').extract(
                )

        if price_id and cat_id:
            text = ("storeId=10151&langId=-1&catalogId=10051&"
                    "catalogEntryId={cat}&productId={prod_id}".format(
                        cat=cat_id[0], prod_id=price_id[0]))
            reqs.append(
                Request(self.PRICE_URL,
                        body=text,
                        headers={
                            'Content-Type':
                            'application/x-www-form-urlencoded',
                            'X-Requested-With': 'XMLHttpRequest'
                        },
                        method='POST',
                        meta=response.meta,
                        callback=self._parse_price,
                        dont_filter=True))

        else:
            prices = map(
                float,
                response.xpath('//*[@class="product-price"]//span/text()').re(
                    '\$([\d\.]+)'))
            product['price'] = Price(price=min(prices), priceCurrency="USD")

        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def _parse_price(self, response):
        reqs = response.meta.get('reqs', [])
        product = response.meta['product']

        raw_information = re.findall('\{.*\}', response.body,
                                     re.MULTILINE | re.DOTALL)[0]

        # parse as JSON rather than eval()-ing remote content
        product_data = json.loads(raw_information)
        price = product_data["catalogEntry"]["offerPrice"]
        product['price'] = Price(price=price, priceCurrency="USD")

        if reqs:
            return self.send_next_request(reqs, response)

        return product
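
    # The parser above expects the POST response to embed JSON shaped
    # roughly like (field names from the code, value hypothetical):
    #
    #   {"catalogEntry": {"offerPrice": "5.99"}}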
Beispiel #22
0
class NikeProductSpider(BaseProductsSpider):
    name = 'nike_products'
    allowed_domains = ["nike.com"]

    SEARCH_URL = "http://nike.com/#{search_term}"

    REVIEW_URL = "http://nike.ugc.bazaarvoice.com/9191-en_us/{product_model}" \
                 "/reviews.djs?format=embeddedhtml"

    #handle_httpstatus_list = [404, 403, 429]

    use_proxies = False  # we'll be using Crawlera instead

    def __init__(self, sort_mode=None, *args, **kwargs):
        from scrapy.conf import settings
        settings.overrides['DEPTH_PRIORITY'] = 1
        settings.overrides[
            'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
        settings.overrides[
            'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'
        settings.overrides['CRAWLERA_ENABLED'] = True

        self.quantity = kwargs.get('quantity', 1000)  # default is 1000

        self.proxy = 'content.crawlera.com:8010'
        self.proxy_type = 'http'
        #self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'

        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(NikeProductSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    @staticmethod
    def _get_antiban_headers():
        return {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0',
            'Connection': 'keep-alive',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate'
        }

    def start_requests(self):
        for st in self.searchterms:
            yield Request(self.url_formatter.format(
                self.SEARCH_URL,
                search_term=st.encode('utf-8'),
            ),
                          meta={
                              'search_term': st,
                              'remaining': self.quantity
                          },
                          headers=self._get_antiban_headers())
        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            meta = {}
            meta['is_product_page'] = True
            meta['product'] = prod
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta=meta,
                          headers=self._get_antiban_headers())

    def _init_firefox(self):
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        profile.set_preference('intl.accept_languages', 'en-US')
        profile.set_preference("network.proxy.type",
                               1)  # manual proxy configuration
        profile.set_preference('permissions.default.image', 2)
        if self.proxy:
            profile.set_preference("network.http.phishy-userpass-length", 255)
            if 'socks' in self.proxy_type:
                profile.set_preference("network.proxy.socks",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.socks_port",
                                       int(self.proxy.split(':')[1]))
            else:
                profile.set_preference("network.proxy.http",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.http_port",
                                       int(self.proxy.split(':')[1]))
        profile.update_preferences()
        driver = webdriver.Firefox(profile)
        driver.set_window_size(1280, 1024)
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        return driver
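
    # Usage sketch: self.proxy is 'host:port' (here Crawlera's
    # content.crawlera.com:8010); the configured driver is then used as
    # usual, e.g.:
    #
    #   driver = self._init_firefox()
    #   driver.get('http://store.nike.com/us/en_us')
    #   driver.quit()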

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _get_product_links_from_serp(self, driver):
        results = []
        links = driver.find_elements_by_xpath(
            '//*[contains(@class, "grid-item-image")]'
            '//a[contains(@href, "/pd/") or contains(@href, "/product/")]')

        for l in links:
            href = l.get_attribute('href')
            if href:
                if not href.startswith('http'):
                    href = urlparse.urljoin(
                        'http://' + self.allowed_domains[0], href)
                results.append(href)
        return results

    def _is_product_page(self, response):
        return 'is_product_page' in response.meta

    @staticmethod
    def _get_proxy_ip(driver):
        driver.get('http://icanhazip.com')
        ip = re.search('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
                       driver.page_source)
        if ip:
            ip = ip.group(1)
            return ip

    @staticmethod
    def _auth_firefox_proxy(driver):
        driver.set_page_load_timeout(10)
        try:
            driver.get('http://icanhazip.com')
        except:
            from selenium.webdriver.common.alert import Alert
            time.sleep(3)
            alert = Alert(driver)
            time.sleep(3)
            #alert.authenticate(CRAWLERA_APIKEY, '')
            alert.send_keys(CRAWLERA_APIKEY + '\n')
            alert.accept()
            #alert.send_keys('\t')
            #alert.send_keys('\n')
            #import pdb; pdb.set_trace()
        driver.set_page_load_timeout(30)

    @staticmethod
    def last_five_digits_the_same(lst):
        """Return True when the last five collected counts are equal,
        i.e. scrolling has stopped adding new products."""
        if len(lst) < 6:
            return
        return lst[-1] == lst[-2] == lst[-3] == lst[-4] == lst[-5]
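
    # Example: [12, 24, 24, 24, 24, 24] -> True (the count stopped
    # growing), while [12, 24, 36, 48, 60, 72] -> False.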

    def _reliable_get(self,
                      driver,
                      url,
                      max_attempts=40,
                      check_element='title'):
        """ Acts like driver.get() but with failsafety """
        driver.set_page_load_timeout(30)
        driver.set_script_timeout(30)
        for i in range(max_attempts):
            try:
                driver.get(url)
                if driver.find_elements_by_xpath('//%s' % check_element):
                    return driver
            except:
                self.log('_reliable_get error #%i while getting url %s' %
                         (i, url))
        self.log('_reliable_get failed to get url %s' % url, ERROR)

    def parse(self, response):

        if not self._is_product_page(response):
            display = Display(visible=0, size=(1024, 768))
            display.start()

            product_links = []
            # scrape "quantity" products
            driver = self._init_firefox()
            self._auth_firefox_proxy(driver)
            if self.proxy:
                ip_via_proxy = NikeProductSpider._get_proxy_ip(driver)
                print 'IP via proxy:', ip_via_proxy
                self.log('IP via proxy: %s' % ip_via_proxy)
            try:
                self._reliable_get(driver, 'http://store.nike.com/us/en_us')
            except Exception as e:
                print(str(e))
                self.log(str(e), WARNING)
            driver.find_element_by_name('searchList').send_keys(
                self.searchterms[0] + '\n')
            time.sleep(6)  # let AJAX finish
            new_meta = response.meta.copy()
            # get all products we need (scroll down)
            collected_products_len = []
            num_exceptions = 0
            while 1:
                try:
                    driver.execute_script("scrollTo(0,50000)")
                    time.sleep(10)
                    product_links = self._get_product_links_from_serp(driver)
                    collected_products_len.append(len(product_links))
                    print 'Collected %i product links' % len(product_links)
                    self.log('Collected %i product links' % len(product_links))
                    if len(product_links) > self.quantity:
                        break
                    if self.last_five_digits_the_same(collected_products_len):
                        break  # last five iterations collected equal num of products
                except Exception as e:
                    print str(e)
                    self.log('Exception while scrolling page: %s' % str(e),
                             WARNING)
                    num_exceptions += 1
                    if num_exceptions > 10:
                        self.log('Maximum number of exceptions reached', ERROR)
                        driver.quit()
                        display.stop()
                        return

            for i in xrange(10):
                time.sleep(3)
                try:
                    selenium_cookies = driver.get_cookies()
                    break
                except Exception as e:
                    print('Exception while loading cookies %s attempt %i' %
                          (str(e), i))
                    self.log('Exception while loading cookies %s attempt %i' %
                             (str(e), i))
            try:
                driver.quit()
                display.stop()
            except:
                pass
            #driver.save_screenshot('/tmp/1.png')
            new_meta['is_product_page'] = True
            new_meta['proxy'] = self.proxy
            for i, product_link in enumerate(product_links):
                new_meta['_ranking'] = i + 1
                yield Request(product_link,
                              meta=new_meta,
                              callback=self.parse_product,
                              headers=self._get_antiban_headers())
                #cookies=selenium_cookies)

    def parse_product(self, response):
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())

        product['_subitem'] = True

        _ranking = response.meta.get('_ranking', None)
        product['ranking'] = _ranking
        product['url'] = response.url
        product['search_term'] = response.meta.get('search_term', None)

        # product data in json
        js_data = self.parse_data(response)

        # product id
        product_id = self.parse_product_id(response, js_data)

        product_color = self.parse_product_color(response, js_data)
        product_price = 0

        # Parse title
        title = self.parse_title(response, js_data)
        cond_set_value(product, 'title', title)

        if not product.get('title', None):
            return

        # Parse locale
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse model
        product_model = self.parse_product_model(response)
        cond_set_value(product, 'model', product_model)

        # Parse image
        image = self.parse_image(response, js_data)
        cond_set_value(product, 'image_url', image)

        # Parse reseller_id
        reseller_id = self.parse_reseller_id(response)
        cond_set_value(product, "reseller_id", reseller_id)

        # Parse brand
        # brand = self.parse_brand(response)
        # cond_set_value(product, 'brand', brand)

        # Parse upc
        # upc = self.parse_upc(response)
        # cond_set_value(product, 'upc', upc)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse description
        description = self.parse_description(response)
        cond_set(product, 'description', description)

        # Parse price
        price = self.parse_price(response, js_data)
        cond_set_value(product, 'price', price)

        # Parse variants
        nv = NikeVariants()
        nv.setupSC(response)
        try:
            product['variants'] = nv._variants()
        except:  # "/product/" urls that are non-standard and not supported (yet)?
            pass
        meta['product'] = product

        # parse buyer reviews
        yield Request(url=self.REVIEW_URL.format(product_model=product_model),
                      dont_filter=True,
                      callback=self.parse_buyer_reviews,
                      meta=meta)
        yield product

    def parse_reseller_id(self, response):
        regex = "\/pid-(\d+)"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        return reseller_id

    def parse_count_reviews(self, response):
        count_review = response.xpath(
            '//meta[contains(@itemprop, "reviewCount")]/@content').extract()
        if count_review:
            return int(count_review[0])
        else:
            return 0

    def parse_data(self, response):
        script_data = response.xpath(
            '//script[contains(@id, "product-data")]/text()').extract()
        try:
            js_data = json.loads(script_data[0])
            return js_data
        except (ValueError, IndexError):
            return
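
    # Sketch of the product-data JSON consumed by the parsers below
    # (keys from the code, values hypothetical):
    #
    #   {"productId": "12345", "productTitle": "Nike Air Zoom",
    #    "rawPrice": 120.0, "colorDescription": "Black/White",
    #    "imagesHeroLarge": ["http://.../hero.jpg"],
    #    "crossSellConfiguration": {"currency": "USD"}}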

    def parse_image(self, response, js_data):
        if js_data:
            try:
                image = js_data['imagesHeroLarge'][0]
                return image
            except (KeyError, IndexError):
                return

    def parse_description(self, response):
        # js_data['content']
        desc = response.xpath(
            '//div[contains(@class, "pi-pdpmainbody")]').extract()

        return desc

    def parse_sku(self, response):
        skuid = response.xpath(
            '//span[contains(@class, "exp-style-color")]/text()').extract()
        if skuid:
            return skuid[0].replace('Style: ', '')

    def parse_price(self, response, js_data):
        if js_data:
            try:
                currency = js_data['crossSellConfiguration']['currency']
            except KeyError:
                currency = "USD"
            try:
                price = js_data['rawPrice']
                self.product_price = price
            except KeyError:
                price = 0.00
            if price and currency:
                price = Price(price=price, priceCurrency=currency)
        else:
            price_og = re.search(
                '<meta property="og:price:amount" content="([\d\.]+)" />',
                response.body_as_unicode())
            if price_og:
                return Price(price=float(price_og.group(1)),
                             priceCurrency="USD")
            price = Price(price=0.00, priceCurrency="USD")
        return price

    def _scrape_total_matches(self, response):
        totals = response.css('.productCount ::text').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        for link in response.xpath(
                '//li[contains(@class, "product-tile")]'
                '//a[contains(@rel, "product")]/@href').extract():
            yield link, SiteProductItem()

    def parse_product_id(self, response, js_data):
        if js_data:
            try:
                product_id = js_data['productId']
                return product_id
            except KeyError:
                return

    def parse_product_model(self, response):
        model = response.xpath(
            '//div[contains(@class, "hero-product-style-color-info")]/@data-stylenumber'
        ).extract()
        return model[0] if model else None

    def parse_product_color(self, response, js_data):
        if js_data:
            try:
                product_color = js_data['colorDescription']
                return product_color
            except KeyError:
                return

    def parse_title(self, response, js_data):
        if js_data:
            try:
                title = js_data['productTitle']
                return title
            except KeyError:
                return

    def parse_buyer_reviews(self, response):
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)
        product = response.meta['product']
        product['buyer_reviews'] = BuyerReviews(**buyer_reviews_per_page)
        yield product
Beispiel #23
0
    def __init__(self, *args, **kwargs):
        super(PetcoProductsSpider, self).__init__(*args, **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.product_last_page = 0
Beispiel #24
0
class TopshopProductsSpider(ProductsSpider):
    name = 'topshop_products'

    allowed_domains = ['topshop.com']

    SEARCH_URL = "http://us.topshop.com/webapp/wcs/stores/servlet/CatalogNavigationSearchResultCmd?" \
                 "langId=-1&storeId=13052&catalogId=33060&Dy=1&Nty=1&beginIndex=1&pageNum=1&Ntt={search_term}"

    _REVIEWS_URL = 'http://topshop.ugc.bazaarvoice.com/6025-en_us/{sku}/reviews.djs?format=embeddedhtml'

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(TopshopProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def _total_matches_from_html(self, response):
        total = response.xpath('(//*[@class="pager"]//*[@class="amount"]'
                               '/text())[1]').re('of (\d+)')

        return int(total[0]) if total else 0

    def _scrape_results_per_page(self, response):
        results_per_page = response.xpath(
            '//*[@class="limiter"]//option[@selected]/text()').re('\d+')
        return int(results_per_page[0]) if results_per_page else 0

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@title="Next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product-name"]/a/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//*[@itemprop="name"]/text()').extract()
        return title[0] if title else None

    def _parse_category(self, response):
        categories = response.xpath(
            '//*[@id="nav_breadcrumb"]//li//a//span//text()').extract()
        return categories[-1] if categories else None

    def _parse_price(self, response):
        price = response.xpath(
            '//div[contains(@class,"product_details")]'
            '//div[contains(@class,"product_prices")]//span//text()').extract(
            )
        if len(price) < 2:
            return None
        price = price[1]
        currency = 'USD' if '$' in price else ''
        price = re.findall(r'[\d\.]+', price)
        if not price:
            return None

        return Price(price=price[0], priceCurrency=currency)

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//ul[contains(@class,"product_hero__wrapper")]'
            '//a[contains(@class,"hero_image_link")]//img/@src').extract()
        return image_url[0] if image_url else None

    def _parse_variants(self, response):
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath('//*[@itemprop="availability" '
                                'and not(@href="http://schema.org/InStock")]')
        return bool(status)

    def _parse_description(self, response):
        description = response.xpath(
            '//div[@id="productInfo"]//p//text()').extract()
        return ''.join(description).strip() if description else None

    def clear_text(self, str_result):
        return str_result.replace("\t", "").replace("\n",
                                                    "").replace("\r",
                                                                "").strip()

    def _parse_buyer_reviews(self, response):
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])

        content = re.search('BVRRRatingSummarySourceID":"(.+?)\},',
                            response.body).group(1).replace('\\"', '"')
        content = content.replace("\\/", "/")
        review_html = html.fromstring(content)

        arr = review_html.xpath(
            '//div[contains(@class,"BVRRQuickTakeSection")]'
            '//div[contains(@class,"BVRRRatingOverall")]'
            '//img[contains(@class,"BVImgOrSprite")]/@title')

        if len(arr) > 0:
            average_rating = float(arr[0].strip().split(" ")[0])
        else:
            average_rating = 0.0

        arr = review_html.xpath(
            '//div[contains(@class,"BVRRReviewDisplayStyle5")]'
            '//div[contains(@class,"BVRRReviewDisplayStyle5Header")]'
            '//span[@itemprop="ratingValue"]//text()')
        num_of_reviews = len(arr)

        review_list = [[5 - i, arr.count(str(5 - i))] for i in range(5)]

        if review_list:
            # recompute the number of reviews from the star counts
            num_of_reviews = 0
            for i, review in review_list:
                num_of_reviews += review

        rating_by_star = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for i, review in review_list:
            rating_by_star[i] = review
        if average_rating and num_of_reviews:
            product["buyer_reviews"] = BuyerReviews(
                num_of_reviews=int(num_of_reviews),
                average_rating=float(average_rating),
                rating_by_star=rating_by_star,
            )
        else:
            product["buyer_reviews"] = ZERO_REVIEWS_VALUE

        if reqs:
            return self.send_next_request(reqs, response)

        return product
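
    # Worked example: if the embedded markup yields ratingValue strings
    # ['5', '5', '4'], review_list is [[5, 2], [4, 1], [3, 0], [2, 0],
    # [1, 0]], num_of_reviews is 3, and rating_by_star becomes
    # {1: 0, 2: 0, 3: 0, 4: 1, 5: 2}.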

    def _parse_last_buyer_date(self, response):
        last_review_date = response.xpath(
            '//*[contains(@class,"box-reviews")]'
            '//*[@class="date"]/text()').re('Posted on (.*)\)')
        return last_review_date[0] if last_review_date else None

    def _parse_sku(self, response):
        sku = response.xpath(
            '//div[@id="productInfo"]//li[contains(@class,"product_code")]//span//text()'
        ).extract()
        return sku[0] if sku else None

    def _parse_brand(self, response, title):
        brand = response.xpath('.//*[contains(text(), "Shop all")]/text()').re(
            r'Shop\sall\s+(\S+)\s?')
        brand = brand[0].strip() if brand else None
        if not brand:
            try:
                brand = guess_brand_from_first_words(title)
            except Exception:
                brand = None
        return brand

    def parse_product(self, response):
        reqs = []
        meta = response.meta.copy()
        product = meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Parse brand
        brand = self._parse_brand(response, product.get('title'))
        cond_set_value(product, 'brand', brand)

        # Parse last buyer review date
        last_buyer_date = self._parse_last_buyer_date(response)
        cond_set_value(product, 'last_buyer_review_date', last_buyer_date)

        # Parse reviews
        reqs.append(
            Request(url=self._REVIEWS_URL.format(sku=product['sku']),
                    dont_filter=True,
                    callback=self._parse_buyer_reviews,
                    meta=meta))
        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)
Beispiel #25
0
class RiteAidProductsSpider(ProductsSpider):
    name = 'riteaid_products'

    allowed_domains = ['riteaid.com']

    SEARCH_URL = "https://shop.riteaid.com/catalogsearch/result/"\
                 "?limit=72&q={search_term}"

    _REVIEWS_URL = 'http://api.bazaarvoice.com/data/reviews.json?apiversion=5.5&passkey=tezax0lg4cxakub5hhurfey5o&' \
                   'Filter=ProductId:{sku}&Include=Products&Stats=Reviews'

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(RiteAidProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _total_matches_from_html(self, response):
        total = response.xpath(
            '(//*[@class="pager"]//*[@class="amount"]'
            '/text())[1]').re('of (\d+)')

        return int(total[0]) if total else 0

    def _scrape_results_per_page(self, response):
        results_per_page = response.xpath(
            '//*[@class="limiter"]//option[@selected]/text()').re('\d+')
        return int(results_per_page[0]) if results_per_page else 0

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@title="Next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product-name"]/a/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//*[@itemprop="name"]/text()').extract()
        return title[0] if title else None

    def _parse_category(self, response):
        categories = response.xpath(
            '(//a[@property="v:title"]/text())[position()>1]').extract()
        return categories[-1] if categories else None

    def _parse_price(self, response):
        price = response.xpath('//*[@itemprop="price"]/text()').re(r'[\d\.]+')
        currency = response.xpath(
            '//*[@itemprop="priceCurrency"]/@content').re('\w{2,3}') or ['USD']

        if not price:
            return None

        return Price(price=price[0], priceCurrency=currency[0])

    def _parse_image_url(self, response):
        image_url = response.xpath('//*[@itemprop="image"]/@src').extract()
        return image_url[0] if image_url else None

    def _parse_variants(self, response):
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath(
            '//*[@itemprop="availability" '
            'and not(@href="http://schema.org/InStock")]')
        return bool(status)

    def _parse_description(self, response):
        description = response.xpath(
            '(//*[@id="collateral-tabs"]//*[@class="tab-container"])[1]'
            '//*[self::p or self::ul or self::table] | '
            '(//*[@id="collateral-tabs"]//*[@class="tab-container"])[1]'
            '//*[@class="std"]/text()').extract()
        return ''.join(description).strip() if description else None

    def _parse_buyer_reviews(self, response):
        contents = response.body_as_unicode()
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])
        buyer_reviews = {}
        sku = product.get('sku')
        if not product.get('buyer_reviews'):
            contents = json.loads(contents)
            incl = contents.get('Includes')
            brs = incl.get('Products').get(sku) if incl else None
            if brs:
                by_star = {}
                for d in brs['ReviewStatistics']['RatingDistribution']:
                    by_star[str(d['RatingValue'])] = d['Count']
                for sc in range(1, 6):
                    if str(sc) not in by_star:
                        by_star[str(sc)] = 0
                buyer_reviews['rating_by_star'] = by_star
                review_count = brs['ReviewStatistics']['TotalReviewCount']

                if review_count == 0:
                    product['buyer_reviews'] = ZERO_REVIEWS_VALUE
                    return product

                buyer_reviews['num_of_reviews'] = review_count
                average_review = brs['ReviewStatistics']['AverageOverallRating']
                average_review = float(format(average_review, '.2f'))
                buyer_reviews['average_rating'] = average_review

                product['buyer_reviews'] = BuyerReviews(**buyer_reviews)
            else:
                product['buyer_reviews'] = ZERO_REVIEWS_VALUE

        if not product.get('buyer_reviews'):
            product['buyer_reviews'] = ZERO_REVIEWS_VALUE

        if reqs:
            return self.send_next_request(reqs, response)

        return product
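
    # For reference, a trimmed sketch of the Bazaarvoice reviews.json payload
    # this method walks -- only the keys it touches, with invented values:
    #
    #     {"Includes": {"Products": {"<sku>": {"ReviewStatistics": {
    #         "TotalReviewCount": 17,
    #         "AverageOverallRating": 4.2353,
    #         "RatingDistribution": [{"RatingValue": 5, "Count": 10},
    #                                {"RatingValue": 4, "Count": 4},
    #                                {"RatingValue": 3, "Count": 2},
    #                                {"RatingValue": 1, "Count": 1}]}}}}}
    #
    # Missing star buckets are filled with zero, so the distribution above
    # becomes rating_by_star == {'1': 1, '2': 0, '3': 2, '4': 4, '5': 10}.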

    def _parse_last_buyer_date(self, response):
        last_review_date = response.xpath(
            '//*[contains(@class,"box-reviews")]'
            '//*[@class="date"]/text()').re('Posted on (.*)\)')
        return last_review_date[0] if last_review_date else None

    def _parse_sku(self, response):
        sku = response.xpath('.//*[@itemprop="sku"]/@content').extract()
        return sku[0] if sku else None

    def _parse_brand(self, response, title):
        brand = response.xpath('.//*[contains(text(), "Shop all")]/text()').re(r'Shop\sall\s+(\S+)\s?')
        brand = brand[0].strip() if brand else None
        if not brand:
            brand = guess_brand_from_first_words(title)
        return brand
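
    # The 'Shop all <brand>' link text is matched by the regex above; invented
    # sample: re.search(r'Shop\sall\s+(\S+)\s?', 'Shop all Colgate').group(1)
    # -> 'Colgate'. When that link is absent, guess_brand_from_first_words()
    # falls back to the leading words of the product title.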

    def parse_product(self, response):
        reqs = []
        meta = response.meta.copy()
        product = meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse reseller_id
        cond_set_value(product, 'reseller_id', sku)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Parse brand
        brand = self._parse_brand(response, product.get('title'))
        cond_set_value(product, 'brand', brand)

        # Parse last buyer review date
        last_buyer_date = self._parse_last_buyer_date(response)
        cond_set_value(product, 'last_buyer_review_date', last_buyer_date)

        # Parse reviews
        reqs.append(
            Request(
                url=self._REVIEWS_URL.format(sku=product['sku']),
                dont_filter=True,
                callback=self._parse_buyer_reviews,
                meta=meta
            ))
        if reqs:
            return self.send_next_request(reqs, response)

        return product

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs

        return req.replace(meta=new_meta)
Example #26
0
class OrientaltradingProductsSpider(BaseProductsSpider):
    name = 'orientaltrading_products'
    allowed_domains = ['orientaltrading.com', "www.orientaltrading.com"]
    start_urls = []

    SEARCH_URL = "http://www.orientaltrading.com/web/search/searchMain?Ntt={search_term}"

    PAGINATE_URL = "http://www.orientaltrading.com/web/search/searchMain?Nrpp=64&No={nao}&Ntt={search_term}"

    CURRENT_NAO = 0
    PAGINATE_BY = 64  # 64 products
    TOTAL_MATCHES = None  # for pagination

    REVIEW_URL = "http://orientaltrading.ugc.bazaarvoice.com/0713-en_us/{product_id}" \
                 "/reviews.djs?format=embeddedhtml&page={index}&"

    VARIANT_PRODUCT = 'http://www.orientaltrading.com/web/browse/processProductsCatalog'

    #use_proxies = True

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(OrientaltradingProductsSpider,
              self).__init__(site_name=self.allowed_domains[0],
                             *args,
                             **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def parse_product(self, response):
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())
        reqs = []
        meta['reqs'] = reqs

        # Parse locale
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title)

        # Parse image
        image = self.parse_image(response)
        cond_set(product, 'image_url', image)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse reseller_id
        cond_set_value(product, "reseller_id", sku)

        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse description
        description = self.parse_description(response)
        cond_set(product, 'description', description)

        product['related_products'] = self.parse_related_product(response)

        otv = OrientaltradingVariants()
        otv.setupSC(response)
        _variants = otv._variants()
        if _variants:
            product['variants'] = _variants

        # reqs = self.parse_variants(response, reqs)

        # Parse reviews
        reqs.append(
            Request(url=self.REVIEW_URL.format(
                product_id=product['sku'].replace('/', '_'), index=0),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta))

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def clear_text(self, str_result):
        return str_result.replace("\t", "").replace("\n", "").replace(
            "\r", "").replace(u'\xa0', ' ').strip()

    def send_next_request(self, reqs, response):
        """
        Helps to handle several requests
        """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_related_product(self, response):
        related_prods = []
        urls = response.xpath(
            '//div[contains(@class, "ymal-content-wrapper")]/p/a/@href'
        ).extract()
        titles = response.xpath(
            '//div[contains(@class, "ymal-content-wrapper")]/p/a/text()'
        ).extract()  # Title

        for title, url in zip(titles, urls):
            if url and title:
                related_prods.append(RelatedProduct(title=title, url=url))

        related_products = {}
        if related_prods:
            related_products['you may also like'] = related_prods

        return related_products
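
    # The result is keyed by section name, e.g. (values invented):
    #     {'you may also like': [RelatedProduct(title='Neon Balloons',
    #                                           url='http://...')]}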

    def parse_title(self, response):
        title = response.xpath(
            '//meta[contains(@property, "og:title")]/@content').extract()
        return title

    def parse_image(self, response):
        img = response.xpath(
            '//meta[contains(@property, "og:image")]/@content').extract()
        return img

    def parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class, "pd-text-bloc")] | //p[contains(@class, "pd-text-bloc")]'
        ).extract()
        if description:
            return description
        else:
            return ''

    def parse_sku(self, response):
        sku = response.xpath(
            '//input[contains(@id, "productsku")]/@value').extract()
        if sku:
            return sku[0]

    def parse_productid(self, response):
        model = response.xpath(
            '//input[contains(@id, "productId")]/@value').extract()
        if model:
            return model[0]

    def parse_price(self, response):
        price = response.xpath(
            '//p[contains(@id, "pd-price")]/text()').extract()
        if price:
            price = self.clear_text(price[0].replace('NOW',
                                                     '').replace('$', ''))
            return Price(price=price, priceCurrency="USD")
        else:
            return Price(price=0.00, priceCurrency="USD")

    """
    def parse_variants(self, response, reqs):

        select_variants = response.xpath('//fieldset[contains(@class, "select-options")]/select')
        if select_variants:

            OTC_CSRFTOKEN = response.xpath('//input[contains(@name, "OTC_CSRFTOKEN")]/@value').extract()
            prefix = response.xpath('//input[contains(@id, "prefix")]/@value').extract()
            productId = response.xpath('//input[contains(@id, "productId")]/@value').extract()
            parentSku = response.xpath('//input[contains(@id, "parentSku")]/@value').extract()
            demandPrefix = response.xpath('//input[contains(@id, "demandPrefix")]/@value').extract()
            pznComponentIndex = response.xpath('//input[contains(@id, "pznComponentIndex")]/@value').extract()
            pznHiddenData = response.xpath('//input[contains(@id, "pznHiddenData")]/@value').extract()
            pznImageName = response.xpath('//input[contains(@id, "pznImageName")]/@value').extract()
            destinationDisplayJSP = response.xpath('//input[contains(@name, "destinationDisplayJSP")]/@value').extract()
            requestURI = response.xpath('//input[contains(@name, "requestURI")]/@value').extract()
            numberOfAttributes = response.xpath('//input[contains(@id, "numberOfAttributes")]/@value').extract()
            categoryId = response.xpath('//input[contains(@id, "categoryId")]/@value').extract()
            mode = response.xpath('//input[contains(@id, "mode")]/@value').extract()
            quantity = response.xpath('//input[contains(@name, "quantity")]/@value').extract()

            params = {'OTC_CSRFTOKEN': OTC_CSRFTOKEN[0],
                      'categoryId': categoryId[0],
                      'demandPrefix': demandPrefix[0],
                      'destinationDisplayJSP': destinationDisplayJSP[0],
                      'mode': mode[0],
                      'numberOfAttributes': numberOfAttributes[0],
                      'parentSku': parentSku[0],
                      'prefix': prefix[0],
                      'productId': productId[0],
                      'pznComponentIndex': pznComponentIndex[0],
                      'pznHiddenData': pznHiddenData[0],
                      'pznImageName': pznImageName[0],
                      'quantity': quantity[0],
                      'requestURI': requestURI[0],
                      'sku': '',
                      }

            for v in select_variants:
                name = v.xpath('@name').extract()
                options = v.xpath('option/@value').extract()
                for opt in options:
                    if opt:
                        # TODO: get variant sku for params['sku']
                        # url = 'http://www.orientaltrading.com/rest/ajax/'
                        # post_data = {'formData': "{\"sku\":\"%s\",\"uniqueIdentifier\":\"\",\"nameArray\":[\"%s\"],"
                        #                          "\"valueArray\":[\"%s\"],\"command\":\"AttributeSkuLookup\"}" % (sku, name[0], opt[0]),
                        #              'requestURI': "/"
                        #              }
                        # reqs.append(FormRequest(url=url, formdata=post_data, callback=self.get_sku_attribute))

                        params[name[0]] = opt
                        reqs.append(FormRequest(url=self.VARIANT_PRODUCT,
                                                formdata=params,
                                                callback=self.parse_variants_info))

        return reqs
    """
    """
    def parse_variants_info(self, response):
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        sku = self.parse_sku(response)
        price = self.parse_price(response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product
    """

    def get_sku_attribute(self, response):
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        jsondata = json.loads(response.body_as_unicode())

        # {"uniqueIdentifier":"","parentSku":"13578611","attributeSku":"13582742"}
        new_sku = jsondata['attributeSku']

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def parse_buyer_reviews(self, response):

        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        product['buyer_reviews'] = self.br.parse_buyer_reviews_per_page(
            response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def _scrape_total_matches(self, response):
        data = re.findall(r'site_search_results: "(.+)"',
                          response.body_as_unicode())
        if data and data[0].isdigit():
            total = int(data[0])
            if not self.TOTAL_MATCHES:
                self.TOTAL_MATCHES = total
            return total
        return 0
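
    # The total comes from an inline-JS analytics variable; invented fragment:
    #     body = 'var tracking = { site_search_results: "137" };'
    #     re.findall(r'site_search_results: "(.+)"', body)  -> ['137']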

    def _scrape_product_links(self, response):
        for link in response.xpath(
                '//div[contains(@id, "tableSearchResultsPhoto")]/a/@href'
        ).extract():
            yield link, SiteProductItem()

    # def _get_nao(self, url):
    #     nao = re.search(r'pn=(\d+)', url)
    #     if not nao:
    #         return
    #     return int(nao.group(1))
    #
    # def _replace_nao(self, url, new_nao):
    #     current_nao = self._get_nao(url)
    #     if current_nao:
    #         return re.sub(r'nao=\d+', 'pn=' + str(new_nao), url)
    #     else:
    #         return url + '&pn=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            return
        if self.CURRENT_NAO > self.TOTAL_MATCHES + self.PAGINATE_BY:
            return  # it's over
        self.CURRENT_NAO += self.PAGINATE_BY
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
                       callback=self.parse,
                       meta=response.meta)
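
Pagination in this spider is offset-based: `CURRENT_NAO` advances by `PAGINATE_BY` (64) on each call until it overshoots the cached total by one page. A standalone sketch of the URL sequence this produces, with an invented search term and total:

PAGINATE_URL = ("http://www.orientaltrading.com/web/search/searchMain"
                "?Nrpp=64&No={nao}&Ntt={search_term}")
PAGINATE_BY = 64
total_matches = 200   # would come from _scrape_total_matches
nao = 0
while not nao > total_matches + PAGINATE_BY:
    nao += PAGINATE_BY
    print(PAGINATE_URL.format(nao=nao, search_term='balloons'))
# -> No=64, No=128, No=192, No=256, No=320, then the loop stops,
# mirroring _scrape_next_results_page_link above.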
Example #27
0
 def __init__(self, *args, **kwargs):
     super(ChewyProductsSpider, self).__init__(*args, **kwargs)
     self.br = BuyerReviewsBazaarApi(called_class=self)
Example #28
0
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)

        super(DebenhamsProductSpider, self).__init__(*args, **kwargs)
Example #29
0
    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        self.start_index = 0

        super(MicrosoftStoreProductSpider, self).__init__(*args, **kwargs)