Ejemplo n.º 1
0
 def get_asins_generator(self):
     """Yield (asin, product_number) pairs from the CSV at self.file_path.

     Rows whose AMAZON column is empty/whitespace are skipped; the ASIN is
     extracted from the row's AMAZON url.
     """
     with open(self.file_path) as csv_file:
         for record in csv.DictReader(csv_file):
             amazon_url = record['AMAZON']
             if not amazon_url.strip():
                 continue
             asin = AmazonUrlCreator.get_product_asin_from_url(amazon_url)
             yield (asin, record['PRODUCT_NUMBER'])
    def __parse_product(self, response):
        """
        Parse product just to get seller name

        Scrapes the product details page via ``self.scraper``, patches
        stock/sku/price/dealer fields with extra XPath fallbacks, validates
        the product against the current search (``search`` spiders only),
        then -- depending on the spider mode (``amazon_direct`` /
        ``only_buybox`` / ``all_sellers`` / ``lowest_product_and_seller``) --
        follows option variants, review pages or MBC (more-buying-choices)
        listings, or collects the product.

        Generator: yields scrapy ``Request`` objects and, for ``category``
        spiders, product items.
        """
        # Antibot/captcha page: schedule a retry (when allowed) or just warn.
        if self.scraper.antibot_protection_raised(response.body_as_unicode()):
            if self.do_retry:
                yield self.retry_download(url=response.url,
                                          metadata=response.meta,
                                          callback=self.parse_product)
            else:
                self.log('[AMAZON] WARNING: Amazon antibot protection detected, consider using proxy/tor, url: [{}]'.format(
                    response.url))
        # NOTE(review): execution falls through and keeps parsing even when a
        # retry was scheduled above -- confirm this is intentional.

        product_info = self.scraper.scrape_product_details_page(response)
        if not product_info:
            self.log("[AMAZON] WARNING: no product info: %s" % response.url)
            return
        # Fix self.current_search_item and meta['search_item']. Needed for BSM spider

        hxs = HtmlXPathSelector(response)

        # Availability markers (storefront CSS classes) force stock to '0'.
        out_of_stock = hxs.select('//div[@class="availRed"]').extract()
        not_available = hxs.select('//span[@class="availOrange"]').extract()
        if out_of_stock or not_available:
            product_info['stock'] = '0'

        # SKU fallback chain: "Modellnummer" (model number) table row, then
        # list item, then "Herstellerreferenz" (manufacturer reference) list
        # item and table row.
        sku = hxs.select('//tr[td[text()="Modellnummer"]]/td[@class="value"]/text()').extract()
        if sku:
            product_info['sku'] = sku[0]
        else:
            product_info['sku'] = ''.join(hxs.select('//li[b/text()="Modellnummer:"]/text()').extract()).strip()
            if not product_info['sku']:
                product_info['sku'] = ''.join(hxs.select('//li[contains(b/text(), "Herstellerreferenz")]/text()').extract()).strip()
                if not product_info['sku']:
                    product_info['sku'] = ''.join(hxs.select('//tr[contains(td/text(), "Herstellerreferenz")]/td[@class="value"]/text()').extract()).strip()


        # Price/dealer fallbacks when the scraper found no price.
        if not product_info['price']:
            product_info['price'] = ''.join(hxs.select('//td/b[@class="priceLarge"]/text()').extract())
            if not product_info['price']:
                product_info['price'] = ''.join(hxs.select('//span[@id="priceblock_ourprice"]/text()').extract())

            product_info['dealer'] = ''.join(hxs.select('//div[@class="buying"]/b/a[contains(@href, "seller")]/text()').extract())
            if not product_info['dealer']:
                product_info['dealer'] = ''.join(hxs.select('//div[@id="merchant-info"]/a[contains(@href, "seller")]/text()').extract())


        # Backfill search bookkeeping so match() below has something to
        # compare against (the BSM-spider fix mentioned above).
        if not response.meta.get('search_item'):
            response.meta['search_item'] = product_info
        if not self.current_search_item:
            self.current_search_item = product_info

        # Propagate the seller id carried over from the MBC/seller listing.
        if response.meta.get('seller_identifier', None) and not product_info.get('seller_identifier', None):
            product_info['seller_identifier'] = response.meta['seller_identifier']

        check_match = response.meta.get('check_match', True)

        # Only 'search' spiders validate the product; 'category' and 'asins'
        # accept everything. Any other spider type is a configuration error.
        if self.type == 'search':
            match = self.match(response.meta, self.current_search_item, product_info)
        elif self.type == 'category':
            match = True
        elif self.type == 'asins':
            match = True
        else:
            raise CloseSpider("Wrong spider type: %s" % self.type)

        if check_match and not match:
            self.log("[AMAZON] WARNING: product does not match: %s" % response.url)
            return

        # Product options (variants): crawl each option page exactly once
        # ('parse_options'=False in meta stops recursion); otherwise fold the
        # option texts into the product name.
        if self.parse_options:
            if product_info['options'] and response.meta.get('parse_options', True):
                self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)

                for option in product_info['options']:
                    new_meta = response.meta.copy()
                    new_meta.update({
                        'parse_options': False,
                        'search_string': self.current_search,
                        'search_item': self.current_search_item,
                        'check_match': check_match
                    })
                    yield Request(
                        option['url'],
                        self.parse_product,
                        meta=new_meta
                    )
                return
            else:
                if product_info['option_texts']:
                    product_info['name'] += ' [' + ', '.join(product_info['option_texts']) + ']'

        # For ASIN-driven crawls, verify the page really is the requested
        # ASIN (the url may resolve to a different product).
        if self.type == 'asins':
            url_asin = AmazonUrlCreator.get_product_asin_from_url(product_info['url'])
            if product_info['asin'].lower() != url_asin.lower():
                self.log("[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s" %
                         (product_info['asin'], url_asin, response.url))
                return

        # Amazon Direct
        if self.amazon_direct:
            # Either follow the reviews page first, or collect immediately.
            if self.collect_reviews and product_info.get('reviews_url'):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(
                    product_info['reviews_url'],
                    callback=self.parse_reviews,
                    meta=new_meta
                )
            else:
                product = self.construct_product(product_info, meta=response.meta)
                self.log("[AMAZON] collect parse product: %s" % product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_amazon_direct(product, response.meta)
        # Buy Box
        elif self.only_buybox:
            # Collect only when the buy-box vendor is allowed (or when
            # configured to keep products without a dealer).
            if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get('reviews_url'):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string': response.meta['search_string'],
                            'search_item': self.current_search_item,
                        })
                    yield Request(
                        product_info['reviews_url'],
                        callback=self.parse_reviews,
                        meta=new_meta
                    )
                else:
                    product = self.construct_product(product_info, meta=response.meta)
                    self.log("[AMAZON] collect parse product: %s" % product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect_buybox(product, response.meta)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" % response.url)
        # all sellers / lowest price
        elif self.all_sellers or self.lowest_product_and_seller:
            # Go to MBC lists to get dealers prices
            # ('collect_mbc' in meta suppresses this on follow-up pages).
            collect_mbc = response.meta.get('collect_mbc', True)
            if collect_mbc and product_info.get('mbc_list_url_new') and self.collect_new_products:
                # yield mbc parse
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(
                    product_info['mbc_list_url_new'],
                    callback=self.parse_mbc_list,
                    meta=new_meta
                )
            elif collect_mbc and product_info.get('mbc_list_url_used') and self.collect_used_products:
                # yield mbc parse
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(
                    product_info['mbc_list_url_used'],
                    callback=self.parse_mbc_list,
                    meta=new_meta
                )
            else:
                # No MBC list to follow: same vendor-gated collect as above.
                if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                        self.collect_products_with_no_dealer:
                    if self.collect_reviews and product_info.get('reviews_url'):
                        new_meta = response.meta.copy()
                        new_meta['found_item'] = product_info
                        if self.type == 'search':
                            new_meta.update({
                                'search_string': response.meta['search_string'],
                                'search_item': self.current_search_item,
                            })
                        yield Request(
                            product_info['reviews_url'],
                            callback=self.parse_reviews,
                            meta=new_meta
                        )
                    else:
                        product = self.construct_product(product_info, meta=response.meta)
                        self.log("[AMAZON] collect parse product: %s" % product['identifier'])
                        if self.type == 'category':
                            yield product
                        else:
                            self._collect(product)
                elif not product_info['vendor']:
                    # TODO: collect vendor from vendor details page
                    self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
                else:
                    self.log("[AMAZON] WARNING: vendor not allowed: %s" % response.url)
    def __parse_product_list(self, response):
        """
        This function is callback for Scrapy. It processes search results page

        Splits over-large result sets into sub-requests, falls back to an
        EAN-based search when nothing is found, optionally follows Amazon's
        suggested products/searches, dispatches every found item to the
        per-mode item processor, and finally follows pagination.

        Generator: yields scrapy ``Request`` objects (and whatever the
        per-item processors yield).

        TODO: incorporate cache
        """
        # Antibot/captcha page: schedule a retry (when allowed) or just warn.
        if self.scraper.antibot_protection_raised(response.body_as_unicode()):
            if self.do_retry:
                yield self.retry_download(url=response.url,
                                          metadata=response.meta,
                                          callback=self.parse_product_list)
            else:
                self.log('[AMAZON] WARNING: Amazon antibot protection detected, consider using proxy/tor, url: %s' %
                         response.url)
        # NOTE(review): execution falls through and keeps parsing even when a
        # retry was scheduled above -- confirm this is intentional.

        follow_suggestions = response.meta.get("follow_suggestions", True)
        is_main_search = response.meta.get("is_main_search", True)

        data = self.scraper.scrape_search_results_page(response)

        # Too many results: split into narrower sub-searches instead of
        # paging through everything.
        if not self.check_number_of_results_fits(data):
            requests = self.get_subrequests_for_search_results(response, data)
            self.log("[AMAZON] WARNING: Number of results is too big (%d). Splitting to %d requests. URL: %s" %
                     (data['results_count'], len(requests), response.url))
            for req in requests:
                yield req
            return

        # Pick the item list: real results first, else Amazon's suggested
        # products (when enabled), else nothing.
        if data['products']:
            items = data['products']
            found_for = None
            if self.type == 'search':
                found_for = self.current_search
            elif self.type == 'category':
                found_for = self.current_category
            self.log('[AMAZON] Found products for [%s]' % found_for)

        elif data['suggested_products'] and self.try_suggested:
            items = data['suggested_products']
            self.log('[AMAZON] No products found for [%s]. Using suggested products. URL: %s' %
                     (self.current_search, response.url))
        else:
            items = []

        # Nothing found: retry the search once with the EAN mapped to the
        # current search term ('ean_search' in meta prevents looping).
        if not items and not response.meta.get('ean_search', False):
            # NOTE(review): raises KeyError if the search term is missing
            # from self.ean_list -- confirm every term has an EAN mapping.
            search_string = self.ean_list[self.current_search.replace('"', '')]
            url = AmazonUrlCreator.build_search_url(self.domain, search_string, self.amazon_direct)

            # Empty placeholder search item for the EAN retry.
            s_item = {
                    'sku': '',
                    'brand': '',
                    'name': '',
                    'category': '',
                    'price': 0,
                    }

            yield Request(url, meta={'search_string': search_string, 'search_item': s_item, 'ean_search':True},
                        dont_filter=True, callback=self.parse_product_list)

        if not data['products'] and follow_suggestions and self.try_suggested:
            self.log('[AMAZON] No products or suggested products found for [%s], trying suggested searches' % self.current_search)
            for url in data['suggested_search_urls']:
                # yield request, should mark that it's referred as suggested search and as so do not check other suggestions
                new_meta = response.meta.copy()
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                    'follow_suggestions': False,
                    'is_main_search': False,
                })
                yield Request(
                    url,
                    meta=new_meta,
                    dont_filter=True,
                    callback=self.parse_product_list
                )

        matched_any = False

        # Dispatch each item to the mode-specific processor. Each helper
        # yields its requests followed by a final boolean "matched" flag,
        # hence the results[:-1] / results[-1] split below.
        # Amazon Direct
        if self.amazon_direct and not self.only_buybox and not self.all_sellers and not self.lowest_product_and_seller:
            for item in items:
                results = list(self._process_product_list_item_amazon_direct(response, item))
                matched_any = results[-1]
                for req in results[:-1]:
                    yield req
        # Buy-Box
        elif self.only_buybox and not self.amazon_direct and not self.all_sellers and not self.lowest_product_and_seller:
            for item in items:
                results = list(self._process_product_list_item_buybox(response, item))
                matched_any = results[-1]
                for req in results[:-1]:
                    yield req
        # All sellers / lowest price dealer
        elif self.all_sellers or self.lowest_product_and_seller:
            for item in items:
                results = list(self._process_product_list_item_all_sellers(response, item))
                matched_any = results[-1]
                for req in results[:-1]:
                    yield req

        next_url = data['next_url']

        follow_next = False
        if self.type == 'category':
            follow_next = True
        elif self.type == 'search':
            # Follow to next pages only for original search
            # and suggested search if at least one product matched from first page
            # otherwise it tries to crawl the whole Amazon or something like that
            follow_next = (is_main_search or matched_any)
        if next_url and follow_next:
            # Respect the optional max_pages crawl limit.
            page = data.get('current_page', 1)
            page = int(page) if page is not None else 1
            if self.max_pages is None or page <= self.max_pages:
                new_meta = response.meta.copy()
                new_meta.update({
                    'follow_suggestions': False,
                    'is_main_search': is_main_search,
                    'current_page': page + 1
                })
                yield Request(
                    next_url,
                    meta=new_meta,
                    dont_filter=True,
                    callback=self.parse_product_list
                )
            else:
                self.log('[AMAZON] Max page limit %d reached. URL: %s' % (self.max_pages, response.url))
        elif next_url:
            self.log('[AMAZON] Not following next page from %s: %s' % (response.url, next_url))
        else:
            self.log('[AMAZON] No next url from %s' % response.url)
Ejemplo n.º 4
0
 def get_url_from_asin(self, asin):
     """Build the product page url for *asin* on this spider's domain."""
     return AmazonUrlCreator.build_url_from_asin(self.domain, asin)
Ejemplo n.º 5
0
                    item['price'] = item['price'][0]
                price = extract_price2uk(item['price']) if not isinstance(
                    item['price'], Decimal) else item['price']
            except Exception, e:
                self.log('ERROR: extracting price => PRICE: %s' %
                         repr(item['price']))
                raise e
        else:
            price = Decimal('0')
        price = self.transform_price(price)
        loader.add_value('price', price)

        if item.get('asin') and item.get('seller_identifier'):
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                    self.domain, item['asin'], item['seller_identifier']))
        elif item.get('asin'):
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin(self.domain,
                                                     item['asin']))
        elif self.use_amazon_identifier:
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin(self.domain,
                                                     item['identifier']))
        elif item.get('url'):
            loader.add_value('url', item['url'])

        # take sku from model if configured to do so
        if item.get('model') and self.model_as_sku:
Ejemplo n.º 6
0
    def scrape_mbc_list_page(self, response):
        """
        Scrape an Amazon "more buying choices" (olp offer listing) page.

        Returns a dict with keys ``next_url`` (absolute url of the next
        results page, or None), ``current_page`` (page-number text, or None)
        and ``products`` (one dict per offer), or None when the ASIN cannot
        be determined from the page.

        Raises AmazonScraperException when an offer has no price and no
        recognizable "add to basket to see price" text.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # The ASIN comes from the "product details" link; it may appear as
        # .../product/<asin>/... or .../dp/<asin>/...
        try:
            url = hxs.select('//a[@id="olpDetailPageLink"]/@href').extract()[0]
            url = urljoin(base_url, url)
            url_parts = url.split('/')
            try:
                asin = url_parts[url_parts.index('product') + 1]
            except ValueError:
                asin = url_parts[url_parts.index('dp') + 1]
        except IndexError:
            return None

        domain = AmazonUrlCreator.get_domain_from_url(response.url)

        # Page-level fields shared by every offer; hoisted out of the offer
        # loop since they come from the page header, not the offer rows.
        name = AmazonFilter.filter_name(' '.join(
            hxs.select(u'//div[@id="olpProductDetails"]/h1//text()').extract()).strip())

        brand_texts = hxs.select(
            u'//div[@id="olpProductByline"]/text()').extract()
        brand = AmazonFilter.filter_brand(brand_texts[0]) if brand_texts else None

        image_urls = hxs.select(
            u'//div[@id="olpProductImage"]//img/@src').extract()
        image_url = urljoin(base_url, image_urls[0]) if image_urls else None

        products = []
        for i, result in enumerate(
                hxs.select(
                    '//div[@id="olpOfferList"]//div[contains(@class, "olpOffer")]'
                ), 1):
            product = {}
            product['name'] = name
            if brand_texts:
                product['brand'] = brand

            # Price: explicit offer price, or the "add to basket to check
            # price" case (price stays None).
            price_el = result.select(
                './/span[contains(@class, "olpOfferPrice")]/text()')
            if not price_el:
                price_text = result.select(
                    './/div[p[contains(@class, "olpShippingInfo")]]/text()'
                ).extract()[0].strip()
                if 'basket' in price_text.lower():
                    product['price'] = None
                else:
                    raise AmazonScraperException(
                        "Couldn't extract price from element %d from url %s" %
                        (i, response.url))
            else:
                price = price_el.extract()[0].strip()
                product['price'] = self._extract_price(response.url, price)

            # Seller id: a ?seller= query param, a /shops/<id>/ path segment,
            # or (external sellers) a merchantID query param.
            seller_id = None
            seller_urls = result.select(
                u'.//*[contains(@class, "olpSellerName")]//a/@href').extract()
            if seller_urls:
                seller_url_ = seller_urls[0]
                if 'seller=' in seller_url_:
                    seller_id = url_query_parameter(seller_url_, 'seller')
                else:
                    seller_parts = seller_url_.split('/')
                    try:
                        seller_id = seller_parts[seller_parts.index('shops') + 1]
                    except (IndexError, ValueError):
                        # External website (link "Shop this website"?)
                        seller_id = url_query_parameter(
                            seller_url_, 'merchantID')

            product['identifier'] = asin
            product['asin'] = asin
            if seller_id:
                product['seller_identifier'] = seller_id
                product['url'] = AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                    domain, asin, seller_id)
                product['seller_url'] = AmazonUrlCreator.build_vendor_url(
                    domain, seller_id)
            else:
                product['seller_identifier'] = None
                product['url'] = AmazonUrlCreator.build_url_from_asin(
                    domain, asin)
                product['seller_url'] = None

            shipping = result.select(
                './/span[@class="olpShippingPrice"]/text()').extract()
            if shipping:
                product['shipping_cost'] = shipping[0]

            if image_urls:
                product['image_url'] = image_url

            # Vendor name: seller logo image title/alt first, then the
            # textual seller-name links.
            vendor = result.select(
                u'.//div[contains(@class, "olpSellerColumn")]//img/@title'
            ).extract()
            if not vendor:
                vendor = result.select(
                    u'.//div[contains(@class, "olpSellerColumn")]//img/@alt'
                ).extract()
            if not vendor:
                vendor = result.select(
                    u'.//*[contains(@class, "olpSellerName")]//a/b/text()'
                ).extract()
            if not vendor:
                vendor = result.select(
                    u'.//*[contains(@class, "olpSellerName")]//span/a/text()'
                ).extract()
            if vendor:
                vendor = vendor[0]
                if vendor.lower().startswith('amazon'):
                    vendor = 'Amazon'
                else:
                    vendor = 'AM - ' + vendor
                product['vendor'] = vendor
            elif not seller_id:
                # No vendor text and no seller id: the offer is Amazon's own.
                product['vendor'] = 'Amazon'
            else:
                product['vendor'] = None

            # "En Stock"/"En stock" in the delivery column marks availability
            # (FR/ES storefront wording).
            stock = result.select(
                './/div[contains(@class,"olpDeliveryColumn")]//text()').re(
                    'En Stock|En stock')
            if stock:
                product['unavailable'] = False

            products.append(product)

        next_url = hxs.select(
            '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').extract()
        next_url = urljoin(base_url, next_url[0]) if next_url else None

        current_page = hxs.select(
            '//ul[@class="a-pagination"]/li[@class="a-selected"]/a/text()'
        ).extract()
        current_page = current_page[0] if current_page else None

        return {
            'next_url': next_url,
            'current_page': current_page,
            'products': products
        }