Beispiel #1
0
    def parse_page(self, response):
        """Crawl a listing page: follow category and next-page links and
        yield one Product item per listing on the current page.

        Follow-up requests force ``pagingSize=10000`` so as few pages as
        possible need to be fetched.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Category links from the top navigation bar.
        cats = hxs.select("//ul[@id='nav']//a/@href").extract()
        for url in cats:
            url_ = add_or_replace_parameter(urljoin_rfc(base_url, url), 'pagingSize', '10000')
            yield Request(url_, callback=self.parse_page)

        # Next page: the pager stores the target query in @data-query.
        url = hxs.select("//div[@class='pagerLine']//a[@class='next']/@data-query").extract()
        if url:
            url_ = add_or_replace_parameter(urljoin_rfc(base_url, url[0]), 'pagingSize', '10000')
            yield Request(url_, callback=self.parse_page)

        # Products listed on this page.
        for z in hxs.select("//div[@class='products']//li"):
            loader = ProductLoader(selector=z, item=Product())
            loader.add_xpath('identifier', "@data-product-url", first, re=r"articleNumber=(\d+)")
            loader.add_xpath('sku', "@data-product-url", first, re=r"articleNumber=(\d+)")
            loader.add_value('url', urljoin_rfc(base_url, z.select("@data-product-url").extract()[0].strip()))
            loader.add_xpath('name', ".//*[contains(@class, 'name')]/text()")
            loader.add_xpath('brand', ".//*[contains(@class, 'brand')]/text()")
            # Prefer the discounted price (<ins>), then the plain price
            # text, then the struck-out price (<del>).
            price = z.select(".//p[@class='price']/ins//text()") \
                    or z.select(".//p[@class='price']//text()") \
                    or z.select(".//p[@class='price']/del//text()")
            price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
            loader.add_value('price', price)

            yield loader.load_item()
Beispiel #2
0
    def parse_sellers(self, response):
        """Build one Product item per marketplace seller on the page.

        Items are yielded directly, or — when review collection is
        enabled — attached to a single reviews request and yielded later
        by ``parse_reviews``.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']
        products = []  # Only used when reviews are enabled
        sellers = hxs.select(
            '//div[contains(@class, "merchant") and contains(@class, "product")]'
        )
        for seller in sellers:
            price = seller.select(
                './/span[@class="currentPrice"]/ins/text()').extract()[0]
            seller_name = seller.select(
                './/p[@class="soldby"]/strong//text()').extract()
            try:
                shipping_cost = seller.select(
                    './/div[@class="productPrices"]//span/text()').re(
                        r'\+ ([\d,.]+)')[0]
            except IndexError:
                # No "+ <amount>" note on this seller: shipping is free.
                # (Was a bare except; only the [0] lookup can fail here.)
                shipping_cost = '0,00'

            # Presence of the "available" tick icon marks in-stock.
            stock = seller.select(
                './/p[@class="availability"]/span[contains(@class, "available")]'
                '/i[@class="icon-ok"]').extract()

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier',
                             product['identifier'] + '-' + seller_name[0])
            if self.use_main_id_as_sku:
                loader.add_value('sku', product['identifier'])
            loader.add_value('name', product['brand'] + ' ' + product['name'])
            loader.add_value('category', product['category'])
            loader.add_value('brand', product['brand'])
            loader.add_value('url', product['url'])
            loader.add_value('shipping_cost', self._encode_price(shipping_cost))
            loader.add_value('price', self._encode_price(price))
            loader.add_value('image_url', product['image_url'])
            loader.add_value(
                'dealer',
                'Pix - ' + seller_name[0] if seller_name else 'Pixmania.com')
            if not stock:
                loader.add_value('stock', 0)

            new_item = loader.load_item()

            if 'metadata' in product:
                new_item['metadata'] = product['metadata'].copy()

            products.append(new_item)

        if self.collect_reviews:
            reviews_url = add_or_replace_parameter(
                self.reviews_url, 'filter.q0',
                'productid:eq:%s' % product['identifier'])
            reviews_url = add_or_replace_parameter(reviews_url, 'offset.q0',
                                                   '0')
            yield Request(reviews_url,
                          meta={'products': products},
                          callback=self.parse_reviews)
        else:
            for item in products:
                yield item
Beispiel #3
0
 def parse_stock(self, response):
     """Parse the stock/options JSON for an item.

     Yields one deep copy of the item per Colour option (identifier,
     sku, name and stock adjusted per option). When no Colour options
     exist, retries once against the Size-filtered stock URL before
     yielding the base item unchanged.
     """
     data = json.loads(response.body)
     item = response.meta.get('item')
     # Colour variants, if the feed exposes any.
     options = [
         option for option in data['stocks'] if option['name'] == 'Colour'
     ]
     for option in options:
         p = copy.deepcopy(item)
         p['identifier'] += u'-{}'.format(option['sku'])
         p['sku'] = option['sku']
         if not option['inStock']:
             p['stock'] = 0
         p['name'] += ' {}'.format(option['value'])
         yield p
     if not options:
         # No colours: fall back to a Size-based stock lookup. The
         # size_parsed flag guards against requesting more than once.
         size_opts = [
             option for option in data['stocks'] if option['name'] == 'Size'
         ]
         if size_opts and not response.meta.get('size_parsed'):
             size = size_opts[0]['value']
             stock_url = add_or_replace_parameter(
                 self.stock_url.format(item['identifier']), 'attr', 'Size')
             stock_url = add_or_replace_parameter(stock_url, 'attrval',
                                                  size)
             yield Request(stock_url,
                           meta={
                               'item': item,
                               'size_parsed': True
                           },
                           callback=self.parse_stock)
         else:
             yield item
Beispiel #4
0
 def parse_options(self, response):
     """Fan out one request per value of the product's first attribute.

     With exactly one attribute the selection is complete after this
     choice (``parse_selection``); with two, a second round is needed
     (``parse_options2``). With none, the base product is final.
     """
     product = response.meta['product']
     data = json.loads(response.body)
     total_attributes = int(data['total_attributes'])
     if total_attributes <= 0:
         # No options: yield the base product unchanged.
         yield product
         return
     if total_attributes in (1, 2):
         # The two original branches were identical except for the
         # callback, so they are unified here.
         callback = (self.parse_selection if total_attributes == 1
                     else self.parse_options2)
         attribute = data['attributes'][0]
         attribute_id = str(attribute['id'])
         for value in attribute['values']:
             url = add_or_replace_parameter(
                 response.meta['url'],
                 'attributes[' + attribute_id + ']',
                 str(value['value_id']))
             yield Request(url,
                           callback=callback,
                           meta={'product': product})
     # NOTE(review): more than two attributes yields nothing, exactly as
     # before — confirm that is intended and not an oversight.
Beispiel #5
0
    def parse(self, response):
        """Follow category, sub-category and pager links, then schedule
        every product on the page for detail parsing."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Categories and sub-categories both get a large page size.
        for xpath in ('//div[@id="top-menu"]//a/@href',
                      '//div[@class="category-title"]/a/@href'):
            for href in hxs.select(xpath).extract():
                link = add_or_replace_parameter(urljoin_rfc(base_url, href),
                                                'pageSize', '96')
                yield Request(link)

        # Pagination links are followed as-is.
        for href in hxs.select('//div[@class="pager"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, href))

        # Product detail pages.
        product_xpath = ('//article[contains(@class, "product-grid-item")]'
                         '//div[@class="product-name"]/a/@href')
        for href in hxs.select(product_xpath).extract():
            yield Request(urljoin_rfc(base_url, href),
                          callback=self.parse_product)
Beispiel #6
0
 def parse_products_list(self, response):
     """List-page handler: when the site offers a "show all" form, use
     it; otherwise walk the pagination and schedule every product.

     Every request resets cookies so each listing runs in a fresh
     session.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     def cookieless(url, callback):
         # All follow-ups share the same no-cookie request shape.
         return Request(urljoin_rfc(base_url, url),
                        callback=callback,
                        cookies={},
                        meta={'dont_merge_cookies': True})

     show_all = hxs.select('//form[@class="showall"]')
     if show_all:
         form = show_all[0]
         action = form.select('./@action').extract()[0]
         id_category = form.select(
             './/input[@name="id_category"]/@value').extract()[0]
         per_page = form.select('.//input[@name="n"]/@value').extract()[0]
         action = add_or_replace_parameter(action, 'id_category', id_category)
         action = add_or_replace_parameter(action, 'n', per_page)
         yield cookieless(action, self.parse_products_list)
     else:
         for href in hxs.select('//*[@id="pagination"]//a/@href').extract():
             yield cookieless(href, self.parse_products_list)
         product_links = hxs.select(
             '//ul[@class="product_list grid row"]//a[@class="product-name"]/@href'
         ).extract()
         for href in product_links:
             yield cookieless(href, self.parse_product)
Beispiel #7
0
    def parse_products_list(self, response):
        """Load one Product per listing card, skipping prescription-only
        entries, then follow the infinite-scroll pagination until a page
        repeats the previous page's product URLs."""
        products = response.xpath('//div[contains(@class, "card--product")]')
        # The breadcrumb is page-level: compute the category once instead
        # of once per product (it was loop-invariant).
        category = response.xpath('//nav[@class="breadcrumb"]//li/span/text()').extract()
        category = category[-1] if category else ''
        for product in products:
            presc = ' '.join(product.xpath('.//div[@class="links_widget"]/p/a/span/text()').extract())
            if 'I Have a Private Prescription' in presc or 'I Need a Private Prescription' in presc or 'I Have an NHS Prescription' in presc:
                continue
            loader = ProductLoader(item=Product(), selector=product)
            name = product.xpath('.//h2/a/text()').extract()[0]
            loader.add_value('name', name)
            url = product.xpath('.//h2/a/@href').extract()[0]
            loader.add_value('url', url)
            identifier = product.xpath('.//div/button/@data-product-id').extract()[0]
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            # Discounted ("special") price wins over the regular price.
            price = product.xpath('.//span[@class="special-price"]/span[@class="price"]/text()').extract()
            if not price:
                price = product.xpath('.//span[@class="regular-price"]/span[@class="price"]/text()').extract()
            price = extract_price(price[0])
            loader.add_value('price', price)
            loader.add_value('category', category)
            if price < 40:
                # Orders under 40 carry a delivery charge.
                loader.add_value('shipping_cost', 3.19)
            image_url = product.xpath('.//img[contains(@id, "product-collection-image")]/@src').extract()
            image_url = response.urljoin(image_url[0]) if image_url else ''
            loader.add_value('image_url', image_url)
            yield loader.load_item()

        # Infinite scroll: stop when the set of product URLs matches the
        # previous page's (the site repeats the last page forever).
        url_list = products.xpath('.//h2/a/@href').extract()
        if products and url_list != response.meta.get('url_list', []):
            current_page = url_query_parameter(response.url, 'p', '1')
            next_url = add_or_replace_parameter(response.url, 'infinitescroll', '1')
            next_url = add_or_replace_parameter(next_url, 'p', str(int(current_page) + 1))
            yield Request(next_url, callback=self.parse_products_list, meta={'url_list': url_list})
Beispiel #8
0
 def parse_categories_products(self, response):
     """Recurse through (sub)category links — forcing ``limit=all`` so
     each listing is a single page — then schedule product pages.

     The four category selectors below previously had four copy-pasted
     loops; they are folded into one.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     # Every link group here is a (sub)category re-crawled by this same
     # callback, in the original order.
     category_xpaths = (
         '//ul[@class="products-grid"]/li/a/@href',
         '//ul[@class="products-grid"]//a[@class="subcategory-thumbnails-list-element-link"]/@href',
         '//section//div[@class="editable-type"]/a/@href',
         '//div[contains(@class, "editable-size")]//a/@href',
     )
     for xpath in category_xpaths:
         for url in hxs.select(xpath).extract():
             yield Request(add_or_replace_parameter(
                 urljoin_rfc(base_url, url), 'limit', 'all'),
                           callback=self.parse_categories_products)
     # products
     for url in hxs.select(
             '//h2[@class="product-name"]/a/@href|//a[@class="fmore"]/@href'
     ).extract():
         yield Request(urljoin_rfc(base_url, url),
                       callback=self.parse_product)
Beispiel #9
0
    def parse_subcats_full(self, response):
        """Retry error pages (up to 10 attempts), then follow subcategory
        and pagination links and delegate product extraction."""
        if 'error.xhtml' in response.url:
            retry_no = int(response.meta.get('retry_no', 0))
            if retry_no < 10:
                for url in response.meta['redirect_urls']:
                    meta = response.meta.copy()
                    meta['retry_no'] = retry_no + 1
                    yield Request(url, dont_filter=True, meta=meta,
                                  callback=self.parse_subcats_full)
                return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Force a deterministic, part-number-sorted list view.
        listing_params = (
            ('sort-by', 'P_manufacturerPartNumber'),
            ('sort-order', 'asc'),
            ('view-type', 'List'),
            ('sort-option', 'Manufacturers+Part+Number'),
        )
        for cat in hxs.select('//div[@id="categories"]//a/@href').extract():
            url = urljoin_rfc(base_url, cat)
            if '?' not in url and not url.endswith('/'):
                url += '/'
            for key, value in listing_params:
                url = add_or_replace_parameter(url, key, value)
            yield Request(url, callback=self.parse_subcats_full)

        pages = hxs.select('//div[@class="checkoutPaginationContent"]//noscript/a/@href').extract()
        for page_href in pages:
            yield Request(urljoin_rfc(base_url, page_href),
                          callback=self.parse_subcats_full)

        for product in self.parse_product_list(response):
            yield product
Beispiel #10
0
 def parse(self, response):
     """Follow every "replica" link, normalised to USD, page 1,
     96 items per page."""
     for href in response.xpath('//a/@href[contains(., "replica")]').extract():
         link = response.urljoin(href.strip())
         for key, value in (('cur', 'USD'), ('p', '1'), ('pp', '96')):
             link = add_or_replace_parameter(link, key, value)
         yield Request(link, callback=self.parse_list)
Beispiel #11
0
    def parse(self, response):
        """Build the search-zone URL for this page and hand it to the
        list parser, remembering the originating page URL in meta."""
        search_url = add_or_replace_parameter(self.base_url, 'pageurl',
                                              response.url)
        search_url = add_or_replace_parameter(search_url, 'zone0', 'search')
        search_url = add_or_replace_parameter(search_url, 'searchoperator',
                                              response.meta['operator'])

        yield Request(search_url,
                      meta={'url': response.url},
                      callback=self.parse_list)
Beispiel #12
0
 def parse(self, response):
     """Schedule one category crawl per filter-panel entry."""
     for anchor in response.xpath('//div[@id="categoryFilterPanel"]//a'):
         name = anchor.xpath('span/text()').extract()[0]
         # The numeric category id is embedded in the data-bind attribute.
         category_id = anchor.xpath('@data-bind').re(r'\d+')[0]
         url = add_or_replace_parameter(response.url, 'cat', str(category_id))
         url = add_or_replace_parameter(url, 'p', '1')
         yield Request(url, callback=self.parse_category,
                       meta={'category': name, 'page': 1})
Beispiel #13
0
 def parse(self, response):
     """Collect "Replica" menu links plus box-figure links and open each
     listing in GBP, 96 items per page, starting at page 1."""
     menu_links = response.xpath('//ul[@id="primary-nav"]/li[contains(a/text(), '
                                 '"Replica")]//a/@href').extract()
     box_links = response.css('figure.box a::attr(href)').extract()
     for href in menu_links + box_links:
         link = response.urljoin(href)
         for key, value in (('cur', 'GBP'), ('p', '1'), ('pp', '96')):
             link = add_or_replace_parameter(link, key, value)
         yield Request(link, callback=self.parse_list)
Beispiel #14
0
 def parse_price_ranges(self, response):
     """Open every sale-price facet as a 300-per-page, sorted listing."""
     for href in response.xpath(
             '//*[@id="idevfacet_SalePrice"]//a/@href').extract():
         link = add_or_replace_parameter(response.urljoin(href),
                                         'perpage', '300')
         link = add_or_replace_parameter(link, 'F_Sort', '1')
         yield Request(link,
                       callback=self.parse_product_list,
                       meta={'dont_merge_cookies': True})
Beispiel #15
0
 def parse_brands(self, response):
     """Open each brand's listing with 500 items per page, including
     out-of-stock products, priced in GBP.

     Fixes: the URL was joined against the base twice (the first join
     was dead), and ``items_per_page`` was appended with a raw ``'?'``
     concatenation that would corrupt URLs already carrying a query
     string — it now goes through add_or_replace_parameter like the
     other parameters.
     """
     base_url = get_base_url(response)
     brands = response.xpath(
         '//div[contains(., "Select a brand")]/select[@id]/option/@value'
     ).extract()
     for brand_href in brands:
         url = urljoin_rfc(base_url, brand_href)
         url = add_or_replace_parameter(url, 'items_per_page', '500')
         url = add_or_replace_parameter(url, 'show_out_of_stock', '1')
         url = add_or_replace_parameter(url, 'currency', 'GBP')
         yield Request(url, callback=self.parse_products)
Beispiel #16
0
 def parse_product(self, response):
     """Request the AJAX price/stock endpoint for this product — once
     per option combination when the page has option selects, otherwise
     once for the bare item.

     The single-select and multi-select branches previously duplicated
     the request construction; ``itertools.product`` covers both (one
     select yields 1-tuples), so there is now one code path.
     """
     item_id = response.xpath(
         '//*[@id="item_details_item_id"]/@value').extract_first()
     image_url = response.xpath('//*[@id="imageMain"]/@src').extract_first()
     category = response.xpath(
         '//div[@class="ws-breadcrumb shared-width"]//a/text()').extract()
     url = 'http://www.doorsworld.co/ajax.get_exact_product.php?instart_disable_injection=true'
     url = add_or_replace_parameter(url, 'item_id', item_id)
     meta = {
         'image_url': image_url,
         'url': response.url,
         'category': category
     }
     options_containers = response.xpath(
         '//div[@class="option_container clearfix"]//select')
     if not options_containers:
         yield scrapy.Request(url,
                              callback=self.parse_product_data,
                              meta=dict(meta))
         return
     combined_options = []
     for options_container in options_containers:
         # The select element's id minus its trailing suffix is the
         # attribute id expected by the AJAX endpoint; it is the same
         # for every option in the container, so compute it once.
         option_id = options_container.xpath(
             './@id').extract_first().rsplit('_', 1)[0]
         element_options = [
             (option_id, option.xpath('./@value').extract_first())
             for option in options_container.xpath('./option[@value!=""]')
         ]
         combined_options.append(element_options)
     for combined_option in itertools.product(*combined_options):
         option_url = url
         for opt_id, opt_value in combined_option:
             option_url = add_or_replace_parameter(
                 option_url, 'attributes[{}]'.format(opt_id), opt_value)
         # Fresh meta dict per request, as in the original.
         yield scrapy.Request(option_url,
                              callback=self.parse_product_data,
                              meta=dict(meta))
Beispiel #17
0
    def parse(self, response):
        """Crawl three category menu layouts, the page's product list,
        and the "suivant" (next) pagination link.

        Every follow-up request reuses this callback and carries an
        errback that re-schedules the download on failure; that
        boilerplate was repeated four times and is now built by one
        local helper.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def retry_request(url):
            # Defaults in the lambda bind url/meta at definition time.
            return Request(url,
                           callback=self.parse,
                           meta=response.meta,
                           errback=lambda failure, url=url, meta=response.meta:
                           self.retry_download(failure, url, meta))

        # (xpath, add perPage=100?) — subcategories also force paging.
        category_xpaths = (
            ('//div[@class="submenulist"]/div/a/@href', False),
            ('//li[@class="open "]/ul/li/a/@href', True),
            ('//div[@class="arealist"]//li/a/@href', False),
        )
        for xpath, paginate in category_xpaths:
            for href in hxs.select(xpath).extract():
                url = urljoin_rfc(base_url, href)
                url = add_or_replace_parameter(url, 'sort', 'Title-asc')
                if paginate:
                    url = add_or_replace_parameter(url, 'perPage', '100')
                yield retry_request(url)

        # products
        for product in self.parse_product_list(response):
            yield product

        next = hxs.select(
            '//div[@class="page-navigation"]/a[text()="suivant "]/@href'
        ).extract()
        if next:
            yield retry_request(urljoin_rfc(base_url, next[0]))
Beispiel #18
0
 def parse_product(self, response):
     """Record stock from the product page, then query the mini-cart
     endpoint for this product's shipping price."""
     product = response.meta['product']
     hxs = HtmlXPathSelector(response)
     stock_match = hxs.select('//span[@class="isinstock"]/text()').re(r"(\d+)")
     if stock_match:
         product['stock'] = int(stock_match[0])
     cart_path = '/on/demandware.store/Sites-alexanderandjamesEU-Site/de_DE/Cart-MiniAddProduct'
     cart_url = urljoin_rfc(get_base_url(response), cart_path)
     cart_url = add_or_replace_parameter(cart_url, 'pid', str(product['identifier']))
     cart_url = add_or_replace_parameter(cart_url, 'Quantity', '1')
     yield Request(cart_url,
                   dont_filter=True,
                   meta={'product': product,
                         'cookiejar': response.meta['cookiejar']},
                   callback=self.parse_shipping_price1)
Beispiel #19
0
    def parse(self, response):
        """Open every equipment category with 500 items per page,
        including out-of-stock products, priced in GBP.

        Fixes: the URL was joined against the base twice (the first join
        was dead), and ``items_per_page`` was appended with a raw ``'?'``
        concatenation that would corrupt URLs already carrying a query
        string — it now uses add_or_replace_parameter like the rest.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        categories = hxs.select(
            "//div[@id='dj_equipment_landing_left']"
            "//div[@class='dj_equipment_navigation_contents']"
            "//div[@class='dj_equipment_navigation_records_text']"
            "/a/@href").extract()

        for url in categories:
            url = urljoin_rfc(base_url, url)
            url = add_or_replace_parameter(url, 'items_per_page', '500')
            url = add_or_replace_parameter(url, 'show_out_of_stock', '1')
            url = add_or_replace_parameter(url, 'currency', 'GBP')
            yield Request(url, callback=self.parse_products)
Beispiel #20
0
    def parse_options(self, response):
        """Expand a product into one price request per option combination.

        The attribute JSON lists each enabled attribute with its values;
        the cartesian product of those values is requested against the
        AJAX price endpoint. On any failure the base product is yielded
        unchanged (deliberate best-effort behaviour).

        Fix: the Python-2-only ``except Exception, e`` syntax is replaced
        with ``except Exception as e`` (valid on Python 2.6+ and 3).
        """
        product = response.meta['product']
        options_found = 0
        try:
            ajax_url = response.meta['options_url']
            data = json.loads(response.body)
            # One iterable of value-dicts (each tagged with its attribute
            # id) per enabled attribute; iter_product combines them.
            options = iter_product(
                *(map(lambda d: dict(attr_id=attr['id'], **d), attr['values'])
                  for attr in data.get('attributes', [])
                  if not attr['disabled']))
            for options_selected in options:
                new_product = Product(product)
                for option in options_selected:
                    options_found += 1
                    opt_id = 'attributes[%s]' % option['attr_id']
                    opt_value_id = option['value_id']
                    new_product['name'] += ' ' + option['value']
                    ajax_url = add_or_replace_parameter(
                        ajax_url, opt_id, opt_value_id)
                meta = response.meta.copy()
                meta['product'] = new_product

                yield Request(ajax_url,
                              callback=self.parse_options_prices,
                              meta=meta)
        except Exception as e:
            # Deliberate catch-all: missing or malformed option data
            # falls back to the plain product rather than crashing.
            self.log('NO OPTIONS WARNING => %r' % e)
            yield product
Beispiel #21
0
    def parse_subcategories(self, response):
        """Recurse through subcategory links; on pages with a "next"
        pager, request the following page both as a plain page load and
        via the site's AJAX products endpoint, then schedule every
        product card for detail parsing.
        """
        subcategories = response.xpath("//ul[@class='grid-family-list']//div[@class='img-product']/a/@href").extract()
        if not subcategories:
            # Alternate listing layout without image wrappers.
            subcategories = response.xpath("//ul[@class='grid-family-list']/li/a/@href").extract()

        for subcategory in subcategories:
            yield Request(response.urljoin(subcategory),
                          callback=self.parse_subcategories)

        next_page = response.xpath("//li[@class='sprite bt-next']/a/@val").extract()
        if next_page:
            yield Request(add_or_replace_parameter(response.url, 'page', next_page[0]), callback=self.parse_subcategories)
            # The numeric subcategory id comes from meta (set on earlier
            # AJAX requests) or from the trailing dash-separated segment
            # of the URL path.
            subcategory_id = response.meta.get('subcategory_id') or response.url.split('?')[0].split('-')[-1]
            yield FormRequest(
                url='http://www.monechelle.fr/catalog/category/ajaxproducts',
                formdata={
                    'subcategoryid': subcategory_id,
                    'page': next_page[0]},
                meta={'subcategory_id': subcategory_id,
                      'dont_retry':True},
                callback=self.parse_subcategories
            )

        products = response.xpath('//li[contains(@class, "product-card")]//a[@title]/@href').extract()
        for product_url in products:
            yield Request(response.urljoin(product_url),
                          callback=self.parse_product)
Beispiel #22
0
    def parse(self, response):
        """Descend into category blocks; on a leaf page (no category
        links) switch to list view and start the product-list parse in
        a fresh cookie jar."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        category_links = hxs.select('//ul[@class="catblocks"]//a/@href').extract()
        for link in category_links:
            yield Request(urljoin_rfc(base_url, link))

        if category_links:
            return

        list_url = add_or_replace_parameter(response.url, 'limit', '25')
        list_url = add_or_replace_parameter(list_url, 'mode', 'list')
        # Each leaf listing gets its own cookie jar.
        self._current_cookie += 1
        yield Request(list_url,
                      dont_filter=True,
                      callback=self.parse_list,
                      meta={'cookiejar': self._current_cookie})
Beispiel #23
0
    def parse(self, response):
        """Queue the main navigation (minus the cart link) plus a fixed
        set of landing pages, each with ``limit=all`` so every listing
        arrives as a single page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        skip = ('http://www.wedo-beds.co.uk/checkout/cart/', )
        urls = [
            url for url in hxs.select('//*[@id="anav"]//li/a/@href').extract()
            if url not in skip
        ]
        urls += [
            'http://www.wedo-beds.co.uk/beds/types.html',
            'http://www.wedo-beds.co.uk/beds/size.html',
            'http://www.wedo-beds.co.uk/mattresses/sizes.html',
            'http://www.wedo-beds.co.uk/mattresses/types.html',
            'http://www.wedo-beds.co.uk/mattresses/firmness.html',
            'http://www.wedo-beds.co.uk/mattresses/brands.html',
            'http://www.wedo-beds.co.uk/headboards/sizes.html',
            'http://www.wedo-beds.co.uk/headboards/material.html',
            'http://www.wedo-beds.co.uk/headboards/styles.html',
            'http://www.wedo-beds.co.uk/headboards/types.html',
        ]

        # menu
        for url in urls:
            yield Request(
                add_or_replace_parameter(urljoin_rfc(base_url, url), 'limit',
                                         'all'),
                self.parse_categories_products)
Beispiel #24
0
    def parse(self, response):
        """Download the latest client product file over SFTP, collect the
        BI brand names from it, then search the site once per brand.

        Fixes: the paramiko transport was never closed (connection leak —
        now released in a ``finally``), and the bare ``except: pass``
        around the brand parsing is narrowed to the errors a missing or
        non-string 'BI Brand' column can actually raise.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            files = sftp.listdir_attr()

            last, format_ = get_last_file('BI BRA File', files)

            file_path = os.path.join(HERE, 'biw_products.csv')
            if format_ == 'csv':
                sftp.get(last.filename, file_path)
            else:
                # Excel uploads are converted to CSV before parsing.
                file_path_excel = os.path.join(HERE, 'biw_products.xlsx')
                sftp.get(last.filename, file_path_excel)
                excel_to_csv(file_path_excel, file_path)
        finally:
            # Closing the transport also tears down the SFTP channel.
            transport.close()

        with open(file_path) as f:
            reader = csv.DictReader(f, delimiter=',')
            for row in reader:
                try:
                    brand = unicode(row['BI Brand'], errors='ignore').strip()
                    if brand not in self.brands:
                        self.brands.append(brand)
                except (KeyError, TypeError):
                    # Row without a usable 'BI Brand' value: skip it.
                    pass

        url = 'http://busca.submarino.com.br/busca.php?results_per_page=90'
        for brand in self.brands:
            url = add_or_replace_parameter(url, 'q', brand)
            yield Request(url, callback=self.parse_products_list)
Beispiel #25
0
    def parse_products_list(self, response):
        """Walk level-2 navigation and pager links recursively and
        schedule each product page with 50 items per listing page.

        Fix: every loop joined the URL against the base twice
        (``urljoin_rfc(base_url, urljoin_rfc(base_url, link))``); the
        inner join already yields an absolute URL, so the outer one was
        a no-op and is removed.
        """
        if not isinstance(response, HtmlResponse):
            return

        base_url = get_base_url(response)

        # Sub-category links: recurse into this same callback.
        for link in response.xpath(
                '//ul[@class="nav-main-list-lvl-2"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, link),
                          callback=self.parse_products_list)

        # Product links: request 50 items per page on the target listing.
        for link in response.xpath(
                '//div[@class="box-product-list-item-default js-product-link-parent"]/span/a/@href'
        ).extract():
            url = urljoin_rfc(base_url, link)
            url = add_or_replace_parameter(url, 'pitems', '50')
            yield Request(url, callback=self.parse_product)

        # Pager links: recurse as well.
        for link in response.xpath(
                '//div[@class="col-md-10 box-title-pager hidden-print"]'
                '//ul[@class="list-inline list-pager"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, link),
                          callback=self.parse_products_list)
Beispiel #26
0
    def parse(self, response):
        """Open top-navigation categories and right-column brand pages,
        forcing NOK currency on every request."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for href in hxs.select(
                '//*[@id="nav-container"]/ul/li/a/@href').extract():
            category_url = add_or_replace_parameter(
                urljoin_rfc(base_url, href), 'curr', 'NOK')
            yield Request(category_url, callback=self.parse_subcategories)

        for href in hxs.select('//div[@id="rightcolumn"]//a/@href').extract():
            brand_url = add_or_replace_parameter(
                urljoin_rfc(base_url, href), 'curr', 'NOK')
            yield Request(brand_url, callback=self.parse_categories)
Beispiel #27
0
    def parse(self, response):
        """Parse a JSON product-list response.

        Yields one product item per entry in ``itemList`` and requests the
        next page while the last item's position is below the total count.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        data = demjson.decode(response.body)

        product = None
        for product in data['itemList']:
            product_loader = ProductLoader(item=Product(), selector=hxs)
            # Protocol-relative CDN image URL derived from the product id.
            image_url = '//d39rqydp4iuyht.cloudfront.net/store/product/image/{}.gif'.format(product['id'])
            product_identifier = product['id']
            product_name = product['name']
            product_loader.add_value('identifier', product_identifier)
            product_loader.add_value('name', product_name)
            product_loader.add_value('image_url', image_url)
            price = product['minPrice']
            # SKU heuristic: the longest numeric-looking run in the name.
            sku = ''
            for match in re.finditer(r"([\d,\.]+)", product_name):
                if len(match.group()) > len(sku):
                    sku = match.group()
            product_loader.add_value('sku', sku)
            product_loader.add_value('price', price)
            url = '/store/ck/item/' + str(product['id'])
            product_loader.add_value('url', urljoin_rfc(base_url, url))
            yield product_loader.load_item()

        # Paginate while the last product's position is below the total.
        if product and product['dataPosition'] < data['numItems']:
            # Default to page 1 when the URL carries no 'page' parameter;
            # the original crashed with TypeError on int(None) in that case.
            # TODO confirm the first page is indexed 1, not 0.
            page = int(url_query_parameter(response.url, 'page') or 1) + 1
            url = add_or_replace_parameter(response.url, 'page', str(page))
            yield Request(url)
Beispiel #28
0
    def parse_products(self, response):
        """Parse a paginated product list.

        Schedules a parse_product request per product link, advances the
        ``beginIndex`` pagination cursor in 500-item steps, and retries the
        whole page (up to 7 times) if extraction blows up.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        try:
            products = hxs.select(
                '//div[@class="product_name"]/a/@href').extract()
            for product in products:
                # Reuse the base_url hoisted above instead of re-deriving it.
                yield Request(urljoin_rfc(base_url, product),
                              callback=self.parse_product,
                              meta=response.meta)

            # A full page (>= 500 items) means more pages may follow.
            if len(products) >= 500:
                index = int(url_query_parameter(response.url, 'beginIndex', 0))
                url = add_or_replace_parameter(response.url, 'beginIndex',
                                               str(index + 500))
                yield Request(url,
                              callback=self.parse_products,
                              meta=response.meta)

        # Was a bare `except:`, which inside a generator also swallows
        # GeneratorExit/KeyboardInterrupt and could schedule a spurious
        # retry when the generator is closed. Exception is the widest safe net.
        except Exception:
            log.msg('PAGE ERROR >>>')
            log.msg(str(response.body))
            retry = response.meta.get('retry', 0) + 1
            if retry <= 7:
                log.msg('Retry: ' + response.url)
                # NOTE(review): time.sleep blocks the whole Twisted reactor;
                # a download-delay / retry middleware would be cleaner.
                time.sleep(5)
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_products,
                              meta={'retry': retry})
Beispiel #29
0
    def parse_result(self, response):
        """Handle one page of AJAX search results.

        Re-posts the search form for the next page while pages remain
        (marking the search finished otherwise), then schedules a
        parse_product request per product with the currency forced to GBP.
        """
        base_url = get_base_url(response)

        data = json.loads(response.body)

        if data['currentpage'] < data['maxpages']:
            params = response.meta['_params_'].copy()
            params['pageno'] = str(data['currentpage'] + 1)
            yield FormRequest(self.ajax_products_url,
                              formdata=params,
                              headers=response.meta['_headers_'],
                              dont_filter=True,
                              meta={'_params_': params,
                                    '_headers_': response.meta['_headers_'],
                                    '_page_': params['pageno'],
                                    'cookiejar': self.current_cookie},
                              callback=self.parse_result)
        else:
            self.search_finished = True

        products = data['data']
        for product in products:
            # Join once: the original re-joined the already absolute
            # product_url against base_url a second time (a no-op).
            product_url = urljoin_rfc(base_url, product['url'])
            product_url = add_or_replace_parameter(product_url, 'currency', 'GBP')
            yield Request(product_url, callback=self.parse_product)
Beispiel #30
0
    def parse(self, response):
        """Walk a JSON product feed page by page.

        Stops when the ``products`` array comes back empty; otherwise
        requests the next page (parameter ``p``) and yields one loaded
        Product item per entry on the current page.
        """
        base_url = get_base_url(response)

        payload = json.loads(response.body)
        products = payload['products']

        # Empty page => the feed is exhausted.
        if not products:
            return

        # Schedule the following page before emitting this page's items.
        next_page = int(payload['p']) + 1
        yield Request(
            add_or_replace_parameter(response.url, 'p', str(next_page)))

        for entry in products:
            item_loader = ProductLoader(item=Product(), response=response)
            item_loader.add_value('identifier', entry['products_id'])
            item_loader.add_value('sku', entry['products_id'])
            item_loader.add_value('name', entry['products_name'])
            item_loader.add_value('name', entry.get('products_model'))
            item_loader.add_value('price', entry['products_price_float'])
            item_loader.add_value('url', response.urljoin(entry['link']))
            item_loader.add_value('brand', entry['manufacturers_name'])
            item_loader.add_value('image_url',
                                  response.urljoin(entry['products_image']))
            yield item_loader.load_item()
Beispiel #31
0
 def test_add_or_replace_parameter(self):
     """add_or_replace_parameter: append when absent, replace when present,
     honour a custom separator and percent-quoted existing values."""
     # Absent parameter is appended.
     self.assertEqual(add_or_replace_parameter('http://domain/test', 'arg', 'v'),
                      'http://domain/test?arg=v')
     # Multi-parameter URL: append a new one, replace an existing one.
     multi = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
     self.assertEqual(add_or_replace_parameter(multi, 'arg4', 'v4'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
     self.assertEqual(add_or_replace_parameter(multi, 'arg3', 'nv3'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
     # Custom pair separator.
     self.assertEqual(
         add_or_replace_parameter('http://domain/test?arg1=v1', 'arg2', 'v2',
                                  sep=';'),
         'http://domain/test?arg1=v1;arg2=v2')
     # An existing empty value is replaced, not duplicated.
     self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
                      'http://domain/moreInfo.asp?prodID=20')
     # Percent-quoted existing value, with and without the quoted flag.
     quoted = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
     self.assertEqual(add_or_replace_parameter(quoted, 'BCat', 'newvalue',
                                               url_is_quoted=True),
                      'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
     unquoted = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
     self.assertEqual(add_or_replace_parameter(unquoted, 'BCat', 'newvalue'),
                      'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')