def parse_page(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # categories
    cats = hxs.select("//ul[@id='nav']//a/@href").extract()
    for url in cats:
        url_ = add_or_replace_parameter(urljoin_rfc(base_url, url),
                                        'pagingSize', '10000')
        yield Request(url_, callback=self.parse_page)

    # next page
    url = hxs.select("//div[@class='pagerLine']//a[@class='next']"
                     "/@data-query").extract()
    if url:
        url_ = add_or_replace_parameter(urljoin_rfc(base_url, url[0]),
                                        'pagingSize', '10000')
        yield Request(url_, callback=self.parse_page)

    # products
    for z in hxs.select("//div[@class='products']//li"):
        loader = ProductLoader(selector=z, item=Product())
        loader.add_xpath('identifier', "@data-product-url", first,
                         re=r"articleNumber=(\d+)")
        loader.add_xpath('sku', "@data-product-url", first,
                         re=r"articleNumber=(\d+)")
        loader.add_value('url', urljoin_rfc(
            base_url, z.select("@data-product-url").extract()[0].strip()))
        loader.add_xpath('name', ".//*[contains(@class, 'name')]/text()")
        loader.add_xpath('brand', ".//*[contains(@class, 'brand')]/text()")
        # Prefer the discounted price, then the regular price, then the
        # crossed-out price.
        price = z.select(".//p[@class='price']/ins//text()") \
            or z.select(".//p[@class='price']//text()") \
            or z.select(".//p[@class='price']/del//text()")
        price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
        loader.add_value('price', price)
        yield loader.load_item()

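# `first` above is a loader processor imported elsewhere in the project; a
# minimal sketch of what it presumably does (the name comes from the source,
# the body is an assumption -- roughly equivalent to scrapy's TakeFirst):
def first(values):
    # Return the first non-empty extracted value.
    for value in values:
        if value is not None and value != '':
            return value
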
def parse_sellers(self, response):
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']
    products = []
    sellers = hxs.select(
        '//div[contains(@class, "merchant") and contains(@class, "product")]'
    )
    for seller in sellers:
        price = seller.select(
            './/span[@class="currentPrice"]/ins/text()').extract()[0]
        seller_name = seller.select(
            './/p[@class="soldby"]/strong//text()').extract()
        try:
            shipping_cost = seller.select(
                './/div[@class="productPrices"]//span/text()').re(
                    r'\+ ([\d,.]+)')[0]
        except IndexError:
            shipping_cost = '0,00'
        stock = seller.select(
            './/p[@class="availability"]/span[contains(@class, "available")]'
            '/i[@class="icon-ok"]').extract()

        l = ProductLoader(item=Product(), response=response)
        l.add_value('identifier',
                    product['identifier'] + '-' + seller_name[0])
        if self.use_main_id_as_sku:
            l.add_value('sku', product['identifier'])
        l.add_value('name', product['brand'] + ' ' + product['name'])
        l.add_value('category', product['category'])
        l.add_value('brand', product['brand'])
        l.add_value('url', product['url'])
        l.add_value('shipping_cost', self._encode_price(shipping_cost))
        l.add_value('price', self._encode_price(price))
        l.add_value('image_url', product['image_url'])
        l.add_value(
            'dealer',
            ('Pix - ' + seller_name[0]) if seller_name else 'Pixmania.com')
        if not stock:
            l.add_value('stock', 0)
        new_item = l.load_item()
        if 'metadata' in product:
            new_item['metadata'] = product['metadata'].copy()
        products.append(new_item)

    # Only fetch reviews if they are enabled.
    if self.collect_reviews:
        reviews_url = add_or_replace_parameter(
            self.reviews_url,
            'filter.q0', 'productid:eq:%s' % product['identifier'])
        reviews_url = add_or_replace_parameter(reviews_url, 'offset.q0', '0')
        yield Request(reviews_url, meta={'products': products},
                      callback=self.parse_reviews)
    else:
        for item in products:
            yield item

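# self._encode_price() is not shown in this snippet; a minimal sketch of what
# it is assumed to do -- normalising strings such as u'1\xa0234,56' before the
# loader parses them. The behaviour is an assumption, not the original:
def _encode_price(self, price):
    # Strip (non-breaking) spaces and turn the decimal comma into a dot.
    return price.replace(u'\xa0', '').replace(' ', '').replace(',', '.')
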
def parse_stock(self, response):
    data = json.loads(response.body)
    item = response.meta.get('item')
    options = [
        option for option in data['stocks'] if option['name'] == 'Colour'
    ]
    for option in options:
        p = copy.deepcopy(item)
        p['identifier'] += u'-{}'.format(option['sku'])
        p['sku'] = option['sku']
        if not option['inStock']:
            p['stock'] = 0
        p['name'] += ' {}'.format(option['value'])
        yield p

    if not options:
        size_opts = [
            option for option in data['stocks'] if option['name'] == 'Size'
        ]
        if size_opts and not response.meta.get('size_parsed'):
            # No colour options: re-query the stock endpoint once with the
            # first size attribute before giving up.
            size = size_opts[0]['value']
            stock_url = add_or_replace_parameter(
                self.stock_url.format(item['identifier']), 'attr', 'Size')
            stock_url = add_or_replace_parameter(stock_url, 'attrval', size)
            yield Request(stock_url,
                          meta={'item': item, 'size_parsed': True},
                          callback=self.parse_stock)
        else:
            yield item

def parse_options(self, response):
    product = response.meta['product']
    data = json.loads(response.body)
    total_attributes = int(data['total_attributes'])
    if total_attributes > 0:
        if total_attributes == 1:
            attribute = data['attributes'][0]
            attribute_id = str(attribute['id'])
            for value in attribute['values']:
                url = add_or_replace_parameter(
                    response.meta['url'],
                    'attributes[' + attribute_id + ']',
                    str(value['value_id']))
                yield Request(url, callback=self.parse_selection,
                              meta={'product': product})
        elif total_attributes == 2:
            attribute = data['attributes'][0]
            attribute_id = str(attribute['id'])
            for value in attribute['values']:
                url = add_or_replace_parameter(
                    response.meta['url'],
                    'attributes[' + attribute_id + ']',
                    str(value['value_id']))
                yield Request(url, callback=self.parse_options2,
                              meta={'product': product})
    else:
        yield product

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # categories
    for url in hxs.select('//div[@id="top-menu"]//a/@href').extract():
        url = urljoin_rfc(base_url, url)
        url = add_or_replace_parameter(url, 'pageSize', '96')
        yield Request(url)
    # sub-categories
    for url in hxs.select('//div[@class="category-title"]/a/@href').extract():
        url = urljoin_rfc(base_url, url)
        url = add_or_replace_parameter(url, 'pageSize', '96')
        yield Request(url)
    # pages
    for url in hxs.select('//div[@class="pager"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, url))
    # products
    products = [
        urljoin_rfc(base_url, url) for url in hxs.select(
            '//article[contains(@class, "product-grid-item")]'
            '//div[@class="product-name"]/a/@href').extract()
    ]
    for url in products:
        yield Request(url, callback=self.parse_product)

def parse_products_list(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    show_all = hxs.select('//form[@class="showall"]')
    if show_all:
        # Request the "show all" form target directly instead of paginating.
        show_all = show_all[0]
        url = show_all.select('./@action').extract()[0]
        id_category = show_all.select(
            './/input[@name="id_category"]/@value').extract()[0]
        n = show_all.select('.//input[@name="n"]/@value').extract()[0]
        url = add_or_replace_parameter(url, 'id_category', id_category)
        url = add_or_replace_parameter(url, 'n', n)
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_products_list,
                      cookies={}, meta={'dont_merge_cookies': True})
    else:
        urls = hxs.select('//*[@id="pagination"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_products_list,
                          cookies={}, meta={'dont_merge_cookies': True})
        urls = hxs.select(
            '//ul[@class="product_list grid row"]//a[@class="product-name"]/@href'
        ).extract()
        for url in urls:
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product,
                          cookies={}, meta={'dont_merge_cookies': True})

def parse_products_list(self, response):
    products = response.xpath('//div[contains(@class, "card--product")]')
    for product in products:
        presc = ' '.join(product.xpath(
            './/div[@class="links_widget"]/p/a/span/text()').extract())
        # Skip prescription-only listings.
        if any(p in presc for p in ('I Have a Private Prescription',
                                    'I Need a Private Prescription',
                                    'I Have an NHS Prescription')):
            continue
        loader = ProductLoader(item=Product(), selector=product)
        name = product.xpath('.//h2/a/text()').extract()[0]
        loader.add_value('name', name)
        url = product.xpath('.//h2/a/@href').extract()[0]
        loader.add_value('url', url)
        identifier = product.xpath(
            './/div/button/@data-product-id').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        price = product.xpath(
            './/span[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = product.xpath(
                './/span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        price = extract_price(price[0])
        loader.add_value('price', price)
        category = response.xpath(
            '//nav[@class="breadcrumb"]//li/span/text()').extract()
        category = category[-1] if category else ''
        loader.add_value('category', category)
        if price < 40:
            loader.add_value('shipping_cost', 3.19)
        image_url = product.xpath(
            './/img[contains(@id, "product-collection-image")]/@src'
        ).extract()
        image_url = response.urljoin(image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)
        yield loader.load_item()

    # Paginate until a page repeats the previous page's product URLs.
    url_list = products.xpath('.//h2/a/@href').extract()
    if products and url_list != response.meta.get('url_list', []):
        current_page = url_query_parameter(response.url, 'p', '1')
        next_url = add_or_replace_parameter(response.url,
                                            'infinitescroll', '1')
        next_url = add_or_replace_parameter(next_url, 'p',
                                            str(int(current_page) + 1))
        yield Request(next_url, callback=self.parse_products_list,
                      meta={'url_list': url_list})

def parse_categories_products(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # categories
    category_xpaths = (
        '//ul[@class="products-grid"]/li/a/@href',
        '//ul[@class="products-grid"]'
        '//a[@class="subcategory-thumbnails-list-element-link"]/@href',
        '//section//div[@class="editable-type"]/a/@href',
        '//div[contains(@class, "editable-size")]//a/@href',
    )
    for xpath in category_xpaths:
        for url in hxs.select(xpath).extract():
            yield Request(
                add_or_replace_parameter(urljoin_rfc(base_url, url),
                                         'limit', 'all'),
                callback=self.parse_categories_products)
    # products
    for url in hxs.select(
            '//h2[@class="product-name"]/a/@href|//a[@class="fmore"]/@href'
    ).extract():
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product)

def parse_subcats_full(self, response):
    if 'error.xhtml' in response.url:
        # The site intermittently redirects to an error page; retry the
        # original URLs (recorded by the redirect middleware) up to 10 times.
        retry_no = int(response.meta.get('retry_no', 0))
        if retry_no < 10:
            for url in response.meta['redirect_urls']:
                meta = response.meta.copy()
                meta['retry_no'] = retry_no + 1
                yield Request(url, dont_filter=True, meta=meta,
                              callback=self.parse_subcats_full)
        return

    hxs = HtmlXPathSelector(response)
    subcats = hxs.select('//div[@id="categories"]//a/@href').extract()
    for cat in subcats:
        url = urljoin_rfc(get_base_url(response), cat)
        if ('?' not in url) and (not url.endswith('/')):
            url += '/'
        url = add_or_replace_parameter(url, 'sort-by',
                                       'P_manufacturerPartNumber')
        url = add_or_replace_parameter(url, 'sort-order', 'asc')
        url = add_or_replace_parameter(url, 'view-type', 'List')
        url = add_or_replace_parameter(url, 'sort-option',
                                       'Manufacturers+Part+Number')
        yield Request(url, callback=self.parse_subcats_full)

    pages = hxs.select(
        '//div[@class="checkoutPaginationContent"]//noscript/a/@href'
    ).extract()
    for url in pages:
        yield Request(urljoin_rfc(get_base_url(response), url),
                      callback=self.parse_subcats_full)

    for product in self.parse_product_list(response):
        yield product

def parse(self, response): links = response.xpath('//a/@href[contains(., "replica")]').extract() for url in links: url = response.urljoin(url.strip()) url = add_or_replace_parameter(url, 'cur', 'USD') url = add_or_replace_parameter(url, 'p', '1') url = add_or_replace_parameter(url, 'pp', '96') yield Request(url, callback=self.parse_list)
def parse(self, response):
    url = add_or_replace_parameter(self.base_url, 'pageurl', response.url)
    url = add_or_replace_parameter(url, 'zone0', 'search')
    url = add_or_replace_parameter(url, 'searchoperator',
                                   response.meta['operator'])
    yield Request(url, meta={'url': response.url}, callback=self.parse_list)

def parse(self, response):
    categories = response.xpath('//div[@id="categoryFilterPanel"]//a')
    for cat in categories:
        cat_name = cat.xpath('span/text()').extract()[0]
        cat_id = cat.xpath('@data-bind').re(r'\d+')[0]
        cat_url = add_or_replace_parameter(response.url, 'cat', str(cat_id))
        cat_url = add_or_replace_parameter(cat_url, 'p', '1')
        yield Request(cat_url, callback=self.parse_category,
                      meta={'category': cat_name, 'page': 1})

def parse(self, response): links = response.xpath('//ul[@id="primary-nav"]/li[contains(a/text(), ' '"Replica")]//a/@href').extract() links += response.css('figure.box a::attr(href)').extract() for url in links: url = response.urljoin(url) url = add_or_replace_parameter(url, 'cur', 'GBP') url = add_or_replace_parameter(url, 'p', '1') url = add_or_replace_parameter(url, 'pp', '96') yield Request(url, callback=self.parse_list)
def parse_price_ranges(self, response):
    ranges = response.xpath(
        '//*[@id="idevfacet_SalePrice"]//a/@href').extract()
    for url in ranges:
        url = response.urljoin(url)
        url = add_or_replace_parameter(url, 'perpage', '300')
        url = add_or_replace_parameter(url, 'F_Sort', '1')
        yield Request(url, callback=self.parse_product_list,
                      meta={'dont_merge_cookies': True})

def parse_brands(self, response):
    base_url = get_base_url(response)
    brands = response.xpath(
        '//div[contains(., "Select a brand")]/select[@id]/option/@value'
    ).extract()
    for url in brands:
        url = urljoin_rfc(base_url, url)
        url = add_or_replace_parameter(url, 'items_per_page', '500')
        url = add_or_replace_parameter(url, 'show_out_of_stock', '1')
        url = add_or_replace_parameter(url, 'currency', 'GBP')
        yield Request(url, callback=self.parse_products)

def parse_product(self, response):
    item_id = response.xpath(
        '//*[@id="item_details_item_id"]/@value').extract_first()
    image_url = response.xpath('//*[@id="imageMain"]/@src').extract_first()
    category = response.xpath(
        '//div[@class="ws-breadcrumb shared-width"]//a/text()').extract()
    url = ('http://www.doorsworld.co/ajax.get_exact_product.php'
           '?instart_disable_injection=true')
    url = add_or_replace_parameter(url, 'item_id', item_id)
    meta = {'image_url': image_url, 'url': response.url,
            'category': category}

    options_containers = response.xpath(
        '//div[@class="option_container clearfix"]//select')
    if not options_containers:
        yield scrapy.Request(url, callback=self.parse_product_data,
                             meta=dict(meta))
        return

    # Collect (select id, option value) pairs per <select> element.
    combined_options = []
    for options_container in options_containers:
        option_id = options_container.xpath(
            './@id').extract_first().rsplit('_', 1)[0]
        element_options = []
        for option in options_container.xpath('./option[@value!=""]'):
            option_name = option.xpath('./@value').extract_first()
            element_options.append((option_id, option_name))
        combined_options.append(element_options)

    if len(options_containers) > 1:
        # Cross-product of all selects: one request per combination.
        combined_options = list(itertools.product(*combined_options))
        for combined_option in combined_options:
            option_url = url
            for option in combined_option:
                option_url = add_or_replace_parameter(
                    option_url, 'attributes[{}]'.format(option[0]),
                    option[1])
            # Fresh meta copy per request so callbacks don't share state.
            yield scrapy.Request(option_url,
                                 callback=self.parse_product_data,
                                 meta=dict(meta))
    else:
        for option in combined_options[0]:
            option_url = add_or_replace_parameter(
                url, 'attributes[{}]'.format(option[0]), option[1])
            yield scrapy.Request(option_url,
                                 callback=self.parse_product_data,
                                 meta=dict(meta))

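# For reference, a small standalone demo (with made-up ids) of how
# itertools.product above expands the per-<select> option lists into one
# request per attribute combination:
import itertools

colours = [('attr_colour', 'Red'), ('attr_colour', 'Blue')]
sizes = [('attr_size', 'S'), ('attr_size', 'M')]
for combo in itertools.product(colours, sizes):
    print(combo)
# (('attr_colour', 'Red'), ('attr_size', 'S'))
# (('attr_colour', 'Red'), ('attr_size', 'M'))
# (('attr_colour', 'Blue'), ('attr_size', 'S'))
# (('attr_colour', 'Blue'), ('attr_size', 'M'))
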
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    def listing_request(url):
        # Every listing request re-enters parse() and retries on download
        # errors via the same errback.
        return Request(url, callback=self.parse, meta=response.meta,
                       errback=lambda failure, url=url, meta=response.meta:
                           self.retry_download(failure, url, meta))

    # categories
    for url in hxs.select(
            '//div[@class="submenulist"]/div/a/@href').extract():
        url = urljoin_rfc(base_url, url)
        yield listing_request(
            add_or_replace_parameter(url, 'sort', 'Title-asc'))

    # subcategories (also raise the page size)
    for url in hxs.select('//li[@class="open "]/ul/li/a/@href').extract():
        url = urljoin_rfc(base_url, url)
        url = add_or_replace_parameter(url, 'sort', 'Title-asc')
        yield listing_request(
            add_or_replace_parameter(url, 'perPage', '100'))

    # More categories
    for url in hxs.select('//div[@class="arealist"]//li/a/@href').extract():
        url = urljoin_rfc(base_url, url)
        yield listing_request(
            add_or_replace_parameter(url, 'sort', 'Title-asc'))

    # products
    for product in self.parse_product_list(response):
        yield product

    next_page = hxs.select(
        '//div[@class="page-navigation"]/a[text()="suivant "]/@href'
    ).extract()
    if next_page:
        yield listing_request(urljoin_rfc(base_url, next_page[0]))

def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product = response.meta['product']
    stock = hxs.select('//span[@class="isinstock"]/text()').re(r"(\d+)")
    if stock:
        product['stock'] = int(stock[0])
    # Add the product to the Demandware mini-cart to read its shipping price.
    url = ('/on/demandware.store/Sites-alexanderandjamesEU-Site/de_DE'
           '/Cart-MiniAddProduct')
    url = urljoin_rfc(base_url, url)
    url = add_or_replace_parameter(url, 'pid', str(product['identifier']))
    url = add_or_replace_parameter(url, 'Quantity', '1')
    yield Request(url, dont_filter=True,
                  meta={'product': product,
                        'cookiejar': response.meta['cookiejar']},
                  callback=self.parse_shipping_price1)

def parse(self, response): hxs = HtmlXPathSelector(response) base_url = get_base_url(response) categories = hxs.select( "//div[@id='dj_equipment_landing_left']" "//div[@class='dj_equipment_navigation_contents']" "//div[@class='dj_equipment_navigation_records_text']" "/a/@href").extract() for url in categories: url = urljoin_rfc(base_url, url) url = urljoin_rfc(base_url, url) + '?items_per_page=500' url = add_or_replace_parameter(url, 'show_out_of_stock', '1') url = add_or_replace_parameter(url, 'currency', 'GBP') yield Request(url, callback=self.parse_products)
def parse_options(self, response):
    product = response.meta['product']
    options_found = 0
    try:
        ajax_url = response.meta['options_url']
        data = json.loads(response.body)
        # Cross-product of every enabled attribute's values, each value
        # tagged with its attribute id.
        options = iter_product(
            *(map(lambda d: dict(attr_id=attr['id'], **d), attr['values'])
              for attr in data.get('attributes', [])
              if not attr['disabled']))
        for options_selected in options:
            new_product = Product(product)
            for option in options_selected:
                options_found += 1
                opt_id = 'attributes[%s]' % option['attr_id']
                opt_value_id = option['value_id']
                # new_product['identifier'] += ':' + opt_value_id
                new_product['name'] += ' ' + option['value']
                ajax_url = add_or_replace_parameter(
                    ajax_url, opt_id, opt_value_id)
            meta = response.meta.copy()
            meta['product'] = new_product
            yield Request(ajax_url, callback=self.parse_options_prices,
                          meta=meta)
    except Exception as e:
        self.log('NO OPTIONS WARNING => %r' % e)
        yield product

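# A small standalone demo of what the generator expression above feeds into
# iter_product (assumed to be itertools.product): each attribute's values,
# tagged with their attribute id. The sample data is made up; the original
# relies on Python 2's eager map(), so a list comprehension is used here:
import itertools

attributes = [
    {'id': 7, 'disabled': False,
     'values': [{'value_id': '70', 'value': 'Red'},
                {'value_id': '71', 'value': 'Blue'}]},
    {'id': 9, 'disabled': False,
     'values': [{'value_id': '90', 'value': 'Small'}]},
]
tagged = [[dict(attr_id=attr['id'], **value) for value in attr['values']]
          for attr in attributes if not attr['disabled']]
for combo in itertools.product(*tagged):
    print(combo)
# ({'attr_id': 7, 'value_id': '70', 'value': 'Red'},
#  {'attr_id': 9, 'value_id': '90', 'value': 'Small'}) ...
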
def parse_subcategories(self, response):
    subcategories = response.xpath(
        "//ul[@class='grid-family-list']//div[@class='img-product']/a/@href"
    ).extract()
    if not subcategories:
        subcategories = response.xpath(
            "//ul[@class='grid-family-list']/li/a/@href").extract()
    for subcategory in subcategories:
        yield Request(response.urljoin(subcategory),
                      callback=self.parse_subcategories)

    next_page = response.xpath(
        "//li[@class='sprite bt-next']/a/@val").extract()
    if next_page:
        yield Request(
            add_or_replace_parameter(response.url, 'page', next_page[0]),
            callback=self.parse_subcategories)
        # The AJAX endpoint needs the numeric subcategory id from the URL.
        subcategory_id = (response.meta.get('subcategory_id')
                          or response.url.split('?')[0].split('-')[-1])
        yield FormRequest(
            url='http://www.monechelle.fr/catalog/category/ajaxproducts',
            formdata={'subcategoryid': subcategory_id,
                      'page': next_page[0]},
            meta={'subcategory_id': subcategory_id, 'dont_retry': True},
            callback=self.parse_subcategories)

    products = response.xpath(
        '//li[contains(@class, "product-card")]//a[@title]/@href').extract()
    for product_url in products:
        yield Request(response.urljoin(product_url),
                      callback=self.parse_product)

def parse(self, response): hxs = HtmlXPathSelector(response) base_url = get_base_url(response) categories = hxs.select('//ul[@class="catblocks"]//a/@href').extract() for url in categories: yield Request(urljoin_rfc(base_url, url)) if not categories: new_url = add_or_replace_parameter(response.url, 'limit', '25') new_url = add_or_replace_parameter(new_url, 'mode', 'list') self._current_cookie += 1 yield Request(new_url, dont_filter=True, callback=self.parse_list, meta={'cookiejar': self._current_cookie})
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    urls = [
        url for url in hxs.select('//*[@id="anav"]//li/a/@href').extract()
        if url not in ('http://www.wedo-beds.co.uk/checkout/cart/', )
    ]
    urls += [
        'http://www.wedo-beds.co.uk/beds/types.html',
        'http://www.wedo-beds.co.uk/beds/size.html',
        'http://www.wedo-beds.co.uk/mattresses/sizes.html',
        'http://www.wedo-beds.co.uk/mattresses/types.html',
        'http://www.wedo-beds.co.uk/mattresses/firmness.html',
        'http://www.wedo-beds.co.uk/mattresses/brands.html',
        'http://www.wedo-beds.co.uk/headboards/sizes.html',
        'http://www.wedo-beds.co.uk/headboards/material.html',
        'http://www.wedo-beds.co.uk/headboards/styles.html',
        'http://www.wedo-beds.co.uk/headboards/types.html',
    ]
    # menu
    for url in urls:
        yield Request(
            add_or_replace_parameter(urljoin_rfc(base_url, url),
                                     'limit', 'all'),
            self.parse_categories_products)

def parse(self, response):
    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    password = "******"
    username = "******"
    transport.connect(username=username, password=password)
    sftp = paramiko.SFTPClient.from_transport(transport)
    files = sftp.listdir_attr()
    last, format_ = get_last_file('BI BRA File', files)
    file_path = os.path.join(HERE, 'biw_products.csv')
    if format_ == 'csv':
        sftp.get(last.filename, file_path)
    else:
        # Excel upload: download it and convert to CSV first.
        file_path_excel = os.path.join(HERE, 'biw_products.xlsx')
        sftp.get(last.filename, file_path_excel)
        excel_to_csv(file_path_excel, file_path)

    with open(file_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            try:
                brand = unicode(row['BI Brand'], errors='ignore').strip()
                if brand not in self.brands:
                    self.brands.append(brand)
            except (KeyError, TypeError):
                pass

    url = 'http://busca.submarino.com.br/busca.php?results_per_page=90'
    for brand in self.brands:
        # Replacing q in place builds one search URL per brand.
        url = add_or_replace_parameter(url, 'q', brand)
        yield Request(url, callback=self.parse_products_list)

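# get_last_file() is a project helper that is not shown here; a minimal
# sketch of what it is assumed to do -- pick the newest upload whose name
# starts with the given prefix and report its extension. Name and behaviour
# are assumptions:
import os

def get_last_file(prefix, files):
    # `files` is the SFTPAttributes list from sftp.listdir_attr().
    matching = [f for f in files if f.filename.startswith(prefix)]
    last = max(matching, key=lambda f: f.st_mtime)
    format_ = os.path.splitext(last.filename)[1].lstrip('.').lower()
    return last, format_
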
def parse_products_list(self, response):
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    for link in response.xpath(
            '//ul[@class="nav-main-list-lvl-2"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_products_list)
    for link in response.xpath(
            '//div[@class="box-product-list-item-default js-product-link-parent"]/span/a/@href'
    ).extract():
        url = urljoin_rfc(base_url, link)
        url = add_or_replace_parameter(url, 'pitems', '50')
        yield Request(url, callback=self.parse_product)
    for link in response.xpath(
            '//div[@class="col-md-10 box-title-pager hidden-print"]'
            '//ul[@class="list-inline list-pager"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, link),
                      callback=self.parse_products_list)

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand_urls = hxs.select('//div[@id="rightcolumn"]//a/@href').extract()
    categories = hxs.select(
        '//*[@id="nav-container"]/ul/li/a/@href').extract()
    for category in categories:
        url = urljoin_rfc(base_url, category)
        yield Request(add_or_replace_parameter(url, 'curr', 'NOK'),
                      callback=self.parse_subcategories)
    for brand_url in brand_urls:
        yield Request(
            add_or_replace_parameter(urljoin_rfc(base_url, brand_url),
                                     'curr', 'NOK'),
            callback=self.parse_categories)

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    data = demjson.decode(response.body)
    product = None
    for product in data['itemList']:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        image_url = ('//d39rqydp4iuyht.cloudfront.net/store/product/image/{}.gif'
                     .format(product['id']))
        product_identifier = product['id']
        product_name = product['name']
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        product_loader.add_value('image_url', image_url)
        price = product['minPrice']
        # Use the longest number embedded in the name as the SKU.
        sku = ''
        for match in re.finditer(r"([\d,\.]+)", product_name):
            if len(match.group()) > len(sku):
                sku = match.group()
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', price)
        url = '/store/ck/item/' + str(product['id'])
        product_loader.add_value('url', urljoin_rfc(base_url, url))
        yield product_loader.load_item()

    # Request the next page while the last item's position is below the
    # reported total (default to page 1 if the parameter is missing).
    if product and product['dataPosition'] < data['numItems']:
        page = int(url_query_parameter(response.url, 'page', '1')) + 1
        url = add_or_replace_parameter(response.url, 'page', str(page))
        yield Request(url)

def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    try:
        products = hxs.select(
            '//div[@class="product_name"]/a/@href').extract()
        for product in products:
            yield Request(urljoin_rfc(base_url, product),
                          callback=self.parse_product, meta=response.meta)
        if len(products) >= 500:
            # Full page: there may be more, so move the window forward.
            index = int(url_query_parameter(response.url, 'beginIndex', 0))
            url = add_or_replace_parameter(response.url, 'beginIndex',
                                           str(index + 500))
            yield Request(url, callback=self.parse_products,
                          meta=response.meta)
    except Exception:
        log.msg('PAGE ERROR >>>')
        log.msg(str(response.body))
        retry = response.meta.get('retry', 0) + 1
        if retry <= 7:
            log.msg('Retry: ' + response.url)
            # Note: time.sleep() blocks the whole reactor; kept as-is from
            # the original backoff.
            time.sleep(5)
            yield Request(response.url, dont_filter=True,
                          callback=self.parse_products,
                          meta={'retry': retry})

def parse_result(self, response):
    base_url = get_base_url(response)
    data = json.loads(response.body)
    if data['currentpage'] < data['maxpages']:
        # Keep posting the same form with an incremented page number.
        params = response.meta['_params_'].copy()
        params['pageno'] = str(data['currentpage'] + 1)
        yield FormRequest(self.ajax_products_url, formdata=params,
                          headers=response.meta['_headers_'],
                          dont_filter=True,
                          meta={'_params_': params,
                                '_headers_': response.meta['_headers_'],
                                '_page_': params['pageno'],
                                'cookiejar': self.current_cookie},
                          callback=self.parse_result)
    else:
        self.search_finished = True

    products = data['data']
    for product in products:
        product_url = urljoin_rfc(base_url, product['url'])
        product_url = add_or_replace_parameter(product_url,
                                               'currency', 'GBP')
        yield Request(product_url, callback=self.parse_product)

def parse(self, response):
    data = json.loads(response.body)
    products = data['products']
    if products:
        page = int(data['p']) + 1
        yield Request(
            add_or_replace_parameter(response.url, 'p', str(page)))
    else:
        return

    for product in products:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', product['products_id'])
        loader.add_value('sku', product['products_id'])
        loader.add_value('name', product['products_name'])
        loader.add_value('name', product.get('products_model'))
        loader.add_value('price', product['products_price_float'])
        loader.add_value('url', response.urljoin(product['link']))
        loader.add_value('brand', product['manufacturers_name'])
        loader.add_value('image_url',
                         response.urljoin(product['products_image']))
        yield loader.load_item()

def test_add_or_replace_parameter(self):
    url = 'http://domain/test'
    self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                     'http://domain/test?arg=v')
    url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
    self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
    self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
    url = 'http://domain/test?arg1=v1'
    self.assertEqual(add_or_replace_parameter(url, 'arg2', 'v2', sep=';'),
                     'http://domain/test?arg1=v1;arg2=v2')
    self.assertEqual(
        add_or_replace_parameter('http://domain/moreInfo.asp?prodID=',
                                 'prodID', '20'),
        'http://domain/moreInfo.asp?prodID=20')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
    self.assertEqual(
        add_or_replace_parameter(url, 'BCat', 'newvalue',
                                 url_is_quoted=True),
        'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
    self.assertEqual(
        add_or_replace_parameter(url, 'BCat', 'newvalue'),
        'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')

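# The helper under test accepts extra `sep` and `url_is_quoted` keyword
# arguments, so it is a project-local variant rather than w3lib's
# add_or_replace_parameter. A minimal pure-Python sketch that satisfies the
# assertions above (an assumption, not the original implementation; the
# url_is_quoted flag is accepted but unused here, since plain
# split-on-separator replacement already leaves other quoted values intact):
from urlparse import urlparse, urlunparse  # urllib.parse on Python 3

def add_or_replace_parameter(url, name, new_value, sep='&',
                             url_is_quoted=False):
    parsed = urlparse(url)
    args = parsed.query.split(sep) if parsed.query else []
    new_args, replaced = [], False
    for arg in args:
        if arg.split('=', 1)[0] == name:
            # Replace the existing value, even an empty one ('prodID=').
            new_args.append('%s=%s' % (name, new_value))
            replaced = True
        else:
            new_args.append(arg)
    if not replaced:
        new_args.append('%s=%s' % (name, new_value))
    return urlunparse(parsed._replace(query=sep.join(new_args)))
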