def parse_product(self, response):
    """Scrape one product detail page and yield the populated item."""
    product_name = response.xpath("//h1[@itemprop='name']/text()").extract()[0]
    prices = response.css('span.ProductPrice::text').extract()
    categories = response.xpath(
        "//div[@id='ProductBreadcrumb']/ul/li//text()").extract()[1:-1]
    brand = ''.join(
        response.xpath("//*[@itemprop='brand']//text()").extract()).strip()
    if not brand:
        raise ValueError('brand')
    sku = response.xpath("//*[@itemprop='sku']/text()").extract()[0].strip()
    identifier = response.xpath(
        "//input[@name='product_id']/@value").extract()[0]
    image_url = response.xpath("//*[@itemprop='image']/@src").extract()[0]

    loader = ProductLoaderWithNameStrip(Product(), response=response)
    # Populate in the original field order; the last scraped price wins.
    for field, value in (
            ('name', product_name),
            ('price', prices.pop()),
            ('url', response.url),
            ('brand', brand),
            ('sku', sku),
            ('identifier', identifier),
            ('image_url', image_url),
            ('category', categories),
            ('shipping_cost', 5)):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_product(self, response): soup = BeautifulSoup(response.body) # product list page products = soup.findAll('a', {'class': 'products-list__item'}) if products: for r in self.parse_category(response): yield r return # discontinued product discontinued = response.xpath( "//div[contains(@class, 'discontinued')]") if not discontinued: discontinued = 'Discontinued Product' in response.body if discontinued: return name = response.xpath("//h1[@itemprop='name']/text()").extract() if not name: name = soup.find('h1', {'itemprop': 'name'}).text price = re.findall( '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",', response.body_as_unicode())[0] stock = None brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re( 'by (.*)') if not brand: brand = soup.find('span', { 'itemprop': 'manufacturer' }).text.split('by ')[-1].strip() sku = re.search('"sku":"([^"]*)","product_id"', response.body_as_unicode()).group(1) identifier = re.search('"product_id":"([^"]*)"', response.body_as_unicode()).group(1) image_url = response.xpath("//img[@class='prod-image']/@src").extract() if not image_url: image_url = soup.find('img', {'itemprop': 'image'})['src'] cats = [] for el in response.xpath("//ul[@class='gl3-breadcrumbs']/li")[1:-1]: cats.append(''.join(el.xpath('.//text()').extract()).strip()) shipping_cost = '2.98' if float(price) < 49 else '0' loader = ProductLoaderWithNameStrip(Product(), response=response) loader.add_value('name', name) loader.add_value('price', price) loader.add_value('stock', stock) loader.add_value('url', response.url) loader.add_value('brand', brand) loader.add_value('sku', sku) loader.add_value('identifier', identifier) loader.add_value('image_url', image_url) loader.add_value('category', cats) loader.add_value('shipping_cost', shipping_cost) yield loader.load_item()
def parse_product(self, response):
    """Scrape one product page and yield the populated item.

    Raises:
        KeyError: when the identifier cannot be derived from the URL.
    """
    # Primary selector first, legacy table layout as fallback.
    title = (response.xpath("//h2/span[@itemprop='name']/text()").extract()
             or response.xpath("//table//tr/td//h2/text()").extract())[0]
    price = (response.xpath("//span[@itemprop='price']/text()").re('[\d\.]+')
             or response.xpath(
                 "//span[@class='pr-price']/strong/text()").re('[\d\.]+'))[0]
    availability = response.xpath(
        "//*[@itemprop='availability']/@href").extract()
    # Only an explicit non-InStock availability marks the item out of stock.
    if availability and 'InStock' not in availability[0]:
        stock = 0
    else:
        stock = None
    categories = response.xpath(
        "//div[@class='grid_10']/h1/a/text()").extract()
    brand = categories[-1]
    image_url = response.xpath(
        "//img[@alt='{}']/@src".format(title)).extract()
    # Identifier comes from the pretty URL when present, otherwise from
    # the entryid/priceid query parameters.
    match = re.search("details(.*)\.html", response.url)
    if match:
        identifier = match.group(1)
    else:
        entry_id = url_query_parameter(response.url, 'entryid')
        price_id = url_query_parameter(response.url, 'priceid')
        if not entry_id or not price_id:
            raise KeyError("Not found entryid and priceid in url: {}".format(response.url))
        identifier = entry_id + price_id

    loader = ProductLoaderWithNameStrip(Product(), response=response)
    for field, value in (
            ('name', title),
            ('price', price),
            ('stock', stock),
            ('url', response.url),
            ('brand', brand),
            ('sku', identifier),
            ('identifier', identifier),
            ('image_url', image_url),
            ('category', categories)):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_product(self, response):
    """Parse a product detail page.

    Yields one item per table row (multi-variant tables) or one item per
    option combination (drop-down option products). When the product
    container is missing, retries the request or logs an error.

    The shipping-cost banding and the loader-population sequence were
    previously duplicated verbatim in both branches; they are factored
    into local helpers so the two branches cannot drift apart.
    """
    def shipping_for(real_price):
        # Banded shipping: <15 -> 3, <40 -> 4, <130 -> 7, else unknown.
        if real_price < 15:
            return 3
        elif real_price < 40:
            return 4
        elif real_price < 130:
            return 7
        return None

    def build_item(name, identifier, price, shipping_cost):
        # Assemble one Product item, mixing in the page-level fields.
        loader = ProductLoaderWithNameStrip(Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('price', price)
        for cat_name in cat_names:
            loader.add_value('category', cat_name)
        loader.add_value('shipping_cost', shipping_cost)
        loader.add_value('image_url', image_url)
        return loader.load_item()

    soup = BeautifulSoup(response.body)
    if not soup.find('div', attrs={'class': 'product'}):
        retry_request = _retry_page(response)
        if retry_request:
            yield retry_request
        else:
            self.log(
                "Error parsing page, couldn't extract product name: %s"
                % response.url)
        return

    main_name = remove_entities(
        soup.find('div', attrs={'class': 'product'}).h1.text)
    brand_el = soup.find(
        lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
    brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
    # Skip the first two breadcrumb entries.
    cat_names = [
        span.a.text
        for span in soup.find('div', attrs={
            'class': 'breadcrumbtrail'
        }).span.findAll('span') if span.a
    ][2:]
    image_url = soup.find('img', {'itemprop': 'image'})
    image_url = image_url['src'] if image_url else None

    table = soup.find('table', id='responsive-table')
    options = soup.findAll('div', attrs={'class': 'option'})
    if table:
        for row in table.findAll('tr'):
            # Skip head row
            if not row.td:
                continue
            name = remove_entities(
                row.find('span', attrs={'class': 'name'}).text)
            if not _main_name_in_opt_name(main_name, name):
                name = main_name + ' ' + name
            identifier = row.find('span', attrs={'class': 'codenumber'})
            if not identifier:
                self.errors.append(
                    "Identifier not found for products on page: %s"
                    % response.url)
                continue
            identifier = identifier.text
            price = row.find(_is_price_tag).text
            yield build_item(name, identifier, price,
                             shipping_for(extract_price(price)))
    elif options:
        # Product id is embedded in the URL (...p-<id>.<ext>).
        main_id = response.url.split('.')[-2].split('p-')[-1]
        price = soup.find('span', attrs={'class': 'inctax'}).span.text
        shipping_cost = shipping_for(extract_price(price))
        results = {}
        for opt in options:
            opt_name = opt.label.span.text
            results[opt_name] = []
            for subopt in opt.select.findAll('option'):
                subopt_name = subopt.text
                subopt_value = _soup_el_get_attr(subopt, 'value')
                # value == '0' is the placeholder entry; skip it.
                if subopt_value == '0':
                    continue
                results[opt_name].append({
                    'id': remove_entities(subopt_name).replace('"', ''),
                    'name': opt_name + ': ' + subopt_name,
                })
        # One item per combination across all option groups.
        for opt_tuple in product(*results.values()):
            name = _build_opt_name(main_name, opt_tuple)
            identifier = _build_opt_id(main_id, opt_tuple)
            yield build_item(name, identifier, price, shipping_cost)
def parse_product(self, response):
    """Parse a product page, yielding one item (or one per SKU-table row).

    Any IndexError during extraction triggers a retry of the same URL,
    up to 10 attempts, before the error is recorded and re-raised.
    """
    try:
        # fall back to Beautiful Soup
        soup = BeautifulSoup(response.body)
        hxs = HtmlXPathSelector(response)
        # The "nosto_product" div carries machine-readable product metadata.
        container = soup.find('div', attrs={'class': 'nosto_product'})
        brand = container.find('span', attrs={'class': 'brand'}).text
        # Drop the first breadcrumb link.
        cat_names = [el.text
                     for el in soup.find("div", id='bct').findAll('a')][1:]
        main_id = container.find('span', attrs={'class': 'product_id'}).text
        availability = container.find('span',
                                      attrs={'class': 'availability'}).text
        image_url = soup.find('img', id='main-image').attrMap['src']
        options = soup.find('table', id='sku-table')
        if not options:
            # Single-SKU product: one item keyed by the main product id.
            name = soup.find('div', id='product-page-info').find('h1').text
            price = container.find('span', attrs={'class': 'price'}).text
            loader = ProductLoaderWithNameStrip(Product(), selector=hxs)
            loader.add_value('brand', brand)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('name', name)
            loader.add_value('identifier', main_id)
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('sku', main_id)
            loader.add_value('image_url', image_url)
            if availability.lower() == 'outofstock':
                loader.add_value('stock', 0)
            yield loader.load_item()
        else:
            option_ids = []  # identifiers emitted so far, for de-duplication
            for opt in options.findAll('tr'):
                # Second cell holds the option name with its "(code)" suffix.
                sec_id = opt.findAll('td')[1].find('small').text
                name = opt.findAll('td')[1].text.replace(sec_id, '')
                sec_id = sec_id.strip('(').strip(')')
                identifier = main_id + ':' + sec_id
                # Disambiguate repeated codes by voltage, then pack size,
                # then (as a last resort) the row's hidden form ID.
                volts = get_volts_from_name(name)
                if volts is not None:
                    identifier = identifier + ':' + volts
                pack_of = get_pack_of_from_name(name)
                if pack_of is not None:
                    identifier = identifier + ':' + pack_of
                if identifier in option_ids:
                    option_id = opt.find('input',
                                         attrs={'name': 'ID'}).get('value')
                    identifier = identifier + ':' + option_id
                option_ids.append(identifier)
                # Strip the pound sign (both unicode and byte forms).
                price = opt.find('td', attrs={
                    'class': 'price'
                }).text.strip(u'\xa3').strip('£')
                loader = ProductLoaderWithNameStrip(Product(),
                                                    response=response)
                loader.add_value('brand', brand)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('name', name)
                loader.add_value('identifier', identifier)
                loader.add_value('price', price)
                loader.add_value('url', response.url)
                loader.add_value('sku', main_id)
                loader.add_value('image_url', image_url)
                if availability.lower() == 'outofstock':
                    loader.add_value('stock', 0)
                yield loader.load_item()
    except IndexError as e:
        # try loading page again
        tries = response.meta.get('try', 0)
        if tries < 10:
            yield Request(response.url, callback=self.parse_product,
                          dont_filter=True, meta={'try': tries + 1})
        else:
            self.errors.append("Error scraping page %s: %s"
                               % (response.url, str(e)))
            raise