def parse_products(self, response):
    """Yield one item per listing that has an entry in ``self.products_data``.

    Only the identifier and the price are scraped from each ``article``
    under ``div#productListings``; every other field, and the metadata, is
    copied from ``self.products_data[identifier]``. A listing is skipped when
    its identifier or price cannot be extracted, when no stored data exists
    for the identifier, or when ``is_product_correct`` rejects the item.
    """
    listings = response.xpath('//div[@id="productListings"]/article')
    self.log('{} products found'.format(len(listings)))

    copied_fields = ['identifier', 'name', 'url', 'image_url', 'category', 'brand']
    for listing in listings:
        try:
            identifier = listing.xpath(
                './/div[@class="productListingPrice"]/a/@href').re(r'/m.*p(\d+)/')[0]
            price = listing.xpath(
                './/section[@class="pricing"]/*/text()').re(r'[\d\.,]+')[0]
        except IndexError:
            # ``.re()`` returned an empty list: the listing has no usable
            # identifier or price. Only this case is skipped; any other
            # exception is a real error and is allowed to propagate.
            continue

        product_data = self.products_data.get(identifier)
        if not product_data:
            continue

        loader = ProductLoader(item=Product(), selector=listing)
        for field in copied_fields:
            loader.add_value(field, product_data.get(field) or '')
        loader.add_value('price', price)
        item = loader.load_item()

        metadata = MicheldeverMeta()
        for key in product_data['metadata']:
            metadata[key] = product_data['metadata'][key]
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)
        yield item
def parse_ipcode(self, response):
    """Read the IP code from the response and finish the pending product.

    The product is taken from ``response.meta['product']``. The code is
    cached in ``self.ip_codes`` (as ``''`` when absent), stored as the
    product's ``sku`` and passed to ``find_mts_stock_code``.
    """
    item = response.meta['product']
    sku_text = response.xpath('//div[@id="pdp_tb_info_sku"]/div[2]/text()')
    ip_code = sku_text.extract_first()

    # The cache always holds a string; ``sku`` keeps the raw value, which
    # may be None when the node is missing.
    self.ip_codes[item['identifier']] = ip_code or ''
    item['sku'] = ip_code

    stock_code = find_mts_stock_code(
        item, spider_name=self.name, log=self.log, ip_code=ip_code)
    item['metadata']['mts_stock_code'] = stock_code
    yield item
def parse_ipcode(self, response):
    """Read the ``IPC`` code from the product form and finish the product.

    The product is taken from ``response.meta['product']``. The code is
    cached in ``self.ip_codes`` (as ``''`` when absent), stored as the
    product's ``sku`` and passed to ``find_mts_stock_code``.
    """
    item = response.meta['product']
    grey_spans = response.xpath(
        '//form[@id="productForm"]//span[contains(@style, "color: #bbb")]/text()')
    ip_code = grey_spans.re_first(r'IPC (.*)')

    # The cache always holds a string; ``sku`` keeps the raw value, which
    # may be None when nothing matched.
    self.ip_codes[item['identifier']] = ip_code or ''
    item['sku'] = ip_code

    stock_code = find_mts_stock_code(
        item, spider_name=self.name, log=self.log, ip_code=ip_code)
    item['metadata']['mts_stock_code'] = stock_code
    yield item
def parse_ipcode(self, response):
    """Derive the IP code from the "Stock Number" text and finish the product.

    The product is taken from ``response.meta['product']``. The IP code is
    whatever follows the size/speed prefix and a two-character manufacturer
    id inside the stock number. When the stock number is missing or does not
    contain the expected prefix, the IP code stays ``''``. The code is cached
    in ``self.ip_codes``, stored as ``sku`` and passed to
    ``find_mts_stock_code``.

    Raises:
        ValueError: if ``metadata['full_tyre_size']`` does not have exactly
            five ``/``-separated parts.
    """
    product = response.meta['product']
    stock_number = response.xpath(
        '//p[@id="InnerPH_InnerPH_spec_detail"]//text()').re_first(
            r'Stock Number: (.*)')

    # full_tyre_size is width/ratio/rim/load/speed; the load index is not
    # part of the stock number prefix.
    width, ratio, rim, _load, speed = product['metadata'][
        'full_tyre_size'].split('/')
    size_speed = width + ratio + rim + speed

    ip_code = ''
    # ``re_first`` returns None when the page has no "Stock Number" text;
    # guard so that case falls back to an empty code instead of raising
    # TypeError on the ``in`` test.
    if stock_number and size_speed in stock_number:
        # ie: 2055516VBR7101
        # 2055516V + BR + 7101
        # Size + speed: 2055516V (width + ratio + rim + speed)
        # Manufacturer: BR (Two characters ID)
        # IP code: 7101
        ip_code = stock_number.split(size_speed)[-1][2:]

    self.ip_codes[product['identifier']] = ip_code
    product['sku'] = ip_code
    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log, ip_code=ip_code)
    yield product
def parse_product(self, response):
    """Build a single tyre product from the response and yield it.

    Yields ``self.retry_request(response)`` instead when the identifier form
    or the brand input is missing. Nothing is yielded when ``parse_pattern``
    cannot parse the cleaned product name (the error is logged and appended
    to ``self.errors``) or when ``is_product_correct`` rejects the item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    # the full name of the tyre (name variable) is used to extract metadata
    # (i.e. run flat, xl), the pattern should be set as the product's name

    loader.add_value('url', response.url)

    images = hxs.select('//img[@itemprop="image"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin(get_base_url(response), images[0]))

    form_actions = hxs.select('//form[@name="form1"]/@action').extract()
    if not form_actions:
        yield self.retry_request(response)
        return
    loader.add_value('identifier', form_actions[0])

    price_node = hxs.select('//*[@class="price"]/*[@class="mainPrice"]/text()')[0]
    loader.add_value('price', price_node.extract())
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)

    brands = hxs.select(
        '//div[@class="hidden"]/input[@class="producerName"]/@value').extract()
    if not brands:
        yield self.retry_request(response)
        return
    brand = brands[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))

    # Strip the brand (with U+0119 folded to "e"), colons and the trademark
    # sign from the heading before handing it to parse_pattern.
    brand = re.sub(u'\u0119', u'e', brand)
    heading = hxs.select('//h1[@itemprop="name"]/text()')[0].extract().strip()
    heading = re.sub(u'[:\u2122]', u'', heading)
    product_name = heading.replace(brand, '').strip()

    data = parse_pattern(product_name)
    if not data:
        error = 'ERROR parsing "{}" [{}]'.format(product_name, response.url)
        log.msg(error)
        self.errors.append(error)
        return

    loader.add_value('name', data['Name'])

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = 'Delivered'
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''
    metadata['xl'] = 'Yes' if 'XL' in product_name else 'No'

    lowered = product_name.lower()
    if 'run on flat' in lowered or 'run flat' in lowered:
        metadata['run_flat'] = 'Yes'
    else:
        metadata['run_flat'] = 'No'

    # First known mark that appears as a whole space-separated word.
    words = product_name.split(' ')
    found_marks = [mark for mark in self.all_man_marks.keys() if mark in words]
    mark = found_marks[0].strip() if found_marks else ''
    metadata['manufacturer_mark'] = self.all_man_marks.get(mark, '') if mark else ''

    metadata['full_tyre_size'] = '/'.join((
        metadata['width'],
        metadata['aspect_ratio'],
        metadata['rim'],
        metadata['load_rating'],
        metadata['speed_rating'],
    ))

    product = loader.load_item()
    product['metadata'] = metadata

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    scraped_speed = product['metadata']['speed_rating']
    # Prefer the computed alternative; otherwise keep the scraped rating as
    # the alternative only if it is about to be replaced.
    if new_alt_speed:
        alternative = new_alt_speed
    elif scraped_speed != new_speed_rating:
        alternative = scraped_speed
    else:
        alternative = ''
    product['metadata']['alternative_speed_rating'] = alternative
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse_products(self, response):
    """Yield the tyre products of a listing response, then the next page.

    The searched size comes from ``response.meta['product_data']``. The
    brand, load rating and pattern name are parsed out of each listing
    title. Listings whose link or title is missing, whose title cannot be
    parsed, or which ``is_product_correct`` rejects are skipped.
    """
    hxs = HtmlXPathSelector(response)
    search = response.meta['product_data']
    width = search['Width']
    aspect_ratio = search['Aspect Ratio']
    rim = search['Rim']
    speed_rating = search['Speed rating']
    alt_speed = search['Alt Speed']

    # Tried in order: searched speed rating, alternative speed rating, then
    # a fallback without any load/speed part.
    rated_template = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)'
    title_patterns = (
        rated_template % (width, rim, speed_rating.upper()),
        rated_template % (width, rim, alt_speed.upper()),
        r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim),
    )

    def log_parse_failure(title):
        self.log('Failed parsing ' + title)
        self.log('URL: ' + response.url)
        self.log('Params: ' + ", ".join(
            map(str, [width, rim, speed_rating.upper()])))

    legislation = './/div[@class="legislationContainer"]/ul[@class="legislation"]/li/a'

    listings = hxs.select(
        '//div[@id="product-listing"]//div[@class="product"]/..')
    for product_el in listings:
        loader = ProductLoader(item=Product(), selector=product_el)

        try:
            url = product_el.select(
                './/div[@class="title"]/a/@href')[0].extract()
        except:
            continue
        loader.add_value('url', url)

        compare_id = product_el.select(
            ".//span[@class='addcompare']/input/@id").extract()[0]
        loader.add_value('identifier', compare_id.split(":")[1])
        # loader.add_value('identifier', re.search('productId_(\d+)_', url).groups()[0])
        loader.add_xpath('price', './/span[@class="prodPirce"]/text()')

        try:
            name = product_el.select(
                './/div[@class="title"]/a/text()')[0].extract()
        except:
            continue

        run_flat_found = is_run_flat(name)

        if re.search(r'(\(.*\))', name):
            # Title carries a parenthesised part: brand is the first word,
            # the pattern name is everything between it and the "(".
            words = name.split()
            load_match = re.search(r'(\d+)%s' % speed_rating.upper(), name)
            if not load_match:
                load_match = re.search(r'(\d+)%s' % alt_speed.upper(), name)
            if not load_match:
                log_parse_failure(name)
                continue
            name_parts = [
                words[0],
                load_match.groups()[0],
                ' '.join(words[1:]).split('(')[0],
            ]
        else:
            # name = name.replace('/', '')
            title_match = None
            for pattern in title_patterns:
                title_match = re.search(pattern, name)
                if title_match:
                    break
            if not title_match:
                log_parse_failure(name)
                continue
            # NOTE(review): the fallback pattern has only two groups, so in
            # that case name_parts[1] (used as load rating below) is the
            # remainder of the title - confirm this is intended.
            name_parts = title_match.groups()

        pattern_name = name_parts[-1]
        loader.add_value(
            'name',
            pattern_name.replace('XL', '').replace('ROF', '').replace('RFT', ''))
        loader.add_value('brand', unify_brand(name_parts[0]))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        loader.add_xpath('image_url',
                         './/a[contains(@class, "tyre")]/img/@src')

        upper_name = name.upper()
        meta = MicheldeverMeta()
        meta['aspect_ratio'] = aspect_ratio
        meta['rim'] = rim
        meta['width'] = width
        meta['speed_rating'] = speed_rating.upper()
        meta['load_rating'] = name_parts[1]
        is_rof = 'ROF' in upper_name or 'RFT' in upper_name or run_flat_found
        meta['run_flat'] = 'Yes' if is_rof else 'No'
        meta['xl'] = 'Yes' if 'XL' in upper_name else 'No'
        meta['full_tyre_size'] = '/'.join((
            meta['width'],
            meta['aspect_ratio'],
            meta['rim'],
            meta['load_rating'],
            meta['speed_rating'],
        ))
        meta['fitting_method'] = 'Fitted'
        meta['manufacturer_mark'] = self._get_manufacturer_code(pattern_name)

        fuel = product_el.select(
            legislation + '[contains(@class, "fuel_")]/@class').re(r'fuel_(\w)')
        meta['fuel'] = fuel[0] if fuel else ''
        grip = product_el.select(
            legislation + '[contains(@class, "grip_")]/@class').re(r'grip_(\w)')
        meta['grip'] = grip[0] if grip else ''
        noise = product_el.select(
            legislation + '[contains(@class, "noise_")]/@class').re(r'_(\d+)')
        meta['noise'] = noise[-1] if noise else ''

        product = loader.load_item()
        product['metadata'] = meta

        if not is_product_correct(product):
            self.log('The product is not correct: %r' % product)
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product

    next_links = hxs.select('//span[@class="nextlink"]/a/@href')
    if next_links:
        yield Request(next_links.extract()[0],
                      callback=self.parse_products,
                      meta=response.meta)
def parse_product(self, response):
    """Build a single tyre product from the search-result table and yield it.

    Nothing is yielded when the tread name is missing, when
    ``parse_pattern`` cannot parse the tyre description (both cases are
    logged and appended to ``self.errors``), or when ``is_product_correct``
    rejects the item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    # the full name of the tyre (name variable) is used to extract metadata
    # (i.e. run flat, xl), the pattern should be set as the product's name

    name = hxs.select('//td[@class="tread"]/text()').extract()
    if not name:
        msg = "No name found on page: %s" % response.url
        self.errors.append(msg)
        self.log("[ERROR] %s" % msg)
        return
    loader.add_value('name', name[0])

    brand = hxs.select(
        '//table[@class="single searchresults"]//td[@class="tyreinfo"]/b/text()'
    ).extract()[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category', find_brand_segment(brand))

    fitting_method = 'Delivered'
    loader.add_value('url', response.url)

    out_of_stock = hxs.select(
        '//table[@class="single searchresults"]//span[@class="outofstock"]')
    if out_of_stock:
        loader.add_value('stock', 0)

    image_url = hxs.select(
        '//table[@class="single searchresults"]//td[@class="logo-pic"]/img/@src'
    ).extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), image_url[0]))

    identifier = hxs.select(
        '//table[@class="single searchresults"]//form/input[@name="pid"]/@value'
    )[0].extract()
    loader.add_value('identifier', identifier)

    price = hxs.select(
        '//table[@class="single searchresults"]//td[@class="netprice"]/text()'
    )[0].extract()
    loader.add_value('price', price)

    # From here on ``name`` is the full tyre description, not the tread name
    # that was stored on the item above.
    name = hxs.select(
        '//table[@class="single searchresults"]//td[@class="tyreinfo"]/span/text()'
    )[0].extract()
    data = parse_pattern(name)
    if not data:
        error = 'ERROR parsing "{}" [{}]'.format(name, response.url)
        log.msg(error)
        self.errors.append(error)
        return

    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = fitting_method
    # Fall back to '' like the sibling callbacks do: a None load rating
    # would make the '/'.join below raise TypeError.
    metadata['load_rating'] = data['Load_Rating'] or ''
    metadata['alternative_speed_rating'] = ''
    metadata['xl'] = 'Yes' if 'XL' in name else 'No'
    metadata['run_flat'] = 'Yes' if 'rflat' in name.lower() else 'No'

    if '*' in name:
        manufacturer_mark = '*'
    else:
        words = name.split(' ')
        found_marks = [mark for mark in self.all_man_marks.keys()
                       if mark in words]
        manufacturer_mark = found_marks[0].strip() if found_marks else ''
    metadata['manufacturer_mark'] = (
        self.all_man_marks.get(manufacturer_mark, '')
        if manufacturer_mark else '')

    metadata['mts_stock_code'] = ''
    metadata['full_tyre_size'] = '/'.join((
        metadata['width'],
        metadata['aspect_ratio'],
        metadata['rim'],
        metadata['load_rating'],
        metadata['speed_rating'],
    ))

    product = loader.load_item()
    product['metadata'] = metadata

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product)
    new_alt_speed = get_alt_speed(product)
    scraped_speed = product['metadata']['speed_rating']
    # Prefer the computed alternative; otherwise keep the scraped rating as
    # the alternative only if it is about to be replaced.
    if new_alt_speed:
        alternative = new_alt_speed
    elif scraped_speed != new_speed_rating:
        alternative = scraped_speed
    else:
        alternative = ''
    product['metadata']['alternative_speed_rating'] = alternative
    product['metadata']['speed_rating'] = new_speed_rating
    yield product
def parse_product(self, response):
    """Build a single fitted tyre product from the response and yield it.

    Nothing is yielded when the title contains "winter", when
    ``parse_tyre_size`` returns no width for the size heading (logged and
    appended to ``self.errors``), or when ``is_product_correct`` rejects
    the item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    title = hxs.select('//h2/text()')[0].extract()
    if 'winter' in title.lower():
        return

    brand = title.split(' ')[0]
    price = hxs.select(
        '//td[contains(text(), "1 Tyre")]/following-sibling::td[@class="align-right"]/strong/text()'
    )[0].extract()

    # fix wrong product
    if brand.strip() == 'R27':
        loader.add_value('name', title.replace('XL', '').replace('RF', ''))
        brand = 'Toyo'
    else:
        loader.add_value(
            'name',
            title.replace(brand, '').replace('XL', '').replace('RF', ''))

    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('identifier',
                     '//div[@class="hiddenFields"]/input[@name="sku"]/@value')

    image_url = hxs.select(
        '//div[contains(@class, "sidebar")]/div[@class="align-center"]/img/@src'
    )[0].extract()
    loader.add_value('image_url', image_url)

    speed_rating = hxs.select(
        '//table[@class="blank-table"]//strong[contains(text(), "Speed Rating:")]/parent::td/following-sibling::td/text()'
    ).extract()[0]
    # The load cell may include the speed letter; remove it.
    load_rating = hxs.select(
        '//table[@class="blank-table"]//strong[contains(text(), "Load Index:")]/parent::td/following-sibling::td/text()'
    ).extract()[0].replace(speed_rating, "")

    size = hxs.select('//h3/text()')[0].extract()
    width, aspect_ratio, _, rim = parse_tyre_size(size)
    if not width:
        msg = "Error parsing '%s' on page %s" % (size, response.url)
        self.log(msg)
        self.errors.append(msg)
        return

    m = MicheldeverMeta()
    m['aspect_ratio'] = aspect_ratio
    m['rim'] = rim
    m['width'] = width
    m['speed_rating'] = speed_rating.upper()
    m['load_rating'] = load_rating

    run_flat_found = is_run_flat(title)
    upper_title = title.upper()
    # NOTE(review): 'RF' is a plain substring test, so it also matches
    # titles that merely contain those letters (e.g. "PERFORMANCE") -
    # confirm whether a whole-word match is wanted.
    if 'RUNFLAT' in upper_title or 'RF' in upper_title or run_flat_found:
        m['run_flat'] = 'Yes'
    else:
        m['run_flat'] = 'No'
    m['xl'] = 'Yes' if 'XL' in upper_title else 'No'

    m['full_tyre_size'] = '/'.join((m['width'], m['aspect_ratio'], m['rim'],
                                    m['load_rating'], m['speed_rating']))
    m['fitting_method'] = 'Fitted'
    m['manufacturer_mark'] = self._get_manufacturer_code(title)

    try:
        fuel, grip, noise = hxs.select(
            '//div[@class="eu-label"]//span/text()').extract()
    except ValueError:
        # The label block did not contain exactly three values; leave the
        # EU label fields empty. Other exceptions are real errors and are
        # allowed to propagate.
        fuel, grip, noise = ('', '', '')
    m['fuel'] = fuel
    m['grip'] = grip
    m['noise'] = noise.replace('dB', '')

    product = loader.load_item()
    product['metadata'] = m

    if not is_product_correct(product):
        return

    product['metadata']['mts_stock_code'] = find_mts_stock_code(
        product, spider_name=self.name, log=self.log)
    yield product
def parse(self, response):
    """Yield the tyre products of a search response, then the next page.

    Size fields come from ``response.meta['row']``; brand, name, price and
    label values are scraped per product block. Blocks without a heading or
    a price are skipped. A block without an identifier is logged and stops
    processing of the whole response, so no next-page request is issued
    either.
    """
    row = response.meta['row']

    product_blocks = response.xpath(
        '//div[contains(@class, "product-recommended")]')
    product_blocks += response.xpath(
        '//div[@class="product-section"]/div[contains(@class, "product")]')

    for product_el in product_blocks:
        loader = ProductLoader(item=Product(), selector=product_el)

        brand_values = product_el.xpath(
            './/input[@name="brand"]/@value').extract()
        brand = brand_values[0] if brand_values else ''
        # Replace the scraped brand with the configured spelling.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        full_name = ''.join(product_el.xpath('.//h2/text()').extract())
        if not full_name:
            continue

        # Everything after the brand in the heading is the product name.
        pieces = re.split(brand, full_name, flags=re.I)
        name = ' '.join(pieces[1:]).strip()
        loader.add_value('name', name)
        # loader.add_value('name', full_name.split(brand)[-1])
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))

        prod_codes = product_el.xpath(
            './/input[@name="prodCode"]/@value').extract()
        if not prod_codes:
            self.log('Product without identifier')
            search_params = '/'.join([
                row['Aspect Ratio'], row['Rim'], row['Width'],
                row['Alt Speed']
            ])
            self.log('Search parameters: ' + search_params)
            return
        identifier = prod_codes[0]

        loader.add_value('url', response.url)
        images = product_el.xpath(
            './/div[contains(@class, "product-im")]/img/@src').extract()
        if images:
            loader.add_value('image_url', response.urljoin(images[0]))
        loader.add_value('identifier', identifier)

        price = ''.join(
            product_el.xpath('.//*[@class="price"]//text()').re(r'[\d\.,]+'))
        if not price:
            continue
        loader.add_value('price', price)

        # First whitespace-delimited digits+word token in the heading: its
        # last character is used as speed rating, the rest as load rating.
        load_speed = re.search('(\s\d+\w+\s)', full_name)
        if load_speed:
            token = load_speed.group().strip()
            speed_rating = token[-1]
            load_rating = token[:-1]
        else:
            speed_rating = ''
            load_rating = ''

        upper_name = full_name.upper()
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in upper_name else 'No'
        run_flat_found = is_run_flat(full_name)
        if 'RUNFLAT' in upper_name or run_flat_found:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'
        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)
        metadata['full_tyre_size'] = '/'.join((
            row['Width'],
            row['Aspect Ratio'],
            row['Rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        try:
            fuel, grip, noise = map(
                unicode.strip,
                product_el.xpath(
                    './/div[contains(@class, "feature-image") or contains(@class, "feature-block")]'
                    '//span[@class="icon-text"]/text()').extract())
        except:
            fuel, grip, noise = '', '', ''
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product

    next_page = response.xpath(
        u'//ul[@class="pagination"]//a[contains(text(), ">")]/@data-url'
    ).extract()
    if next_page:
        yield Request(next_page[0], dont_filter=True, meta=response.meta)
def parse_product(self, response):
    """Yield one tyre product per size option found in the response.

    The brand comes from ``response.meta['brand']`` and is removed from the
    heading to form the product name. Options without an identifier, with
    an unparsable size description (logged and appended to ``self.errors``)
    or rejected by ``is_product_correct`` are skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_loader = ProductLoader(item=Product(), selector=hxs)
    # the full name of the tyre (name variable) is used to extract metadata
    # (i.e. run flat, xl), the pattern should be set as the product's name

    brand = response.meta.get('brand') or ''
    heading = hxs.select('//h2[@class="heading black"]/text()')[0].extract().strip()
    product_name = re.sub(brand, '', heading).strip()

    base_loader.add_value('url', response.url)

    images = hxs.select('//div[@class="item"]/a/img/@src').extract()

    option_blocks = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
    for option in option_blocks:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', response.url)
        if images:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), images[0]))

        # Identifier: hidden item_id input, else the id embedded in the
        # "email_me_stock" link.
        identifier = option.select(
            '../input[@type="hidden" and @name="item_id"]/@value').extract()
        if not identifier:
            identifier = option.select('./a/@href').re('email_me_stock/(.*)')
        if not identifier:
            continue
        loader.add_value('identifier', identifier[0])

        price = option.select(
            './strong[@class="price" and not(contains(text(),"On Backorder"))]/text()'
        ).extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # No price on the page: fall back to the price passed in meta,
            # or to zero.
            fallback_price = response.meta.get('price')
            if fallback_price:
                loader.add_value('price', response.meta['price'])
            else:
                loader.add_value('price', '0.00')
            # NOTE(review): the original formatting was lost; stock=0 is
            # assumed to apply whenever the page shows no price - confirm.
            loader.add_value('stock', 0)

        pattern_name = option.select('./p/strong/text()').extract()
        if not pattern_name:
            pattern_name = option.select('./strong/text()').extract()
        pattern_name = pattern_name[0]

        size_match = re.search('(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)', pattern_name)
        if not size_match:
            msg = 'ERROR parsing "{}" [{}]'.format(pattern_name, response.url)
            log.msg(msg)
            self.errors.append(msg)
            continue
        data = size_match.groupdict()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

        lowered = pattern_name.lower()
        if 'run flat' in lowered or 'runflat' in lowered:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        # First known mark that appears as a whole space-separated word.
        words = pattern_name.split(' ')
        found_marks = [mark for mark in self.all_man_marks.keys()
                       if mark in words]
        mark = found_marks[0].strip() if found_marks else ''
        metadata['manufacturer_mark'] = find_man_mark(mark) if mark else ''

        metadata['full_tyre_size'] = '/'.join((
            metadata['width'],
            metadata['aspect_ratio'],
            metadata['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # Prefer the computed alternative; otherwise keep the scraped rating
        # as the alternative only if it is about to be replaced.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_search(self, response):
    """Yield the non-winter tyres of a search response, then further pages.

    Entries marked "Winter", entries whose dimension text ``parse_pattern``
    cannot parse (logged only) and entries rejected by
    ``is_product_correct`` are skipped. After the products, one request is
    yielded for every page number listed in the paginator other than "1".
    """
    base_url = get_base_url(response)
    entries = response.xpath('//ul[contains(@class, "c-list-classic") and contains(@class, "m-produit-res")]/li')
    pages = response.xpath('//ul[contains(@class, "paginator")]/li[not(@data-page="1")]/@data-page').extract()

    for product_el in entries:
        url = product_el.xpath('.//a[contains(@class, "u-semi-link")]/@href')[0].extract()
        winter_tyre = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()').re('Winter')
        if winter_tyre:
            continue

        loader = ProductLoader(item=Product(), selector=product_el)
        # the full name of the tyre (name variable) is used to extract
        # metadata (i.e. run flat, xl), the pattern should be set as the
        # product's name
        loader.add_xpath('name', './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')

        brand = product_el.xpath('.//span[@class="m-produit-bloc-res-lst__fab"]/text()').extract()
        if brand:
            brand = brand[0].strip()
            loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))

        loader.add_value('url', urljoin(base_url, url))

        images = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__image"]//img/@src').extract()
        if images:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), images[0]))

        identifier = product_el.xpath('.//button/@data-id')[0].extract()
        loader.add_value('identifier', identifier)

        price = product_el.xpath('.//div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()')[0].extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        # Dimension text with non-breaking spaces normalised.
        name = product_el.xpath('.//div[@class="m-produit-bloc-res-lst__dim"]/text()')[0].extract().strip().replace(u'\xa0', u' ')
        data = parse_pattern(name)
        if not data:
            self.log('ERROR parsing "{}" [{}]'.format(name, response.url))
            # self.errors.append('ERROR parsing "{}" [{}]'.format(name, response.url))
            continue

        additional_data = ' '.join(product_el.xpath('.//ul[@class="m-produit__carac c-list-horizontale"]/li/text()').extract())

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in additional_data else 'No'

        run_flat_found = is_run_flat('%s %s %s' % (loader.get_output_value('name'), name, additional_data))
        if 'runflat' in additional_data.lower() or run_flat_found:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        # First known mark found in the characteristics text, optionally
        # wrapped in parentheses.
        found_marks = [
            mark for mark in self.all_man_marks.keys()
            if re.search('\(?{}\)?'.format(mark.replace('*', '\*')), additional_data)
        ]
        mark = found_marks[0].strip() if found_marks else ''
        metadata['manufacturer_mark'] = self.all_man_marks.get(mark, '') if mark else ''

        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join((
            metadata['width'],
            metadata['aspect_ratio'],
            metadata['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        try:
            fuel, grip, noise = map(unicode.strip, product_el.xpath('.//div[@class="m-produit-bloc-res-lst__etiq hide-for-small"]'
                                                                    '/ul[@class="m-etiq-light"]/li/div[contains(@class, "m-etiq-light__note")]/text()').extract())
        except:
            fuel, grip, noise = '', '', ''
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise.replace('dB', '')

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product

    for page_no in pages:
        meta = response.meta.copy()
        page_url = add_or_replace_parameter(
            self.search_url % meta['row'], 'page', page_no)
        yield Request(page_url, meta=meta, callback=self.parse_search)
def parse_products(self, response):
    """Yield tyre products decoded from a doubly JSON-encoded response.

    The response body is JSON whose ``d`` member is itself a JSON string
    holding the product list. Winter tyres and products rejected by
    ``is_product_correct`` are skipped.
    """
    envelope = json.loads(response.body)
    entries = json.loads(envelope.get('d'))

    for product_el in entries:
        loader = ProductLoader(item=Product(), selector=product_el)

        try:
            brand = product_el[u'ProductManufacturer'][u'TyreManufacturerName']
        except:
            brand = ''

        # skip winter tyres
        if product_el[u'ProductAttributes'][u'IsWinter']:
            continue

        # Replace the scraped brand with the configured spelling.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        try:
            full_name = product_el[u'ProductTreadPattern'][u'TreadName']
        except:
            full_name = ''

        # Fix name changes
        if full_name in self.new_old_names:
            full_name = self.new_old_names[full_name]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))

        identifier = product_el.get('TyreID')
        loader.add_value('url', 'http://www.tyresonthedrive.com')
        tread_image = product_el[u'ProductTreadPattern'][u'TreadPatternImage']
        loader.add_value(
            'image_url',
            'http://www.tyresonthedrive.com/img/treads/' + tread_image + '.jpg')
        loader.add_value('identifier', identifier)

        price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = str(product_el[u'ProductAttributes'][u'Profile'])
        metadata['rim'] = str(product_el[u'ProductAttributes'][u'Rim'])
        metadata['speed_rating'] = str(product_el[u'ProductAttributes'][u'Speed'])
        metadata['load_rating'] = str(product_el[u'ProductAttributes'][u'Load'])
        metadata['width'] = str(product_el[u'ProductAttributes'][u'Section'])
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''

        if product_el[u'ProductAttributes'][u'IsExLoad']:
            metadata['xl'] = 'Yes'
        else:
            metadata['xl'] = 'No'
        if product_el[u'ProductAttributes'][u'IsRunFlat']:
            metadata['run_flat'] = 'Yes'
        else:
            metadata['run_flat'] = 'No'

        man_mark = product_el[u'ProductAttributes'][u'OEMFitment']
        metadata['manufacturer_mark'] = find_man_mark(man_mark) if man_mark else ''

        metadata['full_tyre_size'] = '/'.join((
            metadata['width'],
            metadata['aspect_ratio'],
            metadata['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # Prefer the computed alternative; otherwise keep the scraped rating
        # as the alternative only if it is about to be replaced.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_products(self, response):
    """Yield tyres from the HTML fragment in a JSON response, then page on.

    The fragment is the ``display_tyres`` member of the JSON body. Size
    fields come from ``response.meta['search_params']``. While the fragment
    contains at least one tyre container, a request for the next page
    number is yielded and ``response.meta['page']`` is incremented in place.
    """
    html_fragment = json.loads(response.body)['display_tyres']
    hxs = HtmlXPathSelector(text=html_fragment)
    search_params = response.meta['search_params']

    containers = hxs.select('//div[contains(@class, "tyre_container")]')
    for product_el in containers:
        loader = ProductLoader(item=Product(), selector=product_el)

        brand_texts = product_el.select(
            './/form/span[@class="tyre_brand_text"]/text()').extract()
        brand = brand_texts[0] if brand_texts else ''

        # NOTE(review): this queries the whole fragment with an absolute
        # path rather than the current product element - confirm that this
        # is the intended winter-tyre check.
        winter_tyre = hxs.select(
            '/div/div/div[@class="winter_img"]').extract()
        if winter_tyre:
            continue

        # Replace the scraped brand with the configured spelling.
        for tyre_brand in self.brands:
            if tyre_brand.upper() == brand.strip().upper():
                brand = tyre_brand

        # The last brand-text span holds the tyre name.
        full_name = brand_texts[-1]

        loader.add_value('name', full_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))

        identifier = product_el.select(
            './/input[@name="tyre"]/@value').extract()
        loader.add_value('identifier', identifier)
        loader.add_value('url', 'http://www.tyregiant.com')

        images = product_el.select(
            './/img[@class="tyre_image"]/@src').extract()
        if images:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), images[0]))

        price = product_el.select(
            './/*[@class="tyre_price"]/span/text()').extract()
        if not price:
            loader.add_value('stock', 0)
        loader.add_value('price', price)

        tyre_details = product_el.select(
            './/form/p[@class="tyre_details"]/text()').extract()[0]
        # First whitespace-delimited digits+word token: its last character
        # is used as speed rating, the rest as load rating.
        load_speed = re.search('(\s\d+\w+\s)', tyre_details)
        if load_speed:
            token = load_speed.group().strip()
            load_rating = token[:-1]
            speed_rating = token[-1]
        else:
            load_rating = ''
            speed_rating = ''

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = search_params['aspect_ratio']
        metadata['rim'] = search_params['rim']
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = search_params['width']
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''

        xl_icon = product_el.select('.//img[@class="xl_img"]/@src').extract()
        metadata['xl'] = 'Yes' if xl_icon else 'No'
        rf_icon = product_el.select('.//img[@class="rf_img"]/@src').extract()
        metadata['run_flat'] = 'Yes' if rf_icon else 'No'

        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)
        metadata['full_tyre_size'] = '/'.join((
            search_params['width'],
            search_params['aspect_ratio'],
            search_params['rim'],
            metadata['load_rating'],
            metadata['speed_rating'],
        ))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # Prefer the computed alternative; otherwise keep the scraped rating
        # as the alternative only if it is about to be replaced.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    if containers:
        meta = response.meta
        next_page = meta['page'] + 1
        next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(next_page)
        meta['page'] = next_page
        yield Request(next_url,
                      dont_filter=True,
                      callback=self.parse_products,
                      meta=meta)
def parse(self, response):
    """Parse a result list page: follow pagination and yield products.

    When a "Next" pager entry exists, the ``formdata`` dict stored in
    ``response.meta`` gets its ``page`` updated and a ``FormRequest`` to
    the same URL is yielded.  Each list item whose season text does not
    match ``Winter`` is then converted into a product with a
    ``MicheldeverMeta``.  Items whose dimension text cannot be parsed by
    ``parse_pattern`` are logged, recorded in ``self.errors`` and
    skipped; items rejected by ``is_product_correct`` are skipped.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products = hxs.select(
        '//ul[@class="c-list-classic c-list-classic-liste m-produit-res"]/li'
    )

    next_page = hxs.select(
        '//li[a[span[text()="Next"]]]/@data-page').extract()
    # pagination
    if next_page:
        formdata = response.meta.get('formdata')
        formdata['page'] = next_page[0]
        yield FormRequest(response.url,
                          formdata=formdata,
                          dont_filter=True,
                          meta=response.meta)

    for product_el in products:
        url = product_el.select(
            './/a[@class="u-semi-link"]/@href')[0].extract()
        winter_tyre = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__gamme-saison"]/text()'
        ).re('Winter')
        if winter_tyre:
            continue

        loader = ProductLoader(item=Product(), selector=product_el)
        # the full name of the tyre (name variable) is used to extract
        # metadata (i.e. run flat, xl), the pattern should be set as the
        # product's name
        loader.add_xpath(
            'name', './/span[@class="m-produit-bloc-res-lst__dcp"]/text()')

        brand = product_el.select(
            './/span[@class="m-produit-bloc-res-lst__fab"]/text()'
        ).extract()
        if brand:
            brand = brand[0].strip()
            loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        fitting_method = 'Delivered'
        loader.add_value('url', urljoin(base_url, url))

        image_url = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__image"]//img/@src'
        ).extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        identifier = product_el.select(
            './/button/@data-id')[0].extract()
        loader.add_value('identifier', identifier)

        price = product_el.select(
            './/div[@class="c-qte-prix__prix m-produit-bloc-res-lst__prix"]/text()'
        )[0].extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        name = product_el.select(
            './/div[@class="m-produit-bloc-res-lst__dim"]/text()'
        )[0].extract().strip().replace(u'\xa0', u' ')
        data = parse_pattern(name)
        if not data:
            # Build the message once so the log and self.errors agree.
            error_msg = 'ERROR parsing "{}" [{}]'.format(name, response.url)
            log.msg(error_msg)
            self.errors.append(error_msg)
            continue

        additional_data = ' '.join(
            product_el.select(
                './/ul[@class="m-produit__carac c-list-horizontale"]/li/text()'
            ).extract())

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in additional_data else 'No'
        run_flat = 'runflat' in additional_data.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # Marks are literal text; escape them fully so characters with a
        # regex meaning (not only '*') cannot alter or break the pattern.
        matching_marks = [
            mark for mark in self.all_man_marks.keys()
            if re.search(r'\(?{}\)?'.format(re.escape(mark)),
                         additional_data)
        ]
        mark_key = matching_marks[0].strip() if matching_marks else ''
        metadata['manufacturer_mark'] = (
            self.all_man_marks.get(mark_key, '') if mark_key else '')

        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            # The scraped rating was replaced; keep it as the alternative.
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_products(self, response):
    """Yield one product per tyre cell found in the ``tyreResults`` table.

    The cell's sub title is parsed with ``parse_title_new``; width, aspect
    ratio and rim come from ``response.meta['row']``.  Cells without a
    title, with a brand that is not a key of ``self.brand_fixes`` after
    normalisation, or rejected by ``is_product_correct`` are skipped.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']

    cells = hxs.select(
        '//*[@id="tyreResults"]//tr[contains(@class, "tyre")]//td[@class != "gutter"]'
    )
    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)

        title_nodes = cell.select('.//p[@class="subTitle"]/text()').extract()
        if not title_nodes:
            continue
        # Collapse every run of whitespace in the title to a single space.
        title = ' '.join(title_nodes[0].split())

        parsed_title = parse_title_new(title)
        brand = parsed_title['brand']
        load_rating = parsed_title['load_rating']
        speed_rating = parsed_title['speed_rating']
        name = parsed_title['name']
        if not name or not brand:
            # Only logged; processing of the cell carries on.
            self.log(
                "++++++++++++++++++++++++++++{}==================".format(
                    title))
            # self.errors.append("Error parsing title: %s" % title)

        # Map a known alternative spelling onto its canonical brand key.
        lowered_brand = brand.lower()
        for fixed_brand, brand_spellings in self.brand_fixes.iteritems():
            if lowered_brand in brand_spellings:
                brand = fixed_brand
                break
        brand = brand.title()
        if brand not in self.brand_fixes:
            self.log('Wrong brand %s' % brand)
            continue

        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        # The price is split between the <h6> text and its <sup> child.
        price_main = cell.select('.//h6[@class="price"]/text()').extract()[0]
        price_sup = cell.select(
            './/h6[@class="price"]/sup/text()').extract()[0]
        loader.add_value('price', extract_price(price_main + price_sup))

        buy_href = cell.select(
            './a[@class="btnBuy png_bg"]/@href').extract()[0]
        loader.add_value('identifier', buy_href.split('/')[-1])
        loader.add_value('url', '')

        image_srcs = cell.select('.//img[@class="tyreImg"]/@src').extract()
        if image_srcs:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_srcs[0]))

        metadata = MicheldeverMeta()
        metadata['onsite_name'] = title
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating

        # Each filter below returns what it found plus the remaining name.
        self.log("===============matching================")
        self.log(str(name))
        metadata['manufacturer_mark'], name = filter_man_code(
            name, self.all_man_marks, self.custom_man_marks)
        self.log(str((metadata['manufacturer_mark'], name)))

        xl_found, name = filter_xl(name)
        metadata['xl'] = "Yes" if xl_found else "No"
        self.log(str((metadata['xl'], name)))

        # is_run_flat is checked on the name before filter_run_flat
        # returns its reduced version.
        run_flat_found = is_run_flat(name)
        run_flat_filtered, name = filter_run_flat(name)
        if run_flat_filtered or run_flat_found:
            metadata['run_flat'] = "Yes"
        else:
            metadata['run_flat'] = "No"
        self.log(str((metadata['run_flat'], name)))
        self.log("===============/matching===============")

        if name.endswith('('):
            name = name[:-1]
        loader.add_value('name', name.strip())

        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             load_rating, speed_rating))

        # Tyre label values are read out of the label icons' markup.
        fuel = cell.select(
            './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=fuel")]'
        ).re(r'rr=(\w)')
        metadata['fuel'] = fuel[0] if fuel else ''
        grip = cell.select(
            './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=wet")]'
        ).re(r'wg=(\w)')
        metadata['grip'] = grip[0] if grip else ''
        noise = cell.select(
            './/div[@class="tyreLabel"]/span/img[contains(@src, "icon=noise")]'
        ).re(r'db=(\d+)')
        metadata['noise'] = noise[0] if noise else ''

        item = loader.load_item()
        item['metadata'] = metadata
        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)
        yield item
def parse(self, response):
    """Extract products from the page's embedded ``JsonObject`` payload.

    The body is scanned line by line for ``JsonObject = ``; the JSON on
    the last such line is used and its ``Rest`` and ``Deals`` lists are
    processed.  If no such line exists, the problem is logged and nothing
    is yielded (previously this raised a ``TypeError`` on ``None``).

    Winter tyres, identifiers already in ``self.seen_ids`` and products
    rejected by ``is_product_correct`` are skipped.  Among the remaining
    products that share brand, name, fitting method, full tyre size, XL,
    run-flat and manufacturer mark, only the cheapest one is yielded.
    """
    row = response.meta['row']

    json_data = None
    for line in response.body.split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))
    if json_data is None:
        self.log('JsonObject not found in response: {}'.format(
            response.meta))
        return

    products = json_data['Rest'] + json_data['Deals']
    collected_products = []
    self.log('Results found {} {}'.format(len(products), response.meta))

    fitting_method = 'Fitted'
    for product_info in products:
        # skip winter tyres
        if product_info['WinterTyre']:
            continue

        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])
        loader.add_value('brand', unify_brand(product_info['Manufacturer']))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        identifier = product_info['PrimaryId']
        product_id = str(identifier) + '-' + fitting_method
        if product_id in self.seen_ids:
            continue

        url = ('/catalogue' + product_info['CatalogueUrl'] + '/f?tyre=' +
               str(product_info['PrimaryId']))
        loader.add_value('url', response.urljoin(url))

        image_url = (product_info.get('ModelImageLarge') or
                     product_info.get('ModelImage'))
        if image_url:
            # The field may hold markup; take what follows the last
            # 'src="' up to the next double quote.
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url', response.urljoin(image_url))

        spec = product_info['SpecificationName']
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        # The last whitespace-separated token is used as speed rating.
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['load_rating'] = product_info['LoadRatingName']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if product_info['Reinforced'] else 'No'
        run_flat_found = is_run_flat(product_info['ModelName'])
        run_flat = product_info['RunFlat']
        metadata['run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()
        full_tyre_size = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))

        # MOE Exception for this product
        is_moe_exception = (
            manufacturer_mark
            and 'MO EXTENDED' in product_info['Variant'].upper()
            and product_info['ModelName'] == 'Potenza S001'
            and full_tyre_size == '245/40/18/97/Y')
        if is_moe_exception:
            metadata['manufacturer_mark'] = 'MOE'
        elif manufacturer_mark:
            metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark)
        else:
            metadata['manufacturer_mark'] = ''
        metadata['full_tyre_size'] = full_tyre_size

        # Tyre label values are best-effort: any lookup failure gives ''.
        label_fields = (
            ('fuel', 'TyreLabelFuel', 'Score'),
            ('grip', 'TyreLabelWet', 'Score'),
            ('noise', 'TyreLabelNoise', 'NoiseLevel'),
        )
        for meta_key, label_key, value_key in label_fields:
            try:
                metadata[meta_key] = product_info[label_key][value_key]
            except Exception:
                metadata[meta_key] = ''

        product = loader.load_item()
        product['metadata'] = metadata
        product['price'] = product_info['FullyFittedPrice']
        product['identifier'] = product_id
        product['metadata']['fitting_method'] = fitting_method

        t1 = time.time()
        if not is_product_correct(product):
            self.log('Search: {}'.format(str(response.meta)))
            # Remember the rejected id so it is skipped on later pages.
            self.seen_ids.add(product_id)
            self.log('PRODUCT IS NOT CORRECT => %r' % product)
            continue
        t2 = time.time()
        self.log('Time taken by product correct: {}'.format(t2 - t1))

        t1 = time.time()
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        t2 = time.time()
        self.log('Time taken by mts stock: {}'.format(t2 - t1))

        collected_products.append(product)

    # Keep only the cheapest product for every distinct tyre description.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'], product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'],
            product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        cheapest = min_price_products.get(key)
        if cheapest is None or product['price'] < cheapest['price']:
            min_price_products[key] = product

    for product in min_price_products.values():
        self.seen_ids.add(product['identifier'])
        yield product
def parse(self, response):
    """Yield products, or image requests, for each tyre row on the page.

    Rows whose link text contains "winter" are skipped, as are rows
    without an identifier or pattern text, rows whose pattern text does
    not match the size regex (logged) and products rejected by
    ``is_product_correct`` (logged with a running count).

    A product whose URL is a key of ``self.images`` is yielded directly
    with that image; otherwise a request for the product URL is yielded
    with ``self.parse_image`` as callback and the product in ``meta``.

    The former pagination branch could never run (``next_page`` was
    hard-coded to an empty list) and has been removed.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//tr[contains(@class,"tyre-search-row")]')

    not_found_count = 0
    for row_el in rows:
        url = row_el.select('.//td/b/a/@href')[0].extract()
        link_text = row_el.select('.//td/b/a/text()')[0].extract()
        if 'winter' in link_text.lower():
            continue

        # The brand is derived from the brand logo's file name.
        brand_src = row_el.select('.//a/img/@src')[0].extract()
        brand = re.search(r'/public/brands/(.*?)(-tyres)?\.',
                          brand_src).group(1).replace('-', ' ').title()
        # The brand is literal text: escape it so that characters with a
        # regex meaning cannot change what is removed from the name.
        product_name = re.sub(re.escape(brand), '', link_text).strip()

        fitting_method = 'Delivered'

        identifier = row_el.select(
            './/input[@name="item_id"]/@value').extract()
        if not identifier:
            identifier = row_el.select('.//a/@href').re(
                'email_me_stock/(.*)')
        if not identifier:
            continue

        try:
            fuel, grip, noise = map(
                unicode.strip,
                row_el.select(
                    './/img[contains(@alt, "Tyre Label")]/following-sibling::text()'
                ).extract())
        except (TypeError, ValueError):
            # Label values are optional: anything other than exactly three
            # text nodes leaves them blank.
            fuel = ''
            grip = ''
            noise = ''

        price = row_el.select("td[3]/b/text()").extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier[0])
        loader.add_value('name', product_name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('url', url)
        if price:
            loader.add_value('price', price[0])
        else:
            loader.add_value('price', '0.00')
            loader.add_value('stock', 0)

        pattern_name = row_el.select('.//i/text()').extract()
        if not pattern_name:
            continue
        pattern_name = pattern_name[0]

        size_match = re.search(
            r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) '
            r'(?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
            pattern_name)
        if not size_match:
            self.log('ERROR parsing "{}" [{}]'.format(
                pattern_name, response.url))
            continue
        data = size_match.groupdict()

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating'].upper()
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in pattern_name else 'No'

        run_flat_found = is_run_flat(pattern_name)
        lowered_pattern = pattern_name.lower()
        run_flat = ('run flat' in lowered_pattern
                    or 'runflat' in lowered_pattern
                    or run_flat_found)
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # A mark only counts when it is a whole space-separated token.
        pattern_tokens = pattern_name.split(' ')
        matching_marks = [
            mark for mark in self.all_man_marks.keys()
            if mark in pattern_tokens
        ]
        mark = matching_marks[0].strip() if matching_marks else ''
        metadata['manufacturer_mark'] = find_man_mark(mark) if mark else ''

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            not_found_count += 1
            self.log('%s - PRODUCT IS NOT CORRECT: %r' %
                     (not_found_count, product))
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        if product['url'] in self.images:
            product['image_url'] = self.images[product['url']]
            yield product
        else:
            yield Request(product['url'],
                          callback=self.parse_image,
                          meta={'product': product},
                          dont_filter=True)
def extract_products(self, response):
    """Yield a product for every ``conprcbx`` box in the listing.

    Boxes whose pattern text contains "winter" are skipped.  A pattern
    that ``parse_pattern`` cannot handle is skipped too; unless it
    contains one of a few known names, the failure is also logged and
    appended to ``self.errors``.  Products rejected by
    ``is_product_correct`` are dropped.
    """
    hxs = HtmlXPathSelector(response)
    boxes = hxs.select(
        '//div[@class="listcontPART"]//div[@class="conprcbx"]')

    for el in boxes:
        brand = el.select(
            './div[@class="dec_tyrebnt"]/p/b/text()').extract().pop().strip()
        pattern_nodes = el.select(
            './div[@class="dec_tyrebnt"]/p/text()').extract()
        pattern = "".join(pattern_nodes).strip()

        # skip winter tyres
        if 'winter' in pattern.lower():
            continue

        xl, pattern = extract_reinforced(pattern)
        run_flat, pattern = extract_run_flat(pattern)

        res = parse_pattern(pattern)
        if not res:
            excludes = [
                'sport contact', 'advantage sport', 'expedia s02',
                'zero rosso'
            ]
            lowered_pattern = pattern.lower()
            if not any(x in lowered_pattern for x in excludes):
                # Only patterns outside the known list are reported.
                msg = 'Could not parse pattern: %s' % fix_spaces(
                    pattern).encode('utf-8')
                self.log('[CARTYRES] %s' % msg)
                self.errors.append(msg)
            continue
        width, ratio, rim, load_rating, speed_rating, name = res

        identifier = el.select(".//p/@onclick").re(
            "AddCarToShortList\('([^']*)',")
        url = self.start_urls[0]
        raw_price = el.select(
            './/div[@class="dec_fittdbnt"]/h1/text()').extract().pop()
        price = fix_spaces(raw_price)
        image_url = el.select(
            '../..//div[@class="uptyre_prt"]/img/@src').extract()[0]

        mark_titles = el.select(
            './/div[@class="bndLGO1"]/img/@title').extract()
        if mark_titles:
            man_mark = mark_titles[0]
            # Record every mark title seen on the site.
            if man_mark not in self.man_marks:
                self.man_marks.add(man_mark)
        else:
            man_mark = ''

        loader = ProductLoader(Product(), selector=hxs)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier.pop())
        loader.add_value('price', price)
        loader.add_value('url', url)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        metadata = MicheldeverMeta()
        metadata['width'] = width
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['load_rating'] = load_rating
        metadata['speed_rating'] = speed_rating
        metadata['fitting_method'] = 'Fitted'
        metadata['run_flat'] = run_flat
        metadata['xl'] = xl

        man_code = ''
        if man_mark and man_mark in man_mark_mapping:
            man_code = man_mark_mapping[man_mark]
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            # The scraped rating was replaced; keep it as the alternative.
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def parse_product_cache(self, identifier, price, out_of_stock, product):
    """Build a product from ``self.products_data`` with a fresh price.

    Name, identifier, sku, url, image_url and brand are copied from
    ``self.products_data[identifier]``; the metadata is taken from
    ``self.products_metadata`` when the identifier is present there.
    Stock is set to 0 when ``out_of_stock`` is true.

    Returns the loaded product, or ``None`` when ``is_product_correct``
    rejects it (``product['identifier']`` is then appended to
    ``self.incorrect_identifiers``).

    The example below passes ``out_of_stock`` explicitly; the earlier
    version of this doctest omitted it and no longer matched the
    signature.

    >>> spider = CamSkillSpider()
    >>> product = {\
        "brand": "Pirelli", \
        "category": 'R16" - 205/55/16, 205/55R16', \
        "identifier": "113764", \
        "image_url": "http://www.camskill.co.uk/smsimg/1943/113764--main--1943.jpg", \
        "metadata": {\
            "alternative_speed_rating": "", \
            "aspect_ratio": "55", \
            "fitting_method": "Delivered", \
            "full_tyre_size": "205/55/16/91/V", \
            "load_rating": "91", \
            "manufacturer_mark": "", \
            "mts_stock_code": "2055516VPIP7", \
            "rim": "16", \
            "run_flat": "No", \
            "speed_rating": "V", \
            "width": "205", \
            "xl": "No"\
        }, \
        "name": "Cinturato P7", \
        "price": "64.40", \
        "sku": None, \
        "stock": "0", \
        "url": "http://www.camskill.co.uk/m62b0s291p113764/Pirelli_Tyres_Car_Pirelli_P7_Cinturato_Pirelli_P_7_-_205_55_R16_91V_TL_Fuel_Eff_%3A_E_Wet_Grip%3A_A_NoiseClass%3A_2_Noise%3A_70dB"\
    }
    >>> spider.products_data['113764'] = product
    >>> product_ = spider.parse_product_cache("113764", 123, False, product)
    >>> product_['metadata']['mts_stock_code']
    '2055516VPIP7CINT'
    """
    loader = ProductLoader(item=Product(), selector=product)
    cached = self.products_data[identifier]
    for col in ['name', 'identifier', 'sku', 'url', 'image_url', 'brand']:
        loader.add_value(col, cached[col])
    loader.add_value('category',
                     find_brand_segment(loader.get_output_value('brand')))
    loader.add_value('price', price)
    if out_of_stock:
        loader.add_value('stock', 0)
    product_ = loader.load_item()

    if identifier in self.products_metadata:
        product_['metadata'] = self.products_metadata[identifier]

    if not is_product_correct(product_):
        self.incorrect_identifiers.append(product['identifier'])
        return

    product_['metadata']['mts_stock_code'] = find_mts_stock_code(
        product_, spider_name=self.name, log=self.log)

    new_speed_rating = get_speed_rating(product_)
    new_alt_speed = get_alt_speed(product_)
    cached_speed = product_['metadata']['speed_rating']
    if new_alt_speed:
        alternative = new_alt_speed
    elif cached_speed != new_speed_rating:
        # The stored rating was replaced; keep it as the alternative.
        alternative = cached_speed
    else:
        alternative = ''
    product_['metadata']['alternative_speed_rating'] = alternative
    product_['metadata']['speed_rating'] = new_speed_rating
    return product_
def parse_search(self, response):
    """Follow pagination links and yield the products of a search page.

    Every pagination link is requested with this method as callback.
    Rows whose season icon class contains "winter" are skipped, as are
    rows whose pattern text ``extract_data`` cannot parse (logged).

    When ``is_product_correct`` rejects a product and a row for the same
    identifier exists in ``self.old_meta_df``, the check is retried with
    that row's stored metadata; the product is dropped if it still fails
    or if the retry raises.
    """
    base_url = get_base_url(response)

    page_urls = response.xpath(
        '//div[@class="pagination tCenter"]//a/@href').extract()
    for page_url in page_urls:
        yield Request(urljoin(base_url, page_url),
                      callback=self.parse_search)

    rows = response.xpath(
        '//*[@class="table search-results vCenter"]/tbody//tr')
    for row_el in rows:
        season = row_el.xpath(
            './/i[contains(@class, "season")]/@class').extract()
        if season and 'winter' in season[0]:
            continue

        loader = ProductLoader(item=Product(), selector=row_el)

        brand = row_el.xpath(
            './td/a[@class="item-ref"]/span[1]/text()').extract()[0]
        name = row_el.xpath(
            './td/a[@class="item-ref"]/span[2]/text()').extract()[0]
        loader.add_value('name', name)

        pattern = row_el.xpath(
            './td/a[@class="item-ref"]/small/text()').extract()[0]
        data = extract_data(pattern)
        if not data:
            self.log("ERROR. Unable to parse pattern: %s" % pattern)
            continue
        width, aspect_ratio, rim, load_rating, speed_rating = data

        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        # Drop '.' and turn ',' into the decimal point before parsing.
        price_digits = ''.join(row_el.xpath(
            './/div[@class="hidden-xs"]/span[@class="prix"]/text()'
        ).re(r'[\d\.,]+'))
        price = price_digits.replace('.', '').replace(",", ".")
        loader.add_value('price', extract_price(price))

        identifier = row_el.xpath('@data-id').extract()[0]
        loader.add_value('identifier', identifier)

        url = row_el.xpath('./td[2]/a/@href').extract()[0]
        loader.add_value('url', urljoin(base_url, url))

        image_url = row_el.xpath('./td[@class="img"]//img/@src').extract()
        # The limit applies to the URL text itself; the previous code
        # measured the list of matches, which made the check a no-op.
        if image_url and len(image_url[0]) < 250:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        if self.old_meta_df is not None:
            old_meta = self.old_meta_df[
                self.old_meta_df['identifier'] == identifier]
        else:
            old_meta = None

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = load_rating

        specif = row_el.xpath('.//span[@class="specif"]/text()').extract()
        specif = [x.lower() for x in specif]
        specif_text = ' '.join(specif)
        metadata['xl'] = 'Yes' if 'xl' in specif else 'No'

        run_flat_found = is_run_flat('%s %s' % (name, specif_text))
        run_flat = (('runflat' in specif)
                    or ('run flat' in specif_text)
                    or run_flat_found)
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        # Standard marks take precedence over the custom ones.
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            if code.lower() in specif:
                man_code = man_mark
                break
        if man_code == '':
            for code, man_mark in self.custom_man_marks.iteritems():
                if code.lower() in specif:
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            product_correct = False
            if (old_meta is not None) and (not old_meta.empty):
                # Retry with the metadata stored for this identifier.
                product['metadata'] = dict(old_meta.iloc[0].metadata)
                try:
                    product_correct = is_product_correct(product)
                except Exception as e:
                    self.log('%r' % e)
                    continue
            if not product_correct:
                continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product
def parse_list(self, response):
    """Yield the products of a result list, then whatever comes next.

    The attribute named by ``response.meta['thread']`` is set to ``True``
    on the spider.  Identifiers are obtained from the page's
    ``__VIEWSTATE`` value via ``parse_identifiers`` and consumed one per
    product box, in order.  Boxes flagged ``WINTER`` and products
    rejected by ``is_product_correct`` are skipped.  After the page is
    processed, everything produced by ``self.next_search()`` is yielded.
    """
    setattr(self, response.meta.get('thread'), True)
    hxs = HtmlXPathSelector(response)

    vs_data = hxs.select(
        '//input[@name="__VIEWSTATE"]/@value').extract()[0]
    identifiers = parse_identifiers(vs_data)

    products = hxs.select(
        '//div[@class="main-list"]//div[@class="group conti-box"]')
    for product_el in products:
        # Consume the identifier before any skip so that identifiers and
        # boxes stay aligned.
        identifier = identifiers.pop(0)

        specif = product_el.select(
            './/span[@class="blue"]//div/text()').extract()
        # skip winter tyres
        if 'WINTER' in specif:
            continue

        loader = ProductLoader(item=Product(), selector=product_el)

        title = product_el.select(
            './/div[@class="conti-gray"]/text()').extract()[0]
        # The title is split on CRLF: size first, then rim, then rating,
        # with the product name as the last piece.
        title = title.strip().split('\r\n')
        name = title[-1].strip()
        size_parts = title[0].split("/")
        width = size_parts[0].strip()
        ratio = size_parts[1].replace("R", "").strip()
        rim = title[1].strip()
        rating = title[2].strip()

        results = re.search(r"((?:\d{1,3}/)*(?:\d{1,3}))([A-Z]{1,2}\d?)",
                            rating)
        if results:
            load_rating = results.group(1)
            speed_rating = results.group(2)
        else:
            load_rating = speed_rating = ''

        brand = product_el.select(
            './/div[@class="black-conti"]/text()').extract()[0].strip()
        brand = brand.title()
        if 'bfg' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        price = product_el.select(
            './/h4[@class="prc"]/text()').extract()[0]
        loader.add_value('price', extract_price(price))

        loader.add_value('identifier', identifier)
        loader.add_value('url', '')

        image_url = product_el.select(
            './/div[@class="sec-img"]/img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), image_url[0]))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['xl'] = 'Yes' if 'REINFORCED' in specif else 'No'
        metadata['run_flat'] = 'Yes' if 'RUN FLAT' in specif else 'No'

        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                man_code = man_mark
                break
        if not man_code:
            for code, man_mark in self.custom_man_marks.iteritems():
                if name.endswith(code):
                    # Cut at the trailing occurrence that endswith()
                    # matched; partition() would cut at the first one and
                    # lose part of a name that contains the code twice.
                    name = name.rpartition(code)[0]
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code

        loader.add_value('name', name)

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            # The scraped rating was replaced; keep it as the alternative.
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    for x in self.next_search():
        yield x
def parse(self, response):
    """Yield a product for every non-winter tyre in the grid results.

    Most values are read from ``data-*`` attributes of the result div.
    The tyre size text is first matched with a pattern that includes a
    load rating in parentheses; if that does not match, a shorter pattern
    without load rating is used and the load rating is left empty.
    Products rejected by ``is_product_correct`` are skipped.
    """
    results = response.xpath(
        '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
    )
    for result_el in results:
        season = result_el.xpath('@data-filter-season').extract()[0]
        if season == 'Winter':
            continue

        name = result_el.xpath(
            './/div[contains(@class, "tyre-model text-center")]/text()'
        ).extract()[0]
        brand = result_el.xpath('@data-filter-brand').extract()[0]

        loader = ProductLoader(item=Product(), selector=result_el)
        loader.add_value('name', brand + ' ' + name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        identifier = result_el.xpath('@data-tyreid').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)

        image_url = result_el.xpath(
            './/div[contains(@class, "tyre-image")]//img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), image_url[0]))

        price = result_el.xpath(
            './/div[contains(@class, "tyre-pricing-information")]/div/text()'
        ).re(r'[\d,.]+')
        loader.add_value('price', price[0] if price else '0.00')

        tyresize_text = result_el.xpath(
            './/div[contains(@class, "tyre-size")]/text()'
        ).extract()[0].strip()
        # Test the match explicitly instead of relying on a bare except
        # around ``None.groups()``.
        full_match = re.search(r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)',
                               tyresize_text, re.I)
        if full_match:
            width, aspect, speed_rating, rim, load_rating = \
                full_match.groups()
        else:
            width, aspect, speed_rating, rim = re.search(
                r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I).groups()
            load_rating = ''

        fitting_method = 'Fitted'

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''

        xl = result_el.xpath('@data-filter-reinforced').extract()[0] == 'Y'
        metadata['xl'] = 'Yes' if xl else 'No'

        run_flat_found = is_run_flat(loader.get_output_value('name'))
        run_flat = result_el.xpath(
            '@data-filter-runflat').extract()[0] == 'Y'
        metadata['run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

        manufacturer_mark = result_el.xpath(
            './/span[contains(@title, "Homologated for fitment to certai")]/@title'
        ).re(r'Homologated for fitment to certain (.*) cars\.')
        metadata['manufacturer_mark'] = find_man_mark(
            manufacturer_mark[0]) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        # NOTE(review): the three values are unpacked in the order the
        # union query returns them -- confirm that is fuel, grip, noise.
        fuel, grip, noise = result_el.xpath(
            '@data-filter-tyreefficiencyr'
            '|@data-filter-tyreefficiencyg'
            '|@data-filter-tyreefficiencyd').extract()
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product
def extract_products(self, response):
    """Yield a product for every ``conprcbx`` box found in the response.

    Boxes whose rating text contains "winter", whose pattern cannot be
    parsed by ``parse_pattern``, or whose product is rejected by
    ``is_product_correct`` are skipped without being reported.
    """
    boxes = response.xpath(
        '//div[@class="listcontPART"]//div[contains(@class, "conprcbx")]')

    for el in boxes:
        # The first of: logo caption, logo alt text, bold brand text.
        brand_candidates = map(
            unicode.strip,
            el.xpath('.//div[@class="imgBrandLogo"]/span/text()'
                     '|.//div[@class="imgBrandLogo"]/img/@alt'
                     '|.//b[@class="Brantrin"]/text()').extract())
        brand = brand_candidates[0]

        rating_nodes = el.xpath(
            './/div[@class="dec_tyrerates"]//text()').extract()
        pattern = ''.join(rating_nodes).strip()

        # skip winter tyres
        if 'winter' in pattern.lower():
            continue

        xl, pattern = extract_reinforced(pattern)
        run_flat, pattern = extract_run_flat(pattern)

        res = parse_pattern(pattern)
        if not res:
            # Unparseable patterns are dropped silently.  The original
            # distinguished a list of known names ('sport contact',
            # 'advantage sport', 'expedia s02', 'zero rosso') from other
            # patterns, but error reporting for the latter is disabled,
            # so both cases simply skip the box.
            continue
        width, ratio, rim, load_rating, speed_rating, name = res

        identifier = el.css('.hndSTCODE').xpath('text()').extract_first()
        url = self.start_urls[0]
        price_parts = el.xpath(
            './/div[@class="dec_fittdbnt"]//h2//text()').re(r'[\d\.,]+')
        price = ''.join(price_parts)
        image_url = el.xpath(
            './/div[@class="uptyre_prt"]/img[@class="trIMG"]/@src'
        ).extract()[0]

        mark_titles = el.xpath(
            './/div[@class="bndLGO1"]/img/@title').extract()
        if mark_titles:
            man_mark = mark_titles[0]
            # Record every mark title seen on the site.
            if man_mark not in self.man_marks:
                self.man_marks.add(man_mark)
        else:
            man_mark = ''

        loader = ProductLoader(Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_value('url', url)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        brand_segment = find_brand_segment(loader.get_output_value('brand'))
        loader.add_value('category', brand_segment)

        metadata = MicheldeverMeta()
        metadata['width'] = width
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['load_rating'] = load_rating
        metadata['speed_rating'] = speed_rating
        metadata['fitting_method'] = 'Fitted'
        metadata['run_flat'] = run_flat
        metadata['xl'] = xl

        man_code = ''
        if man_mark and man_mark in man_mark_mapping:
            man_code = man_mark_mapping[man_mark]
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))

        label_values = map(
            unicode.strip,
            el.xpath(
                './/div[@class="dec_labelbnt"]/div[@class="decsec1"]/p/b/text()'
            ).extract())
        fuel, grip, noise = label_values
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise.replace('dB', '')

        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        yield product
def find_mts_stock_code(self, product):
    """Return the MTS stock code found for ``product``.

    Forwards to the ``find_mts_stock_code`` helper, supplying this
    spider's ``name`` and ``log`` as keyword arguments.
    """
    lookup_options = {'spider_name': self.name, 'log': self.log}
    return find_mts_stock_code(product, **lookup_options)
def parse(self, response):
    """Yield a product item for every non-winter tyre on the page.

    Each ``tyre-list-tyre`` element is turned into a ``Product`` carrying
    a ``MicheldeverMeta``.  Listings are skipped when they show the
    "Winter Tyre" icon, when the tyre size text cannot be parsed, or when
    ``is_product_correct`` rejects the item.

    :param response: response holding the tyre listing.
    :returns: generator of product items.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath(
            'name', 'div//div[@class="manufacturerText"]/p/strong/text()')
        brand = ''.join(
            product.select(
                'div//div[@class="manufacturerImage"]/img/@alt').extract()
        ).split(' - ')[0]

        # skip winter tyres
        if product.select('div//img[@alt="Winter Tyre"]'):
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        identifier = product.select(
            'div//div[@class="pricingAddToOrder clearfix"]/input/@value'
        ).extract()[0]

        loader.add_value('url', '')

        image_url = product.select('div[@class="image"]/img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url', urljoin(get_base_url(response), image_url[0]))

        loader.add_value('identifier', identifier)
        price = product.select(
            'div//div[contains(@class, "pricingSelection")]//a/strong/text()'
        ).extract()
        price = re.findall(r"\d+.\d+", price[0]) if price else '0.0'
        loader.add_value('price', price)

        size_text = product.select(
            './/div[contains(@class, "manufacturerText")]/p/span/text()'
        ).extract()
        size_match = re.search(
            r'tyre size (\d+)\/(\d+)(\w{1})(\d+)',
            size_text[0].strip() if size_text else '', re.I)
        if not size_match:
            # re.search returns None on no match; skip this listing
            # instead of raising and losing the rest of the page.
            self.log('ERROR. Unable to parse tyre size: %r' % size_text)
            continue
        width, aspect, speed_rating, rim = size_match.groups()

        load_rating = product.select(
            'div//li/a[@rel="load-index-description"]/text()').extract()
        load_rating = load_rating[0].split(': ')[-1] if load_rating else ''

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''

        xl = product.select('div//img[@title="Reinforced"]/@title').extract()
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = product.select('div//img[@title="Run Flat"]').extract()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        manufacturer_mark = product.select(
            'div//img[contains(@title, "Homologated for fitment to certai")]'
            '/@title').extract()
        if manufacturer_mark:
            # Reduce the tooltip sentence to the part between the fixed
            # prefix and suffix before looking the mark up.
            manufacturer_mark = manufacturer_mark[0].replace(
                'Homologated for fitment to certain ', ''
            ).replace(' cars.', '')
        else:
            manufacturer_mark = ''
        metadata['manufacturer_mark'] = (
            find_man_mark(manufacturer_mark) if manufacturer_mark else '')

        metadata['full_tyre_size'] = '/'.join(
            (width, aspect, rim, load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # Prefer the computed alternative rating; otherwise keep the
        # scraped rating as the alternative when it is being replaced.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating
        yield product
def extract_products(self, hxs, url):
    """Generate product items from the tyre containers found in ``hxs``.

    Containers whose details text is empty or cannot be parsed by
    ``parse_pattern`` are logged, recorded in ``self.errors`` and skipped.
    Winter tyres, listings without a price and items rejected by
    ``is_product_correct`` are skipped without logging.

    :param hxs: selector for the listing page.
    :param url: page URL, used only in error messages.
    :returns: generator of product items.
    """
    containers = hxs.select(
        '//div[starts-with(@class,"tyre_container round")]')
    for el in containers:
        details = el.select('.//p[@class="tyre_details"]//text()').extract()
        tyre_options = fix_spaces("".join(details)).strip()
        if not tyre_options:
            msg = 'Could not extract tyre options from element from %s' % url
            self.log('ERROR: %s' % msg)
            self.errors.append(msg)
            continue

        parsed = parse_pattern(tyre_options)
        if not parsed:
            msg = "ERROR parsing: %s on %s" % (tyre_options, url)
            self.log(msg)
            self.errors.append(msg)
            continue
        width, ratio, rim, load_rating, speed_rating, name = parsed

        # skip winter tyres
        if el.select(".//div[@class='tyre_winter']"):
            continue

        name = name.strip()
        identifier = el.select("./@id").extract()[0]

        price_parts = el.select(".//p[@class='tyre_price']//text()").extract()
        price = "".join(price_parts).strip()
        if not price:
            continue

        brand = el.select(
            ".//span[@class='tyre_brand_text']/text()").extract()[0]
        image_url = urljoin_rfc(
            'http://asdatyres.co.uk', el.select("img/@src").extract()[0])

        has_run_flat = el.select(".//div[@class='tyre_rf']").extract()
        run_flat = 'Yes' if has_run_flat else 'No'
        if el.select(".//div[@class='tyre_xl']").extract():
            xl = 'Yes'
            # The XL marker is carried in metadata, not in the name.
            name = name.replace("XL", "").strip()
        else:
            xl = 'No'

        loader = ProductLoader(Product(), selector=hxs)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_value('url', 'http://www.asdatyres.co.uk/')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        metadata = MicheldeverMeta()
        for key, value in (('width', width),
                           ('aspect_ratio', ratio),
                           ('rim', rim),
                           ('load_rating', load_rating),
                           ('speed_rating', speed_rating),
                           ('fitting_method', 'Fitted'),
                           ('run_flat', run_flat),
                           ('xl', xl)):
            metadata[key] = value

        # First manufacturer code that occurs in the name wins.
        metadata['manufacturer_mark'] = next(
            (mark for code, mark in self.all_man_marks.iteritems()
             if code in name),
            '')
        metadata['full_tyre_size'] = '/'.join(
            (width, ratio, rim, load_rating, speed_rating))

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse(self, response):
    """Yield product items from a JSON list of tyres.

    The response body is decoded as JSON; width, aspect ratio and rim come
    from ``response.meta['row']`` while the remaining fields come from
    each JSON entry.  Entries whose ``winter`` flag is not ``'0'`` and
    items rejected by ``is_product_correct`` are skipped.

    :param response: response whose body is a JSON array of tyre dicts.
    :returns: generator of product items.
    """
    base_url = get_base_url(response)
    row = response.meta['row']
    products = json.loads(response.body_as_unicode())

    for product_el in products:
        # skip winter tyres
        if product_el['winter'] != '0':
            continue

        loader = ProductLoader(item=Product(), selector=product_el)

        brand = product_el['tyreMake'].title()
        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        load_rating = product_el['loadrating']
        speed_rating = product_el['tyreSpeed']
        loader.add_value('price', product_el['priceVat'])
        loader.add_value('identifier', product_el['id'])
        loader.add_value(
            'url',
            urljoin('http://www.etyres.co.uk/tyre-detail/',
                    product_el['URLString']))

        # Derive the image URL from this entry only, so an entry without
        # an image neither reuses a previous entry's image nor hits an
        # unassigned variable.
        image_name = product_el['tyreModelImage2']
        if image_name:
            loader.add_value(
                'image_url', urljoin(base_url, 'images/' + image_name))

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['xl'] = 'Yes' if product_el['tyreReinforced'] == 'T' else 'No'
        metadata['run_flat'] = 'Yes' if product_el['runflat'] == '1' else 'No'

        name = product_el['tyreModel']
        man_code = ''
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                man_code = man_mark
                break
        if not man_code:
            for code, man_mark in self.custom_man_marks.iteritems():
                if name.endswith(code):
                    # rpartition removes exactly the trailing code even
                    # when the same text also occurs earlier in the name.
                    name = name.rpartition(code)[0]
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code

        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             load_rating, speed_rating))

        name = name.replace(' EXTRA LOAD', '')
        name = name.replace(' RUNFLAT', '')
        loader.add_value('name', name.strip())

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        scraped_speed = product['metadata']['speed_rating']
        # Prefer the computed alternative rating; otherwise keep the
        # scraped rating as the alternative when it is being replaced.
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        product['metadata']['alternative_speed_rating'] = alternative
        product['metadata']['speed_rating'] = new_speed_rating

        yield product
def parse_search(self, response):
    """Follow pagination links and yield product items from the results.

    Every link in the pagination block is requested with this method as
    callback.  Each row of the ``searchRes`` table then becomes a product
    item, except winter tyres, rows whose size pattern ``extract_data``
    cannot parse (logged) and items rejected by ``is_product_correct``.

    :param response: search results response.
    :returns: generator of ``Request`` objects and product items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # pagination
    page_links = hxs.select(
        '//div[@class="pagination pagination-centered"]//a/@href').extract()
    for link in page_links:
        yield Request(urljoin(base_url, link), callback=self.parse_search)

    # parse products list
    for row in hxs.select('//*[@id="searchRes"]/tbody//tr'):
        season = row.select('.//td[4]/i/@class').extract()
        # skip winter tyres
        if season and 'ico-type ico-W' in season[0]:
            continue

        loader = ProductLoader(item=Product(), selector=row)
        brand, name = row.select('./td[2]/a/b/text()').extract()
        loader.add_value('name', name)

        pattern = row.select('./td[2]/a/small/text()').extract()[0]
        data = extract_data(pattern)
        if not data:
            self.log("ERROR. Unable to parse pattern: %s" % pattern)
            continue
        width, aspect_ratio, rim, load_rating, speed_rating = data

        if 'goodrich' in brand.lower():
            brand = 'BFG'
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))

        # The price is split between the span text and its <sup> child.
        price_main = row.select('.//span[@class="pr"]/text()').extract()[0]
        price_sup = row.select(
            './/span[@class="pr"]/sup/text()').extract()[0]
        price_sup = price_sup.replace(u'\xa3', '')
        loader.add_value('price', extract_price(price_main + price_sup))

        loader.add_value('identifier', row.select('@data-id').extract()[0])

        href = row.select('./td[2]/a/@href').extract()[0]
        loader.add_value('url', urljoin(base_url, href))

        image_url = row.select('./td[1]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        specif = [
            text.lower()
            for text in row.select('.//span[@class="specif"]/text()').extract()
        ]

        # Standard marks take precedence over the custom ones.
        man_code = next(
            (mark for code, mark in self.all_man_marks.iteritems()
             if code.lower() in specif),
            '')
        if man_code == '':
            man_code = next(
                (mark for code, mark in self.custom_man_marks.iteritems()
                 if code.lower() in specif),
                '')

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect_ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Delivered'
        metadata['load_rating'] = load_rating
        metadata['xl'] = 'Yes' if 'xl' in specif else 'No'
        metadata['run_flat'] = 'Yes' if 'runflat' in specif else 'No'
        metadata['manufacturer_mark'] = man_code
        metadata['full_tyre_size'] = '/'.join(
            (width, aspect_ratio, rim, load_rating, speed_rating))

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)

        new_speed_rating = get_speed_rating(item)
        new_alt_speed = get_alt_speed(item)
        scraped_speed = item['metadata']['speed_rating']
        if new_alt_speed:
            alternative = new_alt_speed
        elif scraped_speed != new_speed_rating:
            alternative = scraped_speed
        else:
            alternative = ''
        item['metadata']['alternative_speed_rating'] = alternative
        item['metadata']['speed_rating'] = new_speed_rating
        yield item
def parse_products_data(self, response):
    """Follow result pages and yield products (or product-page requests).

    Pagination links not yet present in ``self.urls_history`` are
    requested with the cookiejar, speed rating, search parameters and
    proxy settings copied from ``response.meta``.  Each listed tyre is
    then turned into a product item; size data comes from
    ``response.meta['search_params']``.  When the item's identifier is
    already in ``self.ip_codes`` the item is yielded directly, otherwise
    a request for the product page is yielded with ``parse_ipcode`` as
    callback.

    :param response: results page response.
    :returns: generator of ``Request`` objects and product items.
    """
    meta = {
        'cookiejar': response.meta['cookiejar'],
        'speed_rating': response.meta['speed_rating'],
        'search_params': response.meta['search_params'],
        'proxy_service_disabled': True,
        'proxy': response.meta.get('proxy', ''),
    }

    # Pages
    page_links = response.xpath(
        '//*[@id="InnerPH_InnerPH_pageList"]//a/@href').extract()
    for link in page_links:
        page_url = response.urljoin(link)
        if page_url in self.urls_history:
            continue
        self.urls_history.add(page_url)
        yield Request(page_url,
                      meta=meta,
                      callback=self.parse_products_data,
                      dont_filter=True)

    # Two page layouts are supported; try the list layout first.
    products = (response.xpath('//ul[@id="results_tbl"]/li')
                or response.xpath('//div[@class="product_item"]'))
    if not products:
        self.log('No products found => %r' % response.meta)

    brand_list = response.xpath(
        '//ul[@id="InnerPH_InnerPH_brand_list"]//a/text()').extract()
    if not brand_list:
        self.log('No brand list found => %r' % response.meta)
        return

    search_params = response.meta['search_params']
    speed = search_params['speed_rating'].upper()
    run_flat_markers = ('ZPS', 'RFT', 'RFLAT', 'RUNFLAT', 'RUN FLAT')

    for listing in products:
        desc = listing.xpath(
            './/div[@class="tyre_desc"]/text()').extract()[0]
        if 'snow' in desc or 'winter' in desc:
            continue

        name = listing.xpath(
            './/a[@class="tyre_name"]/text()').extract()[0]
        href = listing.xpath('.//a[@class="tyre_name"]/@href').extract()[0]
        p_id = listing.xpath('.//a[@class="tyre_name"]/@href').re(
            r'/t(\d+)/')[0]
        image_url = listing.xpath(
            './/*[contains(@class, "tyre_img")]//img/@src').extract()[0]

        # The brand is the first known brand name contained in the title.
        try:
            brand = filter(lambda b: b in name, brand_list)[0]
        except:
            self.log('Can\'t detect brand for: %s' % name)
            continue

        try:
            price = listing.xpath(
                './/*[@class="tyre_price_text"]/text()').extract()[0]
        except IndexError:
            self.log("Price not found: %s" % str(listing))
            continue

        loader = ProductLoader(item=Product(), selector=listing)
        loader.add_value('url', response.urljoin(href))
        loader.add_value('identifier', p_id)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        loader.add_value('price', price)

        # The product name is the upper-cased title without XL/run-flat
        # markers.
        pattern = name.strip().upper()
        for marker in ('XL', 'RFT', 'RFLAT', 'RUNFLAT'):
            pattern = pattern.replace(marker, '')
        loader.add_value('name', pattern.strip())

        m = MicheldeverMeta()
        m['aspect_ratio'] = search_params['aspect_ratio']
        m['rim'] = search_params['rim']
        m['width'] = search_params['width']
        m['speed_rating'] = speed

        # The load rating is whatever digits/slashes precede the speed
        # rating letter in the description.
        load_match = re.search(r'([\d/]+)%s' % speed, desc)
        if load_match:
            m['load_rating'] = load_match.groups()[0]
        else:
            self.log('ERROR: not load rating: %s' % href)
            m['load_rating'] = ''

        run_flat_found = is_run_flat(desc)
        desc_upper = desc.upper()
        has_marker = any(marker in desc_upper for marker in run_flat_markers)
        m['run_flat'] = 'Yes' if (has_marker or run_flat_found) else 'No'
        m['xl'] = 'Yes' if 'XL' in desc_upper else 'No'

        m['full_tyre_size'] = '/'.join(
            (m['width'], m['aspect_ratio'], m['rim'],
             m['load_rating'], m['speed_rating']))

        price_types = listing.xpath(
            './/*[@class="tyre_price_type"]/text()').extract()
        m['fitting_method'] = 'Fitted' if 'FITTED' in price_types else 'Delivered'
        m['manufacturer_mark'] = self._get_manufacturer_code(desc)

        fuel = listing.xpath('.//*[@class="fuel-img"]/@data-grade').extract()
        m['fuel'] = fuel[0] if fuel else ''
        grip = listing.xpath(
            './/*[@class="wetgrip-img"]/@data-grade').extract()
        m['grip'] = grip[0] if grip else ''
        noise = listing.xpath(
            './/*[@class="noise-img"]/@data-grade').extract()
        m['noise'] = noise[0] if noise else ''

        product = loader.load_item()
        product['metadata'] = m

        if not is_product_correct(product):
            self.log('Product is not correct => %s' % desc)
            continue

        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)

        if product['identifier'] not in self.ip_codes:
            # The IP code is not available on the product list, so it has
            # to be extracted from the product page.
            yield Request(product['url'],
                          meta={'product': product},
                          callback=self.parse_ipcode)
            continue

        ip_code = self.ip_codes[product['identifier']]
        product['sku'] = ip_code
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log, ip_code=ip_code)
        yield product
def parse(self, response):
    """Follow "Page" links and yield products (or product-page requests).

    Every ``results`` block becomes a product item.  Winter tyres, blocks
    whose text ``parse_pattern`` cannot parse and items rejected by
    ``is_product_correct`` are skipped.  When the item's identifier is
    already in ``self.ip_codes`` the item is yielded directly, otherwise
    a request for the product page is yielded with ``parse_ipcode`` as
    callback.

    :param response: listing page response.
    :returns: generator of ``Request`` objects and product items.
    """
    products = response.xpath('//div[@class="results"]')

    pages = response.xpath(
        '//p[contains(text(),"Page")]//a/@href').extract()
    for page in pages:
        yield Request(response.urljoin(page), meta=response.meta)

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)

        # The full text of the listing (``name``) is only used to extract
        # metadata (e.g. run flat, XL); the tyre pattern is what gets
        # stored as the product's name.
        left_text = product.select(
            './/div[@class="resultsLeft"]/div'
            '//text()[normalize-space()]').extract()
        size_text = product.select(
            './/div[@class="t_size"]//text()[normalize-space()]').extract()
        name = ' '.join(text.strip() for text in left_text)
        # Append the size text once; ``name += name + ...`` would repeat
        # the left-column text.
        name += ' %s' % ' '.join(text.strip() for text in size_text)

        loader.add_xpath(
            'name',
            './/div[@class="resultsLeft"]/div//a/i/b/text()[normalize-space()]')
        brand = product.select(
            './/div[@class="resultsLeft"]/div/b//text()[normalize-space()]'
        ).extract()[0].strip()

        # skip winter tyres
        if product.select(
                './/img[contains(@alt,"Winter / cold weather tyres")]'):
            continue
        if product.select(
                './/img[contains(@alt,"Wi") or '
                'contains(@src,"/simg/hiver.png")]'):
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        fitting_method = 'Fitted'

        url = product.select('.//a[i[b]]/@href')[0].extract()
        url = response.urljoin(url)
        # Drop the cart_id value from the product URL.
        url = re.sub('cart_id=[^&]*', '', url)
        loader.add_value('url', url)

        image_url = product.select(
            './/a/img[@align="left"]/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))

        identifier = urlparse.parse_qs(
            urlparse.urlparse(url).query)['typ'][0]
        loader.add_value('identifier', identifier)

        price = ''.join(
            product.select(
                './/div[@class="price"]/font/b//text()[normalize-space()]'
            ).extract())
        price = re.findall(r"\d+.\d+", price) if price else '0.0'
        loader.add_value('price', price)

        data = parse_pattern(name)
        if not data:
            continue

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        metadata['xl'] = 'Yes' if 'XL' in name else 'No'

        run_flat_found = is_run_flat(name)
        lowered = name.lower()
        run_flat = ('run flat' in lowered or 'runflat' in lowered
                    or run_flat_found)
        metadata['run_flat'] = 'Yes' if run_flat else 'No'

        manufacturer_mark = product.select(
            './/div[@class="t_size"]/b/a[contains(@onmouseover,"Original") or '
            'contains(@onmouseover,"BMW") or contains(@onmouseover,"Porsche")]'
            '/@name[normalize-space()]').extract()
        manufacturer_mark = (
            manufacturer_mark[0].strip() if manufacturer_mark else '')
        metadata['manufacturer_mark'] = (
            find_man_mark(manufacturer_mark) if manufacturer_mark else '')

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        try:
            fuel, grip, noise = [
                text.strip() for text in product.select(
                    './/div[@class="tyre_label_short"]//text()').extract()
            ]
        except ValueError:
            # The label block did not hold exactly three values.
            fuel = grip = noise = ''
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise.replace('dB', '').strip()

        product = loader.load_item()
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue

        if product['identifier'] in self.ip_codes:
            ip_code = self.ip_codes[product['identifier']]
            product['sku'] = ip_code
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log,
                ip_code=ip_code)
            yield product
        else:
            # The IP code is not available on the product list, so it has
            # to be extracted from the product page.
            yield Request(product['url'],
                          meta={'product': product},
                          callback=self.parse_ipcode)