def test_path_to_file_uri(self):
    """Check that ``path_to_file_uri`` turns local paths into file URIs.

    An absolute, platform-specific path is checked against its exact
    expected URI. A relative path is then checked to yield a
    ``file:///`` URI that converts back (case-insensitively) to the
    absolute form of that path.
    """
    if os.name == 'nt':
        # Both backslashes are escaped explicitly; the runtime value is
        # the same Windows path as before.
        self.assertEqual(path_to_file_uri("C:\\windows\\clock.avi"),
                         "file:///C:/windows/clock.avi")
    else:
        self.assertEqual(path_to_file_uri("/some/path.txt"),
                         "file:///some/path.txt")

    fn = "test.txt"
    x = path_to_file_uri(fn)
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(x.startswith('file:///'))
    self.assertEqual(file_uri_to_path(x).lower(),
                     os.path.abspath(fn).lower())
def parse(self, response):
    """Yield one product-page request for every product link on a listing.

    The "specials" page and ordinary sub-department pages use different
    link markup, so the XPath and the category label depend on the path
    of the response URL. Each request is handled by
    ``self.parse_products`` and carries the category in ``meta``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    url_path = file_uri_to_path(response.url)

    if url_path == '/documents/specials/':
        links = hxs.select('//a[@class="specialMore"]/@href').extract()
        category = u'Specials'
    else:
        links = hxs.select(
            '//ul[@class="subdepartment_list"]/li/a/@href').extract()
        # The category is the third '/'-separated piece of the path,
        # with '+' turned back into spaces.
        category = url_path.split('/')[2].replace('+', ' ')

    for link in links:
        yield Request(
            urljoin_rfc(base_url, link),
            callback=self.parse_products,
            meta={'category': category},
        )
def test_file_uri_to_path(self):
    """Check that ``file_uri_to_path`` turns file URIs into local paths.

    On each platform the test checks an exact URI-to-path conversion and
    a round trip through ``path_to_file_uri``. On non-Windows platforms
    it also checks that a plain absolute path is returned unchanged.
    Finally, a bare relative file name must be returned unchanged on
    every platform.
    """
    if os.name == 'nt':
        # Both backslashes are escaped explicitly; the runtime value is
        # the same Windows path as before.
        self.assertEqual(file_uri_to_path("file:///C:/windows/clock.avi"),
                         "C:\\windows\\clock.avi")
        uri = "file:///C:/windows/clock.avi"
        uri2 = path_to_file_uri(file_uri_to_path(uri))
        self.assertEqual(uri, uri2)
    else:
        self.assertEqual(file_uri_to_path("file:///path/to/test.txt"),
                         "/path/to/test.txt")
        self.assertEqual(file_uri_to_path("/path/to/test.txt"),
                         "/path/to/test.txt")
        uri = "file:///path/to/test.txt"
        uri2 = path_to_file_uri(file_uri_to_path(uri))
        self.assertEqual(uri, uri2)

    self.assertEqual(file_uri_to_path("test.txt"), "test.txt")
def _ocr_price(self, image_url, page_url):
    """Return the price read from a price image by the remote OCR service.

    The service is queried in three OCR modes ('6', '7', '8') per
    attempt, and attempts repeat until at least one mode returns a
    non-empty price or ``self.max_ocr_retries`` is reached. The
    candidates are reduced to one value by ``self._select_price``. If
    that fails, the default ``'0.00'`` is kept and the failure is logged.

    :param image_url: URL of the image that shows the price.
    :param page_url: URL of the product page, used only in log messages.
    :returns: the price as a UTF-8 encoded string with spaces removed
        and ``,`` replaced by ``.``.
    """
    params = {
        'url': image_url,
        'resize': '200',
        'mode': '',
        'blur': '1',
        'format': 'float',
    }

    # doing OCR decoding in 3 different modes to increase accuracy
    prices = []
    price = '0.00'
    attempt = 0
    while not prices and attempt < self.max_ocr_retries:
        attempt += 1
        self.log('OCR: {}, attempt {}'.format(image_url, attempt))
        for mode in ('6', '7', '8'):
            params_copy = params.copy()
            params_copy['mode'] = mode
            params_copy_encoded = urllib.urlencode(params_copy)
            ocr_service_url = (
                "http://148.251.79.44/ocr/get_price_from_image?%s"
                % params_copy_encoded)
            self.log('>>> GET PRICE => %s' % ocr_service_url)
            f = urllib.urlopen(ocr_service_url)
            try:
                jdata = json.loads(f.read())
            finally:
                # Always release the connection, even on a bad payload.
                f.close()
            self.log(str(jdata))
            if jdata['price']:
                prices.append(jdata['price'])

    try:
        price = self._select_price(prices)
    except Exception as exc:
        # Best effort: keep the '0.00' default, but leave a trace
        # instead of swallowing the error silently.
        self.log('Price error, possibly OCR error on %s: %s'
                 % (page_url, exc))

    price = price.encode('utf-8')
    price = price.replace(" ", "").replace(",", ".")
    log.msg(str(price), log.DEBUG)
    return price


def parse_product(self, response):
    """Parse a product page and yield follow-up requests and the product.

    Steps, in order:

    1. For every configurable option on the page, yield a request for
       the same page with ``?conf=<option>`` appended, handled by this
       same callback.
    2. Resolve the product name from the first of several candidate
       page locations that yields non-empty text, then read the SKU.
    3. If the URL contains ``outilmania.fr``, yield a request to the
       add-to-cart form action, handled by
       ``self.parse_outilmania_price``, and stop.
    4. Otherwise determine the price. Prefer the ``productPrice`` value
       embedded in a script, multiplied by 1.2 (presumably to add 20%
       VAT — confirm). Fall back to OCR of the price image, and give up
       when neither source is available.
    5. Yield the loaded product item when both SKU and price are truthy.

    :param response: the product page response.
    """
    hxs = HtmlXPathSelector(response)

    options = response.xpath(
        '//*[contains(@class, "config-product-option")]'
        '//option[contains(@class, "single-option")]/@value').extract()
    for option in options:
        param = '?conf=%(option)s' % ({'option': option})
        yield Request(
            urljoin_rfc(response.url,
                        file_uri_to_path(response.url) + param),
            callback=self.parse_product)

    # NOTE(review): ``stock`` is computed but never used; the item below
    # is always loaded with stock=1. Confirm whether that is intended.
    stock = hxs.select(
        '//div[@id="product_type_data"]/div[@class="availability"]'
        '/span[contains(@class, "spr-icon-dispo-big-10") or '
        'contains(@class, "spr-icon-dispo-big-50")]')
    categories = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()')[1:].extract()

    # Candidate name locations, tried in order; the first non-empty wins.
    name_xpaths = (
        '//*[@class="dynamic-name"]/text()',
        '//li[@class="home"]//strong/text()',
        '//div[@id="product_description"]'
        '//div[@class="etiquette-title"]/span/span/text()',
        '//li[@class="home"]//*[@class="store-main-color"]/text()',
        '//div[@class="product-name"]//h1/text()',
    )
    name = ''
    for name_xpath in name_xpaths:
        name = join(hxs.select(name_xpath).extract())
        if name:
            break

    sku = join(hxs.select(
        '//div[@class="sku-product"]/span[@class="sku"]/text()')
        .extract()).strip()

    if 'outilmania.fr' in response.url:
        # Here the price comes from a separate request to the
        # add-to-cart form action.
        faction = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        image_url = hxs.select(
            '//div[@class="product-image-gallery"]'
            '/img[@id="image-main"]/@src').extract()
        yield Request(faction,
                      meta={'name': name,
                            'url': response.url,
                            'sku': sku,
                            'identifier': sku,
                            'image_url': image_url,
                            'categories': categories},
                      callback=self.parse_outilmania_price,
                      dont_filter=True)
        return

    price_url = hxs.select(
        '//div[@class="price-box"]//img/@src').extract()
    if len(price_url) > 1:
        # Several price images: keep only special/normal price ones.
        price_url = hxs.select(
            '//div[@class="price-box"]'
            '//*[contains(@class, "special-price") '
            'or contains(@class, "normal-price")]'
            '//img/@src').extract()

    price_no_vat = response.xpath('//script/text()').re(
        '"productPrice":"(.+?)"')
    if price_no_vat:
        price = (Decimal(price_no_vat[0])
                 * Decimal('1.2')).quantize(Decimal('0.01'))
    elif price_url:
        price = self._ocr_price(price_url[0], response.url)
    else:
        # No price source available on this page.
        return

    if sku and price:
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_xpath(
            'image_url',
            '//div[@class="product-image-gallery"]'
            '/img[@id="image-main"]/@src')
        loader.add_value('stock', 1)
        for category in categories:
            loader.add_value('category', category)
        yield loader.load_item()
def __init__(self, uri):
    """Store the path obtained from *uri* on the instance as ``path``.

    :param uri: URI passed through ``file_uri_to_path`` to get the path.
    """
    local_path = file_uri_to_path(uri)
    self.path = local_path
def download_request(self, request, spider):
    """Serve a request by reading the file its URL points to.

    The request URL is converted to a path, the file is read in binary
    mode, and the bytes are wrapped in a response whose class is picked
    by ``responsetypes.from_args`` from the file name and body.

    :param request: request whose ``url`` identifies the file to read.
    :param spider: unused here.
    :returns: a response with the request's URL and the file contents
        as its body.
    """
    filepath = file_uri_to_path(request.url)
    # Close the handle deterministically rather than relying on
    # garbage collection.
    with open(filepath, 'rb') as source:
        body = source.read()
    respcls = responsetypes.from_args(filename=filepath, body=body)
    return respcls(url=request.url, body=body)