def test_path_to_file_uri(self):
        """Check that path_to_file_uri() turns filesystem paths into file URIs.

        An absolute, platform-specific path is checked against its exact
        expected URI. A relative file name is then checked to produce a
        ``file:///`` URI that file_uri_to_path() maps back to the absolute
        path of that file (compared case-insensitively).
        """
        if os.name == 'nt':
            # Every backslash is escaped explicitly; a lone backslash before
            # "c" is an invalid escape sequence that newer Pythons warn about.
            self.assertEqual(path_to_file_uri("C:\\windows\\clock.avi"),
                             "file:///C:/windows/clock.avi")
        else:
            self.assertEqual(path_to_file_uri("/some/path.txt"),
                             "file:///some/path.txt")

        fn = "test.txt"
        x = path_to_file_uri(fn)
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(x.startswith('file:///'))
        self.assertEqual(file_uri_to_path(x).lower(), os.path.abspath(fn).lower())
Exemple #2
0
    def parse(self, response):
        """Yield one product-page request per product link on a listing page.

        When the URL path is exactly ``/documents/specials/`` the links come
        from the ``specialMore`` anchors and the category is ``Specials``.
        Otherwise they come from the ``subdepartment_list`` entries and the
        category is the third ``/``-separated piece of the URL path, with
        ``+`` replaced by spaces. Each request is handled by
        ``parse_products`` and carries the category in its ``meta``.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        url_path = file_uri_to_path(response.url)
        if url_path == '/documents/specials/':
            product_links = list(
                hxs.select('//a[@class="specialMore"]/@href').extract())
            category = u'Specials'
        else:
            product_links = list(
                hxs.select('//ul[@class="subdepartment_list"]/li/a/@href')
                .extract())
            path_pieces = url_path.split('/')
            category = path_pieces[2].replace('+', ' ')

        request_meta_category = category
        for href in product_links:
            target = urljoin_rfc(base_url, href)
            yield Request(target,
                          callback=self.parse_products,
                          meta={'category': request_meta_category})
    def test_file_uri_to_path(self):
        """Check that file_uri_to_path() turns file URIs into filesystem paths.

        For the current platform this verifies the exact path produced for an
        absolute ``file:///`` URI and that converting the result back with
        path_to_file_uri() reproduces the original URI. On non-Windows
        platforms it also verifies that a plain absolute path is returned
        unchanged. Finally, a bare relative name must come back unchanged on
        every platform.
        """
        if os.name == 'nt':
            # Every backslash is escaped explicitly; a lone backslash before
            # "c" is an invalid escape sequence that newer Pythons warn about.
            self.assertEqual(file_uri_to_path("file:///C:/windows/clock.avi"),
                             "C:\\windows\\clock.avi")
            uri = "file:///C:/windows/clock.avi"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)
        else:
            self.assertEqual(file_uri_to_path("file:///path/to/test.txt"),
                             "/path/to/test.txt")
            self.assertEqual(file_uri_to_path("/path/to/test.txt"),
                             "/path/to/test.txt")
            uri = "file:///path/to/test.txt"
            uri2 = path_to_file_uri(file_uri_to_path(uri))
            self.assertEqual(uri, uri2)

        self.assertEqual(file_uri_to_path("test.txt"),
                         "test.txt")
Exemple #4
0
    def parse_product(self, response):
        """Parse a product page, yielding follow-up requests and/or one item.

        Yields, in this order:

        * one ``Request`` per configurable option value found on the page,
          targeting the same URL path with ``?conf=<value>`` appended and
          using this method as the callback;
        * if the URL contains ``outilmania.fr``: a single ``Request`` to the
          add-to-cart form action, with the scraped fields in ``meta`` and
          ``parse_outilmania_price`` as the callback -- nothing else is
          produced for such pages;
        * otherwise: one loaded ``Product`` item, but only when both a SKU
          and a price were obtained.

        For non-outilmania pages the price comes from the ``productPrice``
        value embedded in a script tag (multiplied by 1.2 and rounded to two
        decimals) when present. Failing that, the price image URL is sent to
        an OCR web service. If neither source is available the method stops
        without yielding an item.
        """
        hxs = HtmlXPathSelector(response)

        options = response.xpath('//*[contains(@class, "config-product-option")]'
                                 '//option[contains(@class, "single-option")]/@value').extract()

        # Re-request the page once per option value; each variant comes back
        # through this same callback.
        for option in options:
            param = '?conf=%(option)s' % ({'option': option})
            yield Request(urljoin_rfc(response.url,
                                      file_uri_to_path(response.url) + param),
                          callback=self.parse_product)

        dinamic_name = join(hxs.select('//*[@class="dynamic-name"]/text()')
                            .extract())
        # NOTE(review): `stock` is selected but never read below -- the item
        # is always loaded with stock=1. Confirm whether this was meant to
        # drive the stock value.
        stock = hxs.select('//div[@id="product_type_data"]/div[@class="availability"]/span[contains(@class, "spr-icon-dispo-big-10") or contains(@class, "spr-icon-dispo-big-50")]')
        categories = hxs.select('//div[@class="breadcrumbs"]//a/text()')[1:].extract()

        # Try progressively more generic places for the product name and keep
        # the first one that yields something.
        if dinamic_name:
            name = dinamic_name
        else:
            name = join(hxs.select('//li[@class="home"]//strong/text()')
                        .extract())

        if not name:
            name = join(hxs.select('//div[@id="product_description"]'
                                   '//div[@class="etiquette-title"]/span'
                                   '/span/text()').extract())
        if not name:
            name = join(hxs.select('//li[@class="home"]'
                                   '//*[@class="store-main-color"]/text()')
                        .extract())

        if not name:
            name = join(hxs.select('//div[@class="product-name"]//h1/text()').extract())

        sku = join(hxs.select('//div[@class="sku-product"]/span[@class="sku"]/text()')
                   .extract()).strip()

        if 'outilmania.fr' in response.url:
            # No price is read from this page for outilmania URLs; the
            # scraped fields are handed over to parse_outilmania_price.
            faction = hxs.select('//form[@id="product_addtocart_form"]/@action').extract()[0]
            image_url = hxs.select('//div[@class="product-image-gallery"]/img[@id="image-main"]/@src').extract()
            yield Request(faction,
                          meta={'name': name,
                                'url': response.url,
                                'sku': sku,
                                'identifier': sku,
                                'image_url': image_url,
                                'categories': categories},
                          callback=self.parse_outilmania_price,
                          dont_filter=True)
            return

        price_url = hxs.select('//div[@class="price-box"]'
                               '//img/@src').extract()

        if len(price_url) > 1:
            # Several price images: narrow the selection down to the ones in
            # the special/normal price containers.
            price_url = hxs.select('//div[@class="price-box"]'
                                   '//*[contains(@class, "special-price") '
                                   'or contains(@class, "normal-price")]'
                                   '//img/@src').extract()

        price_no_vat = response.xpath('//script/text()').re('"productPrice":"(.+?)"')
        if price_no_vat:
            # presumably 1.2 applies a 20% VAT, going by the variable name
            # -- TODO confirm
            price = (Decimal(price_no_vat[0]) * Decimal('1.2')).quantize(Decimal('0.01'))
        elif price_url:
            params = {
                'url': price_url[0],
                'resize': '200',
                'mode': '',
                'blur': '1',
                'format': 'float'}

            # doing OCR decoding in 3 different modes to increase accuracy
            prices = []
            price = '0.00'
            attempt = 0
            while not prices and attempt < self.max_ocr_retries:
                attempt += 1
                self.log('OCR: {}, attempt {}'.format(price_url[0], attempt))
                for mode in ('6', '7', '8'):
                    params_copy = params.copy()
                    params_copy['mode'] = mode
                    params_copy_encoded = urllib.urlencode(params_copy)
                    ocr_service_url = "http://148.251.79.44/ocr/get_price_from_image?%s" % params_copy_encoded
                    self.log('>>> GET PRICE => %s' % ocr_service_url)
                    f = urllib.urlopen(ocr_service_url)
                    try:
                        jdata = json.loads(f.read())
                    finally:
                        # Always release the HTTP handle, even when the
                        # payload is not valid JSON.
                        f.close()
                    self.log(str(jdata))
                    if len(jdata['price']) > 0:
                        prices.append(jdata['price'])
            try:
                price = self._select_price(prices)
            except Exception as exc:
                # Best effort: keep the '0.00' placeholder, but leave a trace
                # instead of silently swallowing the failure.
                self.log('Price error, possibly OCR error on %s: %r'
                         % (response.url, exc))

            price = price.encode('utf-8')
            price = price.replace(" ", "").replace(",", ".")
            log.msg(str(price), log.DEBUG)
        else:
            # Neither an embedded price nor a price image: nothing to load.
            return

        if sku and price:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            loader.add_xpath('image_url', '//div[@class="product-image-gallery"]/img[@id="image-main"]/@src')
            loader.add_value('stock', 1)
            for category in categories:
                loader.add_value('category', category)
            yield loader.load_item()
Exemple #5
0
 def __init__(self, uri):
     """Store the result of ``file_uri_to_path(uri)`` as ``self.path``."""
     resolved_path = file_uri_to_path(uri)
     self.path = resolved_path
Exemple #6
0
 def download_request(self, request, spider):
     """Build a response from the local file that ``request.url`` refers to.

     The URL is converted to a path with ``file_uri_to_path``, the file is
     read in binary mode, and the response class is picked by
     ``responsetypes.from_args`` from the file name and body. ``spider`` is
     accepted but not used here.

     Returns an instance of the selected response class carrying the
     request URL and the file contents as its body. Errors raised while
     opening or reading the file propagate to the caller.
     """
     filepath = file_uri_to_path(request.url)
     # Context manager so the handle is closed promptly, even if read()
     # fails, instead of relying on garbage collection.
     with open(filepath, 'rb') as fo:
         body = fo.read()
     respcls = responsetypes.from_args(filename=filepath, body=body)
     return respcls(url=request.url, body=body)