Python extract_price Exemples, product_spiders.utils.extract_price Python Exemples

Exemple #1

0

Afficher le fichier

    def parse(self, response):
        """Turn every row of the CSV response body into a product item.

        The ``SalePrice`` column is preferred; ``Price`` is consulted only
        when the sale price parses to a falsy value.  The product name is the
        paper/lamination/print/size/folding columns joined with ``' - '``,
        and a copy of the raw CSV row is attached to each item as
        ``metadata``.
        """
        name_columns = ('PaperType', 'LaminationType', 'PrintType',
                        'PaperSize', 'FoldingType')

        for record in csv.DictReader(StringIO(response.body)):
            price = (extract_price(record['SalePrice'])
                     or extract_price(record['Price']))
            name = ' - '.join([record[column] for column in name_columns])
            product_id = record['ProductID']

            # Rows whose ProdRange is the literal string 'NULL' are
            # categorised by product type alone.
            if record['ProdRange'] == 'NULL':
                category = record['ProductType']
            else:
                category = record['ProductType'] + ' - ' + record['ProdRange']

            loader = ProductLoader(item=Product(), response=response)
            # ProductID doubles as both the identifier and the SKU.
            for field, value in (('name', name),
                                 ('identifier', product_id),
                                 ('sku', product_id),
                                 ('price', price),
                                 ('category', category)):
                loader.add_value(field, value)

            item = loader.load_item()
            item['metadata'] = record.copy()
            yield item

Exemple #2

0

Afficher le fichier

Fichier : mytheresa_spdier.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Build a single product item from a product detail page.

        The special (discounted) price is preferred, then the first
        ``span.price`` text on the page; the price is 0 when neither is
        present.  The category is taken from ``response.meta['category']``
        (a missing key raises ``KeyError``).  Stock is set to 0 only when the
        sold-out cart button is present.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(response=response, item=Product())
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_xpath('sku', '//h3[@class="sku-number"]/text()')
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

        price_texts = hxs.select(
            '//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price_texts:
            price_texts = hxs.select('//span[@class="price"]/text()').extract()
        price = extract_price(price_texts[0]) if price_texts else 0

        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@id="main-image-image"]/@src')
        loader.add_xpath('brand', '//a[@itemprop="brand"]/text()')
        # The category comes from the request meta; the page breadcrumbs are
        # deliberately not consulted, so a page with a short or missing
        # breadcrumb trail cannot abort the item.
        loader.add_value('category', response.meta['category'])
        loader.add_value('shipping_cost', 25)
        if hxs.select('//button[@class="btn-cart soldout"]'):
            loader.add_value('stock', 0)
        yield loader.load_item()

Exemple #3

0

Afficher le fichier

Fichier : bestbuy.py Projet : oceancloud82/scraping

    def parse_dealers(self, response):
        """Fill in price, stock and shipping from the 'BEST BUY' seller entry.

        The item to complete is read from ``response.meta['item']``.  Only
        the first dealer whose name is 'BEST BUY' (case-insensitive) is used.
        When no dealers are found and ``response.meta['one_seller']`` is
        truthy, an error is logged and nothing is yielded.  Otherwise the item
        is yielded (and its identifier recorded in ``self.new_ids``) whenever
        it has a truthy identifier.
        """
        item = response.meta['item']

        try:
            hxs = HtmlXPathSelector(response)
            dealers = hxs.select('//div[@class="product-list" and @data-condition="new"]')
        except Exception:
            # Best effort: a response that cannot be parsed is treated as
            # having no dealers.
            dealers = []

        if not dealers and response.meta['one_seller']:
            log.msg('ERROR >>> ONE SELLER: ' + item['url'])
            return

        for dealer in dealers:
            dealer_name = ''.join(dealer.select('.//div[@class="seller-name"]/span/text()').extract()).strip()
            if dealer_name.upper() != 'BEST BUY':
                continue

            log.msg('INFO >>> COLLECT BEST BUY ITEM: ' + item['url'])

            out_of_stock = dealer.select('.//div[@class="cart-button" and @data-button-state-id="SOLD_OUT_ONLINE"]')
            if out_of_stock:
                item['stock'] = 0

            price = dealer.select('.//div[@class="medium-item-price"]//text()').extract()
            if not price:
                log.msg('ADD TO CART PRICE >>> ' + item['url'])
                price = dealer.select('@data-price').extract()
            if price:
                item['price'] = extract_price(price[-1])
            else:
                # Neither source produced a price: keep whatever price the
                # item already carries instead of failing on price[-1].
                log.msg('ERROR >>> NO PRICE: ' + item['url'])

            shipping_cost = dealer.select('.//div[@class="shipping-cost-puck"]//text()').extract()
            if shipping_cost:
                item['shipping_cost'] = extract_price(shipping_cost[0])
            # Only the first matching dealer entry is considered.
            break

        if item['identifier']:
            self.new_ids.append(item['identifier'])
            yield item

Exemple #4

0

Afficher le fichier

    def parse_product(self, response):
        """Complete the product from ``response.meta['item']`` and yield it.

        Categories and the name come from the breadcrumbs; the page price is
        used only when the incoming product has a falsy price.  When the page
        has no option selector, the product itself is yielded once (with SKU
        and identifier refreshed from the page when available).  Otherwise one
        deep copy of the product is yielded per selectable option.
        """
        hxs = HtmlXPathSelector(response)

        # A separate loader is used just to pass the breadcrumb categories
        # through the loader before copying them onto the product.
        category_loader = ProductLoader(item=Product(), response=response)
        crumbs = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li[contains(@class,"category")]/a/text()'
        ).extract()
        for crumb in crumbs:
            category_loader.add_value('category', crumb)
        loaded = category_loader.load_item()

        product = response.meta['item']
        product['category'] = loaded['category']
        base_identifier = product['identifier']

        title = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li[@class="product"]/strong/text()'
        ).extract()
        if title:
            product['name'] = title[0]
        base_name = product['name']

        page_price = hxs.select('//div[@class="price"]/span/text()').extract()
        if not product['price'] and page_price:
            product['price'] = extract_price(page_price[0])

        options = hxs.select(
            '//select[@id="simple-selection"]/option[not(@value="null")]')

        if not options:
            sku_code = hxs.select(
                '//div[@id="product-options"]//input[@id="sku-code"]/@value'
            ).extract()
            if sku_code:
                product['sku'] = sku_code[0]

            action = hxs.select(
                '//form[@id="product_addtocart_form"]/@action').extract()
            if action and '/product/' in action[0]:
                # The identifier is the path segment right after '/product/'.
                after_marker = action[0].partition('/product/')[2]
                product['identifier'] = after_marker.partition('/')[0]

            yield product
            return

        for option in options:
            item = copy.deepcopy(product)

            label = option.select('text()').extract()
            if label:
                item['name'] = base_name + ' - ' + label[0]

            data_sku = option.select('@data-sku').extract()
            if data_sku:
                item['identifier'] = base_identifier + '-' + data_sku[0]
                item['sku'] = data_sku[0]

            # The option's value attribute, when present, overrides the
            # identifier derived from data-sku above.
            value = option.select('@value').extract()
            if value:
                item['identifier'] = value[0]

            simple_price = option.select('@data-simple-price').extract()
            if simple_price:
                item['price'] = round(extract_price(simple_price[0]), 2)

            yield item
Exemple #5

0

Afficher le fichier

Fichier : johncraddockltd.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield one product per SKU not already in ``self.ids_seen``.

        The scraped price is multiplied by 1.2 before being loaded (the
        original code labels this step "VAT").  A shipping cost of '5.00' is
        added when the scraped (pre-multiplication) price is below 50.00.
        Brand and category are read from ``response.meta``; the brand is used
        as the category when no category was passed.
        """
        hxs = HtmlXPathSelector(response)

        brand = response.meta.get('brand', '')
        category = response.meta.get('category', '')
        # The last matching input value serves as both SKU and identifier.
        sku = hxs.select('//input[@name="product_sku"]/@value').extract()[-1]

        name = hxs.select('//h1[@class="gf_1"]/text()').extract()
        net_price = extract_price(
            hxs.select('//span[@itemprop="price"]/text()').extract()[-1])
        gross_price = extract_price(str(float(net_price) * 1.2))
        images = hxs.select('//img[@id="main_image"]/@src').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        if images:
            loader.add_value('image_url', urljoin(response.url, images[-1]))
        if brand:
            loader.add_value('brand', brand)
        if category or brand:
            loader.add_value('category', category or brand)
        loader.add_value('name', name)
        loader.add_value('price', gross_price)
        if net_price < 50.00:
            loader.add_value('shipping_cost', '5.00')
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)

        # Each SKU is emitted at most once per spider instance.
        if sku in self.ids_seen:
            return
        self.ids_seen.add(sku)
        yield loader.load_item()

Exemple #6

0

Afficher le fichier

Fichier : soccerpro_com.py Projet : oceancloud82/scraping

 def parse_shipping(self, response):
     """Store the shipping fee and free-shipping threshold, then crawl.

     Both amounts are captured from the free-shipping banner text and saved
     on the spider as ``shipping_cost`` and ``free_shipping_over``.  One
     request is then yielded for each entry of ``self.start_urls``.
     """
     banner = response.xpath(u'//span[contains(text(),"FREE ECONOMY SHIPPING FOR ORDERS OVER $65.00")]/text()')
     captures = banner.re(u'\((.*?) for orders less than (.*)\)')
     # Exactly two captures are required; any other count raises ValueError
     # on unpacking.
     fee_text, threshold_text = captures
     self.shipping_cost = extract_price(fee_text)
     self.free_shipping_over = extract_price(threshold_text)
     for start_url in self.start_urls:
         yield scrapy.Request(start_url)

Exemple #7

0

Afficher le fichier

Fichier : btpmat_spider.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Build one product item from a product page.

        The price is parsed from the first text node under the offers block
        that contains 'HT', and the value of the ``weee`` span (0 when
        absent) is added to it.  The price defaults to 0 when no such node
        exists or no number can be found in it, so the item is still emitted
        instead of the method failing on an unbound ``price``.  The name is
        taken from ``response.meta['name']`` when provided, otherwise from
        the page heading.
        """
        hxs = HtmlXPathSelector(response)

        sku = hxs.select('//p[@class="alignright"]/text()').extract()[0].replace('[', '').replace(']', '')
        category = hxs.select('//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[-1]
        name = response.meta.get('name')

        price = 0
        price_box = hxs.select("//div[@itemprop='offers']//*[contains(text(),'HT')]/text()").extract()
        if price_box:
            # Drop all whitespace first so the amount is one contiguous run.
            compact = ''.join(price_box[0].split())
            amounts = re.findall("[^0-9]*([0-9 .,]+).*", compact)
            if amounts:
                price = extract_price(amounts[0].strip())

        tax = hxs.select('//span[@class="weee"]/small/text()').extract()
        tax = extract_price(tax[0]) if tax else 0

        if not name:
            name = hxs.select('//div[@class="product-name"]/h1[@itemprop="name"]/text()').extract()

        l = ProductLoader(item=Product(), response=response)
        l.add_xpath('identifier', '//form[@id="product_addtocart_form"]//input[@name="product"]/@value')
        l.add_value('name', name)
        l.add_value('category', category)
        l.add_xpath('brand', '//div[@class="product-manufacturer"]/a/@title')
        l.add_value('sku', sku)
        l.add_value('url', response.url)
        l.add_value('price', price + tax)
        l.add_value('stock', 1)
        l.add_xpath('image_url', '//p[@class="product-image"]/a/img/@src')
        yield l.load_item()

Exemple #8

0

Afficher le fichier

Fichier : musiciansfriend_spider.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield priced variants of the product in ``response.meta['product']``.

        When the page lists buying options, one copy of the base product is
        yielded per option with a positive price; otherwise the base product
        itself is yielded if its price is positive.  Afterwards one copy is
        yielded per used item with a positive price.  If an option has no
        visible price and its style-info JSON lacks a 'price' key, the whole
        method stops at that point.
        """
        hxs = HtmlXPathSelector(response)
        options = hxs.select('//*[@id="buyingOptions"]/dl/dd/ul/li')
        log.msg('PARSE PRODUCT')
        base_product = response.meta['product']
        used_items = hxs.select('.//li[contains(@id,"usedItem")]')

        if not options:
            if base_product['price'] > 0:
                yield base_product

        for option in options:
            log.msg('PARSE PRODUCT OPTIONS')
            variant = Product(base_product)
            name_pieces = (
                base_product['name'],
                ''.join(option.select('div/strong/a/text()').extract()),
            )
            full_name = ' '.join(name_pieces)
            var_text = option.select(".//var/text()").extract()[0]
            option_values = json.loads(var_text)
            sku = option_values['sku']
            variant['name'] = full_name
            variant['sku'] = sku
            variant['identifier'] = sku

            price = ''.join(
                option.select('div/p/span[@class="priceVal"]/text()').extract())
            if not price:
                # Fall back to the price embedded in the style-info JSON.
                try:
                    style_info = option.select(
                        'var[contains(@class, "styleInfo")]/text()'
                    ).extract().pop()
                    price = json.loads(style_info)['price']
                except KeyError:
                    return

            variant['price'] = extract_price(str(price))
            if variant['price'] > 0:
                yield variant

        for used_item in used_items:
            variant = Product(base_product)
            name_pieces = (
                base_product['name'],
                '(Used,',
                ''.join(used_item.select(
                    './/p[contains(@class,"usedCondition")]/text()').extract()),
                ')',
            )
            full_name = ' '.join(name_pieces).replace('\n', ' ')
            sku = used_item.select('.//fieldset/a/@href').re(
                'url_catalog_ref_id=(.*?)&')[0]
            variant['name'] = full_name
            variant['sku'] = sku
            variant['identifier'] = sku

            # The whole part and the decimal part of the price live in
            # separate nodes and are glued together with a dot.
            whole_part = ''.join(used_item.select(
                './/p[contains(@class,"usedPrice")]/text()'
            ).extract()).replace('\n', '')
            decimal_part = used_item.select(
                './/p[contains(@class,"usedPrice")]/sup[@class="decimalPrice"]/text()'
            )[0].extract()
            price = whole_part + '.' + decimal_part

            variant['price'] = extract_price(str(price))
            if variant['price'] > 0:
                yield variant

Exemple #9

0

Afficher le fichier

    def parse_product(self, response):
        """Yield the product on this page when it matches the searched SKU.

        Nothing is yielded when the page has no ``pid`` input, when no brand
        can be determined, or when the brand / product code scraped from the
        page do not match ``response.meta['brands']`` /
        ``response.meta['sku']``.  The brand/code check is skipped when
        ``response.meta`` already carries a truthy ``product_brand``.

        The shipping cost is set only when a weight is found: 4.99 above
        1000 (grams), 3.50 otherwise.
        """
        sku = response.meta['sku']
        brands = response.meta['brands']

        identifier = response.xpath('//input[@name="pid"]/@value').extract()
        if not identifier:
            return

        product_brand = response.meta.get('product_brand', None)
        if not product_brand:
            product_brand = re.findall('BRAND:</b>&nbsp; (.*)<br><b>W',
                                       response.body)
            if not product_brand:
                self.log('>>> ERROR: No brand found: ' + response.url)
                return
            product_brand = product_brand[0].strip()

            product_code = response.xpath(
                '//span[@itemprop="name"]/text()').re('(.*) : ')[0].strip()
            if (product_brand.upper() not in brands
                    or product_code.upper() != sku.upper()):
                return

        identifier = identifier[0]
        name = response.xpath(
            '//span[@itemprop="name"]/text()').extract()[0].strip()
        price = response.xpath(
            '//font[@class="selling_price"]/b/text()').extract()[0]

        loader = ProductLoader(Product(), response=response)
        loader.add_value('identifier', identifier)
        # The URL is added exactly once so the loader does not collect a
        # duplicate value for the field.
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('sku', sku)

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))

        loader.add_value('brand', product_brand)

        weight = response.xpath(
            '//p[b[contains(text(), "Weight or Volume")]]/span/text()'
        ).extract()
        if weight:
            weight = weight[0].upper()
            # Convert the weight to grams if it is given in KG.
            if 'KG' in weight:
                weight = extract_price(weight) * 1000
            else:
                weight = extract_price(weight)
            if weight > 1000:
                loader.add_value('shipping_cost', 4.99)
            else:
                loader.add_value('shipping_cost', 3.50)

        yield loader.load_item()

Exemple #10

0

Afficher le fichier

Fichier : mattressonline_spider.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield one product per purchase form and option combination.

        Each purchase form on the page becomes a base variant (name suffix,
        base sale price, product id as SKU and identifier).  Every non-
        delivery ``<select>`` in the form contributes a group of options as
        ``(price, name, value)`` tuples; the groups are combined with the
        module's ``multiply`` helper, and one product is yielded per
        combination with the option price added to the base price.
        """
        # Imported once per call rather than on every loop iteration.
        import logging

        hxs = HtmlXPathSelector(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1/text()')
        product_loader.add_value('category', response.meta.get('category'))

        img = hxs.select(u'//img[@id="gallery-image"]/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        product_loader.add_xpath(
            'brand', u'//div[@class="manufacturer-logo"]/a/img/@alt')

        product = product_loader.load_item()
        for opt in hxs.select(
                u'//div[contains(@class,"purchase-options")]/div/form'):
            prod = Product(product)
            prod['name'] = prod['name'] + ' ' + opt.select(
                u'.//span[@class="option"]/text()').extract()[0].strip()
            prod['price'] = extract_price(
                opt.select(
                    u'.//input[contains(@id, "-base-sale-price")]/@value'
                ).extract()[0])
            product_id = opt.select(
                u'.//input[@name="product-id"]/@value').extract()[0]
            prod['sku'] = product_id
            prod['identifier'] = product_id

            opt_groups = []
            for select in opt.select(
                    u'.//select/../../label[not(contains(text(),"Delivery"))]/../div/select'
            ):
                opts = []
                for o in select.select(
                        u'./option[not(contains(text(), "None"))]'):
                    option = ''.join(o.select('.//text()').extract())
                    option_id = o.select('./@value').extract()[0]
                    # Diagnostic trace only; not an error condition.
                    logging.debug(option)
                    try:
                        # Option text is expected to look like "name (price".
                        name, price = option.split('(')
                        price = extract_price(price)
                    except Exception:
                        # Text without exactly one '(' or with an unparsable
                        # price is treated as a free option.
                        name, price = option, 0
                    opts.append((price, name, option_id))
                opt_groups.append(opts)

            for opt_price, opt_name, opt_id in multiply(opt_groups):
                p = Product(prod)
                p['name'] = p['name'] + ' ' + opt_name
                p['price'] = p['price'] + Decimal(opt_price).quantize(
                    Decimal('1.00'))
                if opt_id:
                    p['identifier'] = p['identifier'] + ':' + opt_id
                else:
                    p['identifier'] = p['identifier'] + '-'
                yield p

Exemple #11

0

Afficher le fichier

    def parse_product(self, response):
        """Yield one product per row of the page's option table.

        Shared fields (url, name, brand, sku, category, image, promotion
        metadata) are loaded once; each non-gift table row then yields a copy
        with its own name suffix, identifier, price, stock flag and metadata.
        Options priced below 30 get a shipping cost of 1.95, which is also
        added to their price.  ``price_exc_vat`` is the (final) price divided
        by 1.2.
        """
        # Function-scope import: used to give every option its own metadata.
        import copy

        hxs = HtmlXPathSelector(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath(
            'name',
            '//h1/span[@class="name-name" or @class="name-range" or @class="name-size"]/text()'
        )
        product_loader.add_xpath('brand',
                                 u'//h1/a[@class="name-brand"]/text()')
        product_loader.add_xpath('sku', '//meta[@name="esc-sku"]/@content')
        product_loader.add_xpath(
            'category',
            '//div[@class="breadcrumbs"]/ul/li[position() > 1 and position() < last()-1]//a/text()'
        )
        img = hxs.select(
            u'//div[@class="product-image-wrapper"]//img/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        item = product_loader.load_item()
        metadata = FragranceDirectMeta()
        metadata['promotion'] = normalize_space(' '.join(
            hxs.select(
                '//div[@class="bubble-msg-container"]//text()').extract()))
        if item.get('price'):
            # Read the price from the loaded item; no other product object
            # exists at this point.
            metadata['price_exc_vat'] = Decimal(
                item['price']) / Decimal('1.2')
        item['metadata'] = metadata

        for opt in hxs.select(
                '//table[@id="super-product-table"]//tbody/tr[not(contains(@class,"gwp"))]'
        ):
            p = Product(item)
            # Each option needs its own metadata object; otherwise every
            # yielded product would point at the same one and the
            # price_exc_vat written below would leak between options.
            p['metadata'] = copy.deepcopy(metadata)

            name = normalize_space(''.join(
                opt.select('./td[2]/text()').extract()))
            self.log("NAE %s" % name)
            if name not in p['name']:
                p['name'] = normalize_space(p['name'] + ' ' + name)

            p['identifier'] = opt.select(
                'normalize-space(substring-after(.//div[@class="product-code"]/text(), "#"))'
            ).extract()[0]
            p['price'] = extract_price(''.join(
                opt.select('.//span[starts-with(@id,"product-price-")]//text()'
                           ).extract()))
            if p['price'] < 30:
                p['shipping_cost'] = extract_price('1.95')
                p['price'] = p['price'] + p['shipping_cost']

            if p['price']:
                p['metadata']['price_exc_vat'] = Decimal(
                    p['price']) / Decimal('1.2')

            stock_text = ''.join(
                opt.select('.//span[@class="stock-status-main"]/text()').
                extract())
            p['stock'] = 1 if 'in stock' in stock_text else 0
            yield p

Exemple #12

0

Afficher le fichier

Fichier : boots.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield the page's product, then one product per JS-declared variant.

        Variants are decoded from the first body line containing
        ``variants:``; when that line holds no ``...};`` payload the variants
        are skipped.  The SKU/identifier of the main product come from
        ``response.meta['row']['PRODUCT_NUMBER']``.  ``price_exc_vat`` in the
        metadata is the price divided by 1.2.
        """
        # Function-scope import: used to give every variant its own metadata.
        import copy

        hxs = HtmlXPathSelector(response)

        options = None
        js_line = next(
            (line for line in response.body.split('\n')
             if 'variants:' in line),
            '')
        if js_line:
            match = re.search(r'variants:(.*};)', js_line)
            if match:
                # Strip the trailing "};" before decoding the object literal.
                options = demjson.decode(match.group(1)[:-2].strip())

        product_loader = ProductLoader(item=Product(), selector=hxs)
        row = response.meta['row']
        sku = row['PRODUCT_NUMBER']
        product_loader.add_value('sku', sku)
        product_loader.add_value('identifier', sku)
        product_loader.add_value('url', response.url)
        name = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
        product_loader.add_value('name', name)
        # The first and last breadcrumb links are not used as categories.
        category = hxs.select('//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
        product_loader.add_value('category', category)
        img = hxs.select('//meta[@property="og:image"]/@content').extract()
        if img:
            product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), img.pop()))
        price = hxs.select('//p[@class="productOfferPrice"]/text()').extract()[0]
        price = extract_price(price)
        product_loader.add_value('price', price)
        brand = hxs.select('//*[@id="brandHeader"]/a/@href').extract()
        if brand:
            # The brand is the link path without '/en/' and its last character.
            brand = brand[0].replace('/en/', '')[:-1]
            product_loader.add_value('brand', brand)
        stock = ''.join(hxs.select('//div[@class="cvos-availbility-panel"]/p/text()').extract())
        if 'Item is currently out of stock online' in stock:
            product_loader.add_value('stock', 0)
        product = product_loader.load_item()

        metadata = BootsMeta()
        prom = ''.join(hxs.select('//div[@class="productSavings"]//text()').extract())
        metadata['promotion'] = prom + ' ' + ''.join(hxs.select('//div[@class="primaryItemDeal"]//p/text()').extract())
        if product['price']:
            metadata['price_exc_vat'] = Decimal(product['price']) / Decimal('1.2')
        product['metadata'] = metadata

        yield product

        if options:
            for k, val in options.items():
                option_name = k.replace('_', ' ')
                option_product = Product(product)
                # Give the variant its own metadata so writing its
                # price_exc_vat cannot alter the base product (already
                # yielded) or the other variants.
                option_product['metadata'] = copy.deepcopy(metadata)
                option_product['name'] = product['name'] + ' ' + option_name
                option_product['sku'] = val['productCode']
                option_product['identifier'] = val['variantId']
                option_product['price'] = extract_price(val['nowPrice'])
                if option_product.get('price'):
                    option_product['metadata']['price_exc_vat'] = Decimal(option_product['price']) / Decimal('1.2')

                yield option_product

Exemple #13

0

Afficher le fichier

Fichier : leaderstores.py Projet : oceancloud82/scraping

    def parse_node(self, response, node):
        """Build a product from one feed node and yield it or a follow-up request.

        Nodes whose id is not in ``self.id_code_map`` are ignored.  The name
        is the parent title (or title) followed by material, color and size
        when present.  When the description declares a pack size, the price
        is multiplied by it and a request to the product URL is yielded
        (handled by ``self.parse_pack_price`` with the item in ``meta``);
        otherwise the item itself is yielded.  A cost price from
        ``self.cost_prices`` is attached as metadata when it parses as a
        ``Decimal``.
        """
        identifier = node.select('./*[local-name()="id"]/text()')[0].extract()
        if identifier not in self.id_code_map:
            return
        product_code = self.id_code_map[identifier]
        loader = ProductLoader(item=Product(), selector=node)
        size = node.xpath('./*[local-name()="size"]/text()').extract()
        color = node.xpath('./*[local-name()="color"]/text()').extract()
        material = node.xpath('./*[local-name()="material"]/text()').extract()
        name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
        if not name:
            name = node.xpath('./title/text()').extract()
        name = name[0]
        if material:
            name += u' {}'.format(material[0])
        if color:
            name += u' {}'.format(color[0])
        if size:
            name += u' {}'.format(size[0])
        price = node.xpath('./*[local-name()="price"]/text()').extract_first()
        # Raw string: the pattern contains a regex escape (\d).
        pack_size = node.xpath('./description/text()').re(
            r'Pack Size m: *([\d.]+)')
        if pack_size:
            price = extract_price(price) * extract_price(pack_size[0])

        loader.add_value('name', name)
        loader.add_xpath('url', './link/text()')
        loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_xpath(
            'shipping_cost',
            './*[local-name()="shipping"]/*[local-name()="price"]/text()')
        loader.add_xpath('brand', './*[local-name()="brand"]/text()')
        loader.add_xpath('category',
                         './*[local-name()="google_product_category"]/text()')
        loader.add_xpath('sku', './*[local-name()="mpn"]/text()')
        stock = node.xpath('./*[local-name()="availability"]/text()').extract()
        if stock and stock[0] == 'out of stock':
            loader.add_value('stock', 0)

        item = loader.load_item()

        if product_code in self.cost_prices:
            try:
                cost_price = Decimal(self.cost_prices[product_code])
            except Exception:
                # Best effort: an unparsable cost price is logged and the
                # item is emitted without cost metadata.  Interpreter-level
                # exceptions (KeyboardInterrupt, SystemExit) are not caught.
                self.log('ERROR: unable to set cost price for item %r' % item)
            else:
                item['metadata'] = {'cost_price': str(cost_price)}

        if pack_size:
            yield Request(loader.get_output_value('url'),
                          self.parse_pack_price,
                          meta={'item': item})
        else:
            yield item

Exemple #14

0

Afficher le fichier

Fichier : chainreactioncycles_com.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield one product per priced row of the add-to-basket table.

        Page-level fields (url, name, category, brand, image) are loaded
        once.  Each table row is then inspected: rows whose first cell spans
        four columns, and rows with no price in columns 4, 6 or 3, are
        skipped.  The column that holds the price decides which cells supply
        the name suffix and the identifier/SKU.  Rows whose identifier is
        blank are not yielded.
        """
        hxs = HtmlXPathSelector(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1//text()')

        product_loader.add_xpath(
            'category',
            u'normalize-space(//a[@class="Link21" and position()=2]/text())')
        product_loader.add_xpath(
            'brand',
            u'//a[@id="ModelsDisplayStyle4_HlkSeeAllBrandProducts"]/@title')

        # Prefer the link target of the image, then the image source itself.
        img = hxs.select(u'//div[@id="DivModelImage"]/a/@href').extract()
        if not img:
            img = hxs.select(
                u'//div[@id="DivModelImage"]/a/img/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        product_loader.add_xpath('brand',
                                 u'//span[@itemprop="manufacturer"]/text()')
        product = product_loader.load_item()

        rows = hxs.select(
            u'//div[@id="TabContentAddToBasketTab"]//tr[@class="BackGround15"]')
        for row in rows:
            prod = Product(product)
            first_cell = row.select(
                u'normalize-space(./td[1]/text())').extract()[0]
            prod['identifier'] = first_cell
            prod['sku'] = first_cell

            if row.select(u'./td[position()=1 and @colspan="4"]'):
                continue

            prices = row.select(u'./td[4]//td[1]/text()').extract()
            if prices:
                # Price in column 4: the option label is in column 3.
                prod['name'] = prod['name'].strip() + ' ' + row.select(
                    u'normalize-space(./td[3]/a/text())').extract()[0]
                prod['price'] = extract_price(prices[0])
            else:
                prices = row.select(u'./td[6]//td[1]/text()').extract()
                if prices:
                    # Price in column 6: label in column 4, code in column 2.
                    prod['name'] = prod['name'].strip() + ' ' + row.select(
                        u'normalize-space(./td[4]/a/text())').extract()[0]
                    prod['price'] = extract_price(prices[0])
                    second_cell = row.select(
                        u'normalize-space(./td[2]/text())').extract()[0]
                    prod['identifier'] = second_cell
                    prod['sku'] = second_cell
                else:
                    prices = row.select(u'./td[3]//td[1]/text()').extract()
                    if not prices:
                        continue
                    prod['price'] = extract_price(prices[0])

            if prod['identifier'].strip():
                yield prod
Exemple #15

0

Afficher le fichier

Fichier : matchesfashion_spider.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Build a single product item from a product detail page.

        Values passed in ``response.meta`` (sku, brand, url, category) take
        precedence over what is scraped from the page.  The sale price is
        preferred over the regular price; the price is 0 when neither is
        found.  Shipping cost is always 20 and stock is set to 0 only when
        the all-sizes-out-of-stock block is visible.
        """
        # Computed as in the original flow; the value is not used below.
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        meta = response.meta

        l = ProductLoader(item=Product(), response=response)

        sku = meta.get('sku') or hxs.select(
            '//div[@class="buy"]//select[@id="ProductItemId"]/@data-wpc'
        ).extract()[0]
        l.add_value('sku', sku)

        # Fall back to the SKU when the page exposes no event label.
        event_label = hxs.select(
            '//input[@id="AddToBasketMain"]/@data-event-label').extract()
        l.add_value('identifier', event_label[0] if event_label else sku)

        page_brand = hxs.select(
            '//div[@class="info"]/h2[@class="designer"]/a/text()').extract()[0]
        description = hxs.select(
            '//div[@class="info"]/h3[@class="description"]/text()'
        ).extract()[0].strip()
        # The name always uses the brand shown on the page, even when a
        # different brand was passed through meta.
        l.add_value('name', page_brand + ' ' + description)
        l.add_value('brand', meta.get('brand') or page_brand)

        l.add_value('url', meta.get('url') or response.url)

        image_url = hxs.select(
            '//a[@class="zoom"]/img[@class="product-image" and contains(@src, "_1_")]/@src'
        ).extract()
        if image_url:
            l.add_value('image_url', image_url[0])
        l.add_value('category', meta.get('category'))

        price_texts = hxs.select(
            '//div[@class="details"]//div[@class="pricing"]/div[@class="price"]/span[@class="sale"]/text()'
        ).extract()
        if not price_texts:
            price_texts = hxs.select(
                '//div[@class="details"]//div[@class="pricing"]/div[@class="price"]/text()'
            ).extract()
        price = extract_price(price_texts[0]) if price_texts else 0
        l.add_value('price', price)

        l.add_value('shipping_cost', 20)
        if hxs.select('//div[@class="detail-allsizesoutofstock visible"]'):
            l.add_value('stock', 0)
        yield l.load_item()

Exemple #16

0

Afficher le fichier

Fichier : newark.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Scrape one product page, then request its reviews.

        The loaded product is not yielded here; it is passed in the request
        ``meta`` (with the identifier and an offset of 0) to
        ``self.parse_reviews``.
        """
        raw_title = response.xpath('//h1/text()').extract()[0]
        # Collapse runs of whitespace inside the title.
        name = ' '.join(raw_title.split())
        identifier = response.xpath(
            '//input[@id="itemsArray"]/@value').extract()[0]

        mpn_texts = response.xpath('//span[@itemprop="mpn"]/text()').extract()
        if mpn_texts:
            sku = mpn_texts[0].strip()
        else:
            sku = ''

        price_texts = response.xpath(
            '//span[@itemprop="price"]/text()').extract()
        if price_texts:
            price = extract_price(price_texts[0])
        else:
            price = '0'

        brand_texts = response.xpath(
            '//dd[contains(@itemtype, "Organization")]//a/text()').extract()
        if brand_texts:
            brand = brand_texts[0].strip()
        else:
            brand = ''

        # Breadcrumb links without the first and the last entry.
        breadcrumb = response.xpath(
            '//div[@id="breadcrumb"]//a/text()').extract()
        categories = breadcrumb[1:-1]

        image_srcs = response.xpath(
            '//img[@id="productMainImage"]/@src').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        if image_srcs:
            loader.add_value('image_url', response.urljoin(image_srcs[0]))
        loader.add_value('brand', brand)
        loader.add_value('category', categories)

        # First run of digits in the availability text, or 0 when none.
        stock_digits = response.xpath(
            '//span[contains(@class, "availability")]//text()').re('\d+')
        stock = extract_price(stock_digits[0]) if stock_digits else 0
        loader.add_value('stock', stock)

        product = loader.load_item()
        metadata = TranscatMeta()
        metadata['reviews'] = []
        product['metadata'] = metadata

        reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=tkfeqezs3t1ybjthb77uxbvqd&apiversion=5.5&displaycode=1015-en_us&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + identifier + "&filter.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&sort.q0=submissiontime%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_4827"

        review_meta = {
            'product': product,
            'offset': 0,
            'identifier': identifier
        }
        yield Request(reviews_url,
                      meta=review_meta,
                      callback=self.parse_reviews)

Exemple #17

0

Afficher le fichier

Fichier : furniturechoice_co_uk.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Follow related product tiles and yield the product(s) on this page.

        Tiles whose id (the last ``_``-separated URL part) has not been seen
        are requested with this same callback. A page without an ``h1`` is
        handed to ``self.retry``. Otherwise one product is yielded for a page
        with zero or one option element, or one product per option element.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        tile_links = hxs.select(
            '//div[@class="product-tile"]//a/@href').extract()
        for link in tile_links:
            pid = link.split('_')[-1]
            if pid in self.parsed_products:
                continue
            self.parsed_products.append(pid)
            yield Request(urljoin_rfc(base_url, link),
                          callback=self.parse_product)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)

        name = hxs.select('//h1/text()').extract()
        if not name:
            request = self.retry(response,
                                 "No name for product: " + response.url)
            if request:
                yield request
            return
        product_loader.add_value('name', name)

        # Breadcrumb links without the first entry.
        breadcrumb = hxs.select(
            '//ol[@class="breadcrumbs"]//a/text()').extract()
        product_loader.add_value('category', breadcrumb[1:])

        images = hxs.select('//div[@class="item"]//img/@src').extract()
        if images:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, images[0]))

        product = product_loader.load_item()

        # Two page layouts are tried in turn for the option elements.
        options = (
            hxs.select(u'//div[contains(@class, "MainProds")]/ol/li') or
            hxs.select(
                u'//div[@class="SingColl"]/div[contains(@class, "Prod")]'))

        if len(options) <= 1:
            # Single product: sku and price come from the page itself.
            single = Product(product)
            codes = hxs.select('//div[@class="product-sku"]/text()').re(
                'Product code: (\w+)')
            single['sku'] = codes[-1]
            single['identifier'] = single['sku']
            prices = hxs.select(
                '//div[@class="price-current"]/text()').extract()
            single['price'] = extract_price(prices[-1])
            if single['identifier']:
                yield single
            return

        for option in options:
            variant = Product(product)
            variant['name'] = option.select(
                u'normalize-space(.//h2/text())').extract()[0]
            variant['sku'] = option.select(
                u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
            ).extract()[0]
            variant['identifier'] = variant['sku']
            price_text = option.select(
                u'.//span[@class="Price"]/text()').extract()[0]
            variant['price'] = extract_price(price_text)
            yield variant

Exemple #18

0

Afficher le fichier

 def parse_product(self, response):
     """Yield one item per configurable child product, or one plain item.

     When the page body contains a ``var spConfig = new Product.Config(...)``
     JSON blob, every child product id listed under its attribute options
     becomes its own item. Otherwise a single item is built from the page
     markup. Pages without a product name yield nothing.

     Prices are converted as ``price * self.exchange_rate * 1.2 + 100``.
     NOTE(review): the meaning of the 1.2 factor and the +100 surcharge is
     not visible here -- confirm with the spider owner.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     product_name = hxs.select('//h1[@itemprop="name itemreviewed"]/text()').extract()
     if not product_name:
         return
     product_name = product_name[0].strip()
     image_url = hxs.select('//div[@class="product_main"]//img[@itemprop="image photo"]/@src').extract()
     if not image_url:
         image_url = hxs.select('//img[@itemprop="image photo"]/@src').extract()
     brand = hxs.select('//a[@class="brand-link"]/img/@title').extract()
     sku = hxs.select('//p[@itemprop="identifier"]/@content').extract()[0]
     sku = sku.replace('sku:', '')
     # Decimal built from a string is exact; Decimal(1.2) would carry the
     # binary float error (1.1999999999999999555...) into every price.
     markup = Decimal('1.2')
     options_config = re.search(r'var spConfig = new Product.Config\((.*)\)', response.body)
     if options_config:
         product_data = json.loads(options_config.groups()[0])
         # Only the child product ids are needed; dict keys de-duplicate ids
         # that appear under several attribute options.
         child_ids = {}
         for attr in product_data['attributes'].itervalues():
             for option in attr['options']:
                 for child_id in option['products']:
                     child_ids[child_id] = True
         for identifier in child_ids:
             child = product_data['childProducts'][identifier]
             loader = ProductLoader(item=Product(), selector=hxs)
             loader.add_value('identifier', identifier)
             loader.add_value('name', child['productName'])
             if image_url:
                 loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
             price = extract_price(child['price'])
             price = price * self.exchange_rate * markup + 100
             loader.add_value('price', price)
             loader.add_value('url', response.url)
             loader.add_value('category', response.meta.get('category', ''))
             if brand:
                 loader.add_value('brand', brand[0])
             loader.add_value('sku', sku)
             yield loader.load_item()
     else:
         loader = ProductLoader(item=Product(), selector=hxs)
         loader.add_value('url', response.url)
         loader.add_value('name', product_name)
         if brand:
             loader.add_value('brand', brand[0])
         if image_url:
             loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
         price = hxs.select('//div[@class="price-box"]//span[@class="price"]/text()').extract()
         if price:
             price = extract_price(price[0].replace(',', ''))
             price = price * self.exchange_rate * markup + 100
         else:
             price = 0
         loader.add_value('price', price)
         loader.add_value('category', response.meta.get('category', ''))
         loader.add_value('sku', sku)
         identifier = hxs.select('//div[@class="no-display"]//input[@name="product"]/@value').extract()[0]
         loader.add_value('identifier', identifier)
         yield loader.load_item()

Exemple #19

0

Afficher le fichier

Fichier : biworldwide_base.py Projet : oceancloud82/scraping

    def parse(self, response):
        """Download the latest feed over SFTP and yield one product per row.

        The newest remote file chosen by ``get_last_file`` for
        ``self.file_start_with`` is saved to ``self.xls_file_path``,
        converted to CSV at ``self.csv_file_path`` and read row by row.
        Rows whose lower-cased ``BI ProductID`` is already in
        ``self.identifiers`` are skipped.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        # Close the SFTP session and the transport as soon as the download
        # is done (or has failed) instead of leaving the connection open.
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                files = sftp.listdir_attr()
                last = get_last_file(self.file_start_with, files)
                sftp.get(last.filename, self.xls_file_path)
            finally:
                sftp.close()
        finally:
            transport.close()

        # Convert XLSX file to CSV
        excel_to_csv(self.xls_file_path, self.csv_file_path)

        with open(self.csv_file_path) as f:
            reader = csv.DictReader(f, delimiter=',')
            for row in reader:

                if row['BI ProductID'].lower() in self.identifiers:
                    continue

                self.identifiers.append(row['BI ProductID'].lower())
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('identifier', row['BI ProductID'])
                loader.add_value('sku', row['BI ProductID'])
                loader.add_value('category', unicode(row['BI Product Grp'], errors='ignore'))
                loader.add_value('category', unicode(row['BI CategoryGroup'], errors='ignore'))
                loader.add_value('name', unicode(row['BI ProductName'], errors='ignore'))
                loader.add_value('price', extract_price(row['BI ListPrice']))
                loader.add_value('shipping_cost', extract_price(row['BI Shipping']))
                loader.add_value('brand', unicode(row['BI Brand'], errors='ignore'))
                loader.add_value('url', '')
                if self.image_url_key:
                    image_url = row.get(self.image_url_key)
                    # row.get() returns None when the column is absent;
                    # 'NA' (any case) marks a missing image in the feed.
                    if image_url is not None and image_url.lower() != 'na':
                        loader.add_value('image_url', image_url)
                else:
                    loader.add_value('image_url', '')
                product = loader.load_item()
                metadata = BIWordlwideMeta()
                metadata['dropship_fee'] = unicode(row['BI Dropship Fee'], errors='ignore')
                metadata['est_tax'] = unicode(row['BI Est Tax'], errors='ignore')
                metadata['ship_weight'] = unicode(row['BI ship Wt'], errors='ignore')
                metadata['product_group'] = unicode(row['BI Product Grp'], errors='ignore')
                metadata['upc'] = unicode(row['BI UPC #'], errors='ignore')
                metadata['mpn'] = unicode(row['BI Model'], errors='ignore')
                metadata['item_group'] = unicode(row.get('BI ItemGroup', ''), errors='ignore')
                for meta_key, feed_key in self.tag_keys.items():
                    tag = unicode(row.get(feed_key, ''), errors='ignore')
                    # Normalise the feed's 'NA' marker to 'N/A'.
                    tag = tag if tag != u'NA' else u'N/A'
                    metadata[meta_key] = tag
                product['metadata'] = metadata
                yield product

Exemple #20

0

Afficher le fichier

Fichier : machinemart_co_uk.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield the page's product, or one product per range-table row.

        A product is loaded from the page (name, ex-VAT price, product code,
        category, image, brand). When it has neither price nor sku, the rows
        of the range table are turned into individual products instead,
        stopping at the first row that links to another product page.
        """
        hxs = HtmlXPathSelector(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1[1]/text()')
        price = ''.join(
            hxs.select(
                u'//div/span[position()=2 and contains(text(),"VAT") and contains(text(),"ex.")]/../span[1]//text()'
            ).extract())
        product_loader.add_value('price', extract_price(price))

        product_loader.add_xpath(
            'sku',
            u'substring-after(//div[contains(text(),"Product Code:")]/text(), ":")'
        )
        product_loader.add_xpath(
            'category',
            u'//span[@class="breadcrumblink" and position()=3]/a/text()')

        img = hxs.select(
            u'//a[starts-with(@id,"img") and contains(@class,"mainImageParent")]/@href'
        ).extract()
        if not img:
            img = hxs.select(
                u'//div[contains(@class,"proPicHolder")]/a/img/@src').extract(
                )
        # A page without any image must not abort the whole product.
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))
        js = ''.join(hxs.select(u'//script/text()').extract())
        brand = re.search(u's.prop3=Trim\("(.+)"\);', js)
        if brand:
            product_loader.add_value('brand', brand.group(1))
        product = product_loader.load_item()

        if not product['price'] and not product['sku']:
            rows = hxs.select(
                u'//table/tbody/tr[@id="rangeHeader"]/../tr[position()!=1 and position()!=last()]'
            )
            # The name for the n-th data row is read from row n + 2 of the
            # table nested in the first row, hence the start index of 2.
            for name_row, row in enumerate(rows, 2):
                if row.select(u'./td[2]/a/@href'):
                    # Comparison table with links to products
                    break

                p = Product(product)
                p['name'] = p['name'] + ' ' + row.select(
                    u'../tr[1]//table//tr[%d]/td/div/text()' %
                    name_row).extract()[0]
                p['sku'] = row.select(u'./td[2]/text()').extract()[0]
                p['price'] = extract_price(
                    row.select(u'./td/div/div[2]/text()').extract()[0])
                yield p
        else:
            yield product

Exemple #21

0

Afficher le fichier

Fichier : smartbuyglasses.py Projet : oceancloud82/scraping

    def parse_size(self, response):
        """Yield the item priced from the JSON response, plus a lens variant.

        The item comes from ``response.meta['item']``. When the JSON carries
        a truthy ``with_lens_price``, a second item is yielded with that
        price, ``-with_lens`` appended to the identifier and
        `` with lenses`` appended to the name.
        """
        item = response.meta['item']

        data = json.loads(response.body)
        item['price'] = extract_price(data['discount_price_promotion_display'])
        yield item
        if data['with_lens_price']:
            # Work on a copy: mutating the object that was just yielded would
            # also change the first item wherever it is still referenced.
            lens_item = item.copy()
            lens_item['price'] = extract_price(data['with_lens_price'])
            lens_item['identifier'] += '-with_lens'
            lens_item['name'] += ' with lenses'
            yield lens_item

Exemple #22

0

Afficher le fichier

    def parse_product(self, response):
        """Load a single product from its detail page.

        Nothing is yielded when the brand (upper-cased) is listed in
        ``self.ignore_brands`` or when no price can be found; the latter is
        logged.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select('//h1//span[@itemprop="name"]/text()').extract()[0]

        brand_texts = hxs.select(
            '//h1//span[@itemprop="brand"]/text()').extract()
        if brand_texts:
            brand = brand_texts[0].strip()
        else:
            brand = ''
        if brand.upper() in self.ignore_brands:
            return

        image_srcs = hxs.select(
            '//div[@class="productimg_container"]/a/img/@src').extract()

        # Basket form product id first, itemprop sku as the fallback.
        identifier = (
            hxs.select(
                '//*[@id="basketform"]//input[@name="product_id"]/@value'
            ).extract() or
            hxs.select('//span[@itemprop="sku"]/text()').extract())[0]

        # '#exvatprice' first, generic itemprop price as the fallback.
        price_texts = (
            response.css('.product_price #exvatprice ::text').extract() or
            response.xpath('//span[@itemprop="price"]/text()').extract())
        if not price_texts:
            self.log('Warning: no price found! %s' % response.url)
            return
        price = extract_price(price_texts[0])

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', price)

        in_stock = hxs.select(
            '//span[@class="stockstatus"]/span[@class="stock" and contains(text(), "In Stock")]'
        )
        if not in_stock:
            loader.add_value('stock', 0)

        loader.add_value('identifier', identifier)
        loader.add_value('name', name)
        loader.add_value(
            'category',
            hxs.select('//div[@itemprop="breadcrumb"]//a/text()').extract())
        loader.add_value('brand', brand)
        loader.add_xpath('sku', '//td[@itemprop="mpn"]/text()')
        loader.add_value('url', response.url)
        if image_srcs:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_srcs[0]))

        delivery = hxs.select(
            '//div[@class="delivery_text"]/strong/text()').extract()
        if delivery:
            delivery_text = delivery[0]
            shipping_cost = (0 if delivery_text == 'FREE'
                             else extract_price(delivery_text))
            loader.add_value('shipping_cost', shipping_cost)

        yield loader.load_item()

Exemple #23

0

Afficher le fichier

 def parse_shipping_price2(response):
     """Set the product's shipping cost from the order summary, then yield it.

     The product comes from ``response.meta['product']``. When a shipping
     row is present its price, minus any shipping discount row, is stored
     as ``shipping_cost``; otherwise the product is yielded untouched.
     """
     selector = HtmlXPathSelector(response)
     product = response.meta['product']
     cost_texts = selector.select('//tr[@class="ordershipping"]/td[2]/span/text()').extract()
     if cost_texts:
         cost = extract_price(cost_texts[0])
         discount_texts = selector.select('//tr[@class="ordershippingdiscount discount"]/td[2]/span/text()').extract()
         if discount_texts:
             cost = cost - extract_price(discount_texts[0])
         product['shipping_cost'] = cost
     yield product

Exemple #24

0

Afficher le fichier

Fichier : cyclingexpress_com.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Yield one item per option row of the product's price table.

        Name, brand, numeric product id, image and category are shared by
        all rows; each row adds its own option name, identifier suffix,
        price, sku, stock flag and list price (stored as ``rrp`` metadata).
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        name = hxs.select('//*[@id="Pbox"]/nav[1]/a[3]/text()').extract()[0]

        # First known brand the product name starts with (case-insensitive).
        upper_name = name.upper()
        brand = next(
            (b for b in self.brands if upper_name.startswith(b.upper())), '')

        identifier = re.findall('roduct.ia\/(\d+)\/', response.url)[0]

        thumbnails = hxs.select('//*[@id="thumbnail"]//li/a/@href').extract()
        if thumbnails:
            image_url = urljoin_rfc(base_url, thumbnails[0])
        else:
            image_url = ''

        category = response.meta.get('categories')
        rows = hxs.select('//*[@id="priceBox"]/form/table/tr[position()>1]')
        for row in rows:
            item = Product()
            rrp_text = row.select(
                './td[2]/span[@class="tGrey"]/text()').extract()[0]
            metadata = CRCMeta()
            metadata['rrp'] = extract_price(rrp_text)
            item['metadata'] = metadata

            loader = ProductLoader(item=item, selector=row)
            option_name = row.select(
                './td/b[@class="name"]/text()').extract()[0]
            loader.add_value('name', name + ', ' + option_name)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)

            # The button's onclick is a call to proEmailMe(...) or
            # shopCartAdd(...); its first two arguments form the option id.
            onclick = row.select('./td/div/button/@onclick').extract()[0]
            if 'proEmailMe' in onclick:
                call_prefix = 'proEmailMe('
            else:
                call_prefix = 'shopCartAdd('
            arguments = onclick.split(call_prefix)[1].split(')')[0]
            arguments = arguments.replace("'", '').split(',')
            option_id = arguments[0].strip() + '_' + arguments[1].strip()
            loader.add_value('identifier', identifier + '_' + option_id)

            price_texts = row.select('./td[2]/b/text()').extract()
            loader.add_value('price', extract_price(price_texts[0]))

            sku_texts = row.select('./td[1]/div/text()').extract()
            loader.add_value('sku', sku_texts[0] if sku_texts else '')

            availability = row.select(
                './td[1]/span[@class="tGrey"]/text()').extract()[0]
            if availability != 'In Stock':
                loader.add_value('stock', 0)
            yield loader.load_item()

Exemple #25

0

Afficher le fichier

    def parse(self, response):
        """Download ``Lakeland.csv`` over SFTP and yield one item per row.

        The pipe-delimited feed is read with NUL bytes stripped from every
        line. Each item carries a ``LakelandMeta`` with the list, cost and
        RRP prices (rounded to 2 places, as strings) and a margin computed
        as ``((rrp / 1.2 - cost_price) / selling_price) * 1.2 * 100``,
        formatted with a trailing ``%``; the margin is 0 when the selling
        price is not positive.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        file_path = HERE + '/Lakeland_products.csv'
        # Close the SFTP session and the transport as soon as the download
        # is done (or has failed) instead of leaving the connection open.
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                sftp.get('Lakeland.csv', file_path)
            finally:
                sftp.close()
        finally:
            transport.close()

        with open(file_path) as f:
            reader = csv.DictReader((line.replace('\x00', '') for line in f),
                                    delimiter="|")
            for row in reader:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('sku', row['Unique Product Code'])
                loader.add_value('identifier', row['Unique Product Code'])
                loader.add_value('name', row['Product Name'])
                loader.add_value('category', row['Category'])
                loader.add_value('image_url', row['Image URL'])

                loader.add_value('brand', row['Brand'].decode('latin-1'))
                loader.add_value('url', row['Product Page URL'])
                list_price = str(round(extract_price(row['List Price']), 2))
                cost_price = str(round(extract_price(row['Cost Price']), 2))
                rrp = str(round(extract_price(row['RRP']), 2))
                selling_price = round(extract_price(row['Price']), 2)
                if selling_price > 0:
                    margin = (
                        (Decimal(rrp) / Decimal('1.2') - Decimal(cost_price)) /
                        Decimal(selling_price)) * Decimal('1.2')
                else:
                    margin = Decimal('0.00')
                margin *= Decimal('100')
                margin = '{}%'.format(str(round(extract_price(str(margin)),
                                                2)))
                loader.add_value('price', selling_price)
                loader.add_value('stock', row['Stock Availability'])
                loader.add_value('shipping_cost', row['Shipping Cost'])
                item = loader.load_item()
                metadata = LakelandMeta()
                metadata['margin'] = margin
                metadata['promotional_message'] = row['Promotional Message']
                metadata['buyer_name'] = row['Buyer Name']
                metadata['list_price'] = list_price
                metadata['cost_price'] = cost_price
                metadata['asin'] = row['ASIN']
                metadata['dd'] = 'Yes' if row['DD'] == '1' else ''
                metadata['rrp'] = rrp
                item['metadata'] = metadata
                yield item

Exemple #26

0

Afficher le fichier

    def parse_product(self, response):
        """Load one product, retrying the page up to 3 times if it has no name.

        A page without a name yields only the retry request (or logs once
        the retries are used up) and never an item. The price is taken from
        the span following the ``itemprop="price"`` element, while the text
        of that element itself is stored as ``Euro Price`` metadata. A
        shipping cost of 8.3 is added when the integer part of the price is
        at most 165.
        """
        name = response.xpath('//h1[@itemprop="name"]/text()').extract()
        if not name:
            retry_count = response.meta.get('retry_count', 0)
            if retry_count < 3:
                # Carry the brand along so the retried page can still set it.
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_product,
                              meta={'retry_count': retry_count + 1,
                                    'brand': response.meta.get('brand')})
            else:
                self.log('Product without name: ' + response.url)
            # Either way there is nothing to load from this response; falling
            # through would emit an item with an empty name.
            return
        name = name[0].strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('brand', response.meta.get('brand'))
        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_url:
            image_url = response.urljoin(image_url[0])
            loader.add_value('image_url', image_url)
        available = ''.join(response.xpath('//div[@itemprop="offers"]//div[contains(@class,"tr-prod-availability")]//text()')
                            .extract())\
                      .strip().upper()
        # No availability text at all leaves the stock field unset.
        if available and 'AVAILABLE IMMEDIATELY' not in available:
            loader.add_value('stock', 0)
        price = response.xpath(
            '//*[@itemprop="price"]/following-sibling::span/text()').extract()
        euro_price = response.xpath(
            '//*[@itemprop="price"]/text()').extract_first()
        price = extract_price(price[0]) if price else 0
        euro_price = extract_price(euro_price) if euro_price else 0
        loader.add_value('price', price)
        category = response.xpath(
            '//ul[@class="tr-sidebar-categories-main"]/li/a/text()').extract()
        if category:
            loader.add_value('category', category[0])
        sku = response.xpath('//input[@name="ar"]/@value').extract()
        sku = sku[0] if sku else ''
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        if int(price) <= 165:
            loader.add_value('shipping_cost', 8.3)
        item = loader.load_item()
        item['metadata'] = {'Euro Price': euro_price}
        yield item

Exemple #27

0

Afficher le fichier

    def parse(self, response):
        """Download the latest "BI UK File" over SFTP and yield its products.

        The newest matching remote file (as chosen by ``get_last_file``) is
        saved next to this module as ``biwuk_products.csv`` and read as a
        comma-delimited feed, one product per row.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        file_path = HERE + '/biwuk_products.csv'
        # Close the SFTP session and the transport as soon as the download
        # is done (or has failed) instead of leaving the connection open.
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                files = sftp.listdir_attr()
                last = get_last_file("BI UK File", files)
                sftp.get(last.filename, file_path)
            finally:
                sftp.close()
        finally:
            transport.close()

        with open(file_path) as f:
            reader = csv.DictReader(f, delimiter=',')
            for row in reader:
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('identifier', row['BI UK ProductID'])
                loader.add_value('sku', row['BI UK ProductID'])
                # row.get() returns None when the column is absent, which
                # unicode() cannot decode; fall back to an empty string.
                # The value is a '>' / '>>' separated category path.
                categories = unicode(row.get('BI UK CategoryGroup') or '',
                                     errors='ignore').replace(
                                         '>>', '>').replace("'", "").split('>')
                for category in categories:
                    loader.add_value('category', category.strip())
                loader.add_value(
                    'name', unicode(row['BI UK ProductName'], errors='ignore'))
                loader.add_value('price',
                                 extract_price(row['BI UK Delivered Price']))
                loader.add_value('shipping_cost',
                                 extract_price(row['BI UK Shipping']))
                loader.add_value('brand', row['BI UK Brand'])
                loader.add_value('url', '')
                loader.add_value('image_url', row['BI UK ImgURL'])
                product = loader.load_item()
                metadata = BIWordlwideMeta()
                metadata['dropship_fee'] = unicode(row['BI UK Dropship Fee'],
                                                   errors='ignore')
                metadata['est_tax'] = unicode(row['BI UK Est Tax'],
                                              errors='ignore')
                metadata['ship_weight'] = unicode(row['BI UK ship Wt'],
                                                  errors='ignore')
                metadata['product_group'] = unicode(row['BI UK Product Grp'],
                                                    errors='ignore')
                metadata['upc'] = unicode(row['BI UK UPC #'], errors='ignore')
                metadata['mpn'] = unicode(row['BI UK Model'], errors='ignore')
                product['metadata'] = metadata
                yield product

Exemple #28

0

Afficher le fichier

Fichier : cyberport.py Projet : oceancloud82/scraping

    def parse_product(self, response):
        """Load one product from its detail page and yield it.

        The sku and brand come from the request meta when a sku was passed
        in; otherwise the sku is read from the page and the brand is set to
        'Logitech'. The shipping cost is assembled from separate whole and
        decimal spans and defaults to '0.0'.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        name = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
        loader.add_value('name', name.strip())
        identifier = hxs.select('//div[@id="article_cont"]//div[@class="itemcostinfo"]/@itemid').extract()[0]
        loader.add_value('identifier', identifier)

        meta_sku = response.meta.get('sku', '')
        if meta_sku:
            loader.add_value('sku', meta_sku)
            loader.add_value('brand', response.meta.get('brand', ''))
        else:
            page_sku = hxs.select('//span[@id="hbNrDataSheet"]/text()').extract()
            if page_sku:
                loader.add_value('sku', page_sku[0])
            loader.add_value('brand', 'Logitech')

        loader.add_value('url', response.url)

        # Gallery thumbnails carry the image in their onclick handler; the
        # main detail image is only consulted when there is no thumbnail.
        thumb_onclicks = hxs.select('//ul[@id="gliderSmall"]/li/a/@onclick').extract()
        if thumb_onclicks:
            src_match = re.search(r"imgSrc: '(.*?)',", thumb_onclicks[0])
            if src_match:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, src_match.group(1)))
        else:
            detail_srcs = hxs.select('//img[@id="itemDetail{}"]/@src'.format(identifier)).extract()
            if detail_srcs:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, detail_srcs[0]))

        price_values = hxs.select('//meta[@itemprop="price"]/@content').extract()
        loader.add_value('price', extract_price(price_values[0]))

        availability = hxs.select('//meta[@itemprop="availability"]/@content').extract()[0]
        if availability.strip() == 'out_of_stock':
            loader.add_value('stock', 0)

        categories = hxs.select('//*[@id="main-centercontainer"]/div[1]/a[3]/text()').extract()
        if categories:
            loader.add_value('category', categories[0])

        whole_part = hxs.select('//div[@class="clear fl b8"]/span[@class="basis fl"]/text()').extract()
        if whole_part:
            shipping = whole_part[0].replace(',', '')
            decimal_part = hxs.select('//div[@class="clear fl b8"]/span[@class="decimal fl"]/text()').extract()
            if decimal_part:
                shipping = shipping + '.' + decimal_part[0]
        else:
            shipping = '0.0'
        loader.add_value('shipping_cost', extract_price(shipping))

        yield loader.load_item()

Exemple #29

0

Afficher le fichier

 def parse_product(self, response):
     """Add image and summary to the product and yield it or its variants.

     The product comes from ``response.meta['product']``. The summary is
     taken from the first of four page locations that gives non-empty text
     and is stored under ``product['metadata']['summary']``. Without a
     "BuyPrintNGo" form the product is yielded as is; otherwise one copy is
     yielded per table row of the form whose product code matches the
     product identifier, provided the row has a title, variant id and price.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     product = response.meta['product']
     forms = hxs.select('//form[contains(@action, "BuyPrintNGo")]')
     image_url = hxs.select(
         '//div[@id="imagegallerymain"]/div//img/@src').extract()
     if image_url:
         product['image_url'] = urljoin_rfc(base_url, image_url[0])
     summary = ' '.join(
         map(unicode.strip,
             hxs.select('//*[@id="summary"]//text()').extract())).strip()
     if not summary:
         summary = ''.join(
             map(
                 unicode.strip,
                 hxs.select(
                     '//div[@id="accordion"]//div[contains(@class, "panel-heading") and contains(h4/a/text(), "Summary")]/following-sibling::div//div[contains(@class, "panel-body")]/p/text()'
                 ).extract())).strip()
     if not summary:
         summary = ''.join(
             map(
                 unicode.strip,
                 hxs.select(
                     '//div[@id="accordion"]//div[contains(@class, "panel-heading") and contains(h4/a/text(), "Summary")]/following-sibling::div//div[contains(@class, "panel-body")]//text()'
                 ).extract())).strip()
     if not summary:
         summary = ''.join(
             hxs.select(
                 '//*[@id="fine_print"]//*[@class="info_section"]//text()').
             extract()).strip()
     if 'metadata' in product:
         product['metadata']['summary'] = summary
     else:
         product['metadata'] = {'summary': summary}
     if not forms:
         yield product
     else:
         for form in forms:
             identifier = form.select(
                 './/input[@name="productCode"]/@value').extract().pop()
             # Forms for other products on the page are ignored.
             if product['identifier'] != identifier:
                 continue
             for line in form.select('.//table/tr'):
                 title = line.select('./td[@class="name"]/text()').extract()
                 variant_id = line.select(
                     './/input[contains(@name, "hdnVariantName_")]/@value'
                 ).extract()
                 price = line.select(
                     './/input[contains(@name, "RP_")]/@value').extract()
                 if title and variant_id and price:
                     # Copy only for rows that are actually yielded, so a
                     # deep copy is not spent on rows that are discarded.
                     item = deepcopy(product)
                     item['name'] = "%s - %s" % (item['name'], title[0])
                     item['identifier'] = "%s-%s" % (identifier,
                                                     variant_id[0])
                     item['price'] = extract_price(price[0])
                     yield item

Exemple #30

0

Afficher le fichier

    def _parse_product_el(self, product_el, base_url):
        """Build a product item from one product listing element.

        Name, price and link are read from ``product_el``; the product code
        derived from the link (lower-cased) is used as both identifier and
        SKU, and its first three characters select the category from
        ``category_codes``. The image is optional. When the product code is
        present in ``self.suppliers`` the supplier names are joined into the
        item's metadata. The product code is also added to
        ``self.collected``.

        :param product_el: selector for a single product element.
        :param base_url: base URL used to make the image URL absolute.
        :return: the loaded product item with ``metadata`` attached.
        :raises IndexError: if the name, price or link is missing.
        """
        name = product_el.select(
            'a/div[@class="prd_details"]/h2/text()').extract()[0].strip()
        price = product_el.select(
            'a/div[@class="prd_details"]/div[@class="prd_price_area"]/span/text()'
        ).extract()[0].strip()
        price = extract_price(price)
        url = product_el.select("a/@href").extract()[0]
        product_code = extract_product_code_from_url(url).lower()
        # Keep the list: indexing with [0] here raised IndexError for
        # products without a lazy-loaded image, so the "is there an image"
        # check below could never skip a missing one.
        image_urls = product_el.select(
            'a/span[@class="prd_img"]/img/@data-original').extract()

        loader = ProductLoader(selector=product_el, item=Product())
        loader.add_value('identifier', product_code)
        loader.add_value('sku', product_code)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('url', url)
        if image_urls and image_urls[0]:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_urls[0]))
        category = category_codes.get(product_code[0:3])
        loader.add_value('category', category)

        metadata = BuyAGiftMeta()
        if product_code in self.suppliers:
            supplier_list = self.suppliers[product_code]['suppliers']
            metadata['supplier_name'] = ', '.join(supplier_list)

        product = loader.load_item()
        product['metadata'] = metadata
        self.collected.add(product_code)
        return product

Exemple #31

0

Afficher le fichier

Fichier : richersoundscom.py Projet : 0--key/lib

    def parse_product_clearance(self, response):
        """Yield a single product (url, name, price) for a clearance page.

        Logs an error and yields nothing when the page has no product name,
        no price text, or a price that ``extract_price`` turns into a falsy
        value.
        """
        # Result is not used below; the call is retained from the original.
        URL_BASE = get_base_url(response)
        selector = HtmlXPathSelector(response)

        name_parts = selector.select(
            "//div[contains(@class, 'prodDetails')]/h2/text()").extract()
        if not name_parts:
            logging.error("ERROR!! NO NAME PRODUCT PAGE!! %s" % (response.url, ))
            return
        name = " ".join(part.strip() for part in name_parts)

        # A missing price node and an unparsable price are reported with the
        # same message, so both cases share one guard.
        price_texts = selector.select(
            "//div[@class='pricing']/h4/text()").extract()
        price = extract_price(price_texts[0].strip()) if price_texts else None
        if not price:
            logging.error("ERROR!! NO PRICE PRODUCT PAGE!! %s %s" % (response.url, name))
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        yield loader.load_item()

Exemple #32

0

Afficher le fichier

Fichier : instrumartspider.py Projet : 0--key/lib

    def parse_options(self, response):
        """Yield the base product and one product per distinct priced option.

        The JSON in ``response.body`` is read for ``startingPrice`` and
        ``options``; every option has ``values`` whose entries carry a
        ``description`` and, optionally, a ``cost``. The URL and name come
        from the loader stored in ``response.meta['loader']``.

        A base product is emitted only when the starting price is truthy.
        Each option value with a truthy cost is emitted once per distinct
        description, priced at base price plus the option cost.
        """
        loader = response.meta['loader']
        url = loader.get_output_value('url')
        main_name = loader.get_output_value('name')
        options_data = json.loads(response.body)
        base_price = extract_price(str(options_data['startingPrice']))

        if base_price:
            p = ProductLoader(response=response, item=Product())
            p.add_value('url', url)
            p.add_value('name', main_name)
            p.add_value('price', base_price)
            yield p.load_item()

        # Descriptions already emitted; a set keeps the duplicate check
        # constant-time instead of scanning a growing list.
        seen = set()
        for option in options_data['options']:
            for value in option['values']:
                description = value['description']
                if not value.get('cost') or description in seen:
                    continue
                seen.add(description)
                p = ProductLoader(response=response, item=Product())
                p.add_value('url', url)
                p.add_value('name', main_name + ' ' + description)
                p.add_value('price',
                            base_price + extract_price(str(value['cost'])))
                yield p.load_item()

Exemple #33

0

Afficher le fichier

Fichier : ckitchen.py Projet : 0--key/lib

    def parse_product(self, response):
        """Yield the main product, then one product per priced option combo.

        The main product (name, price, url, sku) is always yielded first.
        If the page has option ``<select>`` elements, every combination of
        their options is walked and a product is yielded for each
        combination whose accumulated price exceeds the main price.

        :raises IndexError: if the page has no product name or no price.
        """
        hxs = HtmlXPathSelector(response)
        main_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0].strip()
        main_price = hxs.select('//span[@itemprop="price"]/text()').extract()
        if not main_price:
            # Fall back to the hidden form field carrying the price.
            main_price = hxs.select('//input[@name="ppi"]/@value').extract()

        main_price = extract_price(main_price[0])

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('name', main_name)
        loader.add_value('price', main_price)
        loader.add_value('url', response.url)
        loader.add_xpath('sku', '//span[@itemprop="identifier"]/text()')
        yield loader.load_item()

        def _add_options(option_sets, current_name, current_price):
            """Recursively expand the remaining selects into products.

            ``option_sets`` holds the selects still to be combined;
            ``current_name`` / ``current_price`` describe the choices made
            so far. Option texts of the form ``<label>(Add <amount>)`` add
            the label to the name and the amount to the price; any other
            text is appended to the name unchanged.
            """
            if not option_sets:
                # Leaf of the recursion. Combinations that do not raise the
                # price are skipped; previously they fell through to
                # option_sets[0] below and raised IndexError, aborting the
                # whole expansion.
                if current_price > main_price:
                    loader = ProductLoader(response=response, item=Product())
                    loader.add_value('url', response.url)
                    loader.add_value('name', current_name)
                    loader.add_value('price', current_price)
                    yield loader.load_item()
                return

            options = option_sets[0]
            remaining_sets = option_sets[1:]
            for option in options.select('./option/text()').extract():
                r = re.search(r'(.*)\(Add(.*)\)', option)
                name = current_name
                price = current_price
                if r:
                    name += ' ' + r.group(1).strip()
                    price += extract_price(r.group(2))
                else:
                    name += ' ' + option

                for product in _add_options(remaining_sets, name, price):
                    yield product

        option_sets = hxs.select('//div[@class="inn"]/select')
        if option_sets:
            for product in _add_options(option_sets, main_name, main_price):
                yield product

Exemple #34

0

Afficher le fichier

Fichier : ckitchen.py Projet : 0--key/lib

        def _add_options(option_sets, current_name, current_price):
            """Recursively expand the remaining selects into products.

            ``option_sets`` holds the selects still to be combined;
            ``current_name`` / ``current_price`` describe the choices made
            so far. Option texts of the form ``<label>(Add <amount>)`` add
            the label to the name and the amount to the price; any other
            text is appended to the name unchanged.

            NOTE(review): ``main_price`` and ``response`` are not defined in
            this excerpt; they are expected to come from the enclosing
            scope -- confirm against the surrounding function.
            """
            if not option_sets:
                # Leaf of the recursion. Combinations that do not raise the
                # price are skipped; previously they fell through to
                # option_sets[0] below and raised IndexError, aborting the
                # whole expansion.
                if current_price > main_price:
                    loader = ProductLoader(response=response, item=Product())
                    loader.add_value('url', response.url)
                    loader.add_value('name', current_name)
                    loader.add_value('price', current_price)
                    yield loader.load_item()
                return

            options = option_sets[0]
            remaining_sets = option_sets[1:]
            for option in options.select('./option/text()').extract():
                r = re.search(r'(.*)\(Add(.*)\)', option)
                name = current_name
                price = current_price
                if r:
                    name += ' ' + r.group(1).strip()
                    price += extract_price(r.group(2))
                else:
                    name += ' ' + option

                for product in _add_options(remaining_sets, name, price):
                    yield product