コード例 #1
0
ファイル: toolstop.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Build one ``Product`` item from a product detail page.

        Reads name, price, brand, SKU, image, breadcrumb categories and stock
        state from the page markup, derives the shipping cost from the price
        and yields the loaded item.

        :param response: the downloaded product page.
        :returns: a generator yielding a single loaded item.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        loader.add_value('url', response.url)
        loader.add_xpath('brand', '//div[@id="brandlink"]//img/@alt')
        loader.add_xpath('sku', '//span[@class="barcode"]/text()')
        if not loader.get_output_value('sku'):
            # Fall back to the GTIN meta tag when no barcode span is present.
            loader.add_xpath('sku', '//meta[@itemprop="gtin13"]/@content')

        # The identifier is whatever follows the last letter "p" in the URL.
        # NOTE(review): relies on the site's URL layout -- confirm.
        loader.add_value('identifier', response.url.split('p')[-1])

        # A page without an og:image tag must not abort the whole item, so
        # the image is only added when the tag is present.
        image_url = hxs.select(
            '//meta[@property="og:image"]/@content').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        # Orders under 25 pay for shipping.  A missing price is handled
        # explicitly (it is charged shipping as well) instead of relying on
        # how ``None`` happens to compare against a Decimal.
        price = loader.get_output_value('price')
        if not price or price < Decimal(25):
            loader.add_value('shipping_cost', '6.95')
        else:
            loader.add_value('shipping_cost', '0')

        categories = hxs.select('//ul[@id="breadcrumb"]//a/text()').extract()
        # Drop the "Home" crumb and keep at most three levels.
        categories = [
            x.strip() for x in categories if x.lower().strip() != 'home'
        ][:3]
        loader.add_value('category', categories)

        # Only flag the product as out of stock when neither the in-stock
        # heading nor a buy button is on the page.
        if not hxs.select('//h4[@class="product_instock"]') and not hxs.select(
                '//button[@class="buynow"]'):
            loader.add_value('stock', 0)

        yield loader.load_item()
コード例 #2
0
    def parse_product(self, response):
        """Yield the product when its barcode matches the SKU being searched.

        When the barcode read from the page differs from
        ``response.meta['sku']``, the first URL left in
        ``response.meta['products']`` is requested with this same callback
        and the remaining candidate URLs are passed along in the meta.
        """
        selector = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="CB_box_prodview"]//h2/text()')
        loader.add_value('url', response.url)
        price_fragments = selector.select(
            '//div[@class="viewprod_price"]//text()').extract()
        loader.add_value('price', ''.join(price_fragments))
        loader.add_xpath(
            'sku', '//div[@class="viewprod_right"]//div/text()',
            re='Barcode: (.*)')

        # Log both SKUs so mismatches can be traced in the crawl log.
        page_sku = loader.get_output_value('sku')
        log.msg(page_sku)
        wanted_sku = response.meta['sku']
        log.msg(wanted_sku)

        if page_sku == wanted_sku:
            yield loader.load_item()
            return

        remaining = response.meta['products']
        if not remaining:
            return

        # Wrong product: try the next candidate page for the same SKU.
        next_url = urljoin_rfc(get_base_url(response), remaining[0])
        yield Request(next_url,
                      callback=self.parse_product,
                      meta={'sku': wanted_sku, 'products': remaining[1:]})
コード例 #3
0
    def parse_product(response):
        """Yield a Lego ``Product`` read from a product detail page.

        The regular price is tried first and the sale price second; a page
        with neither produces no item.  The image is optional, and every
        breadcrumb link after the first becomes a category.

        NOTE(review): the signature takes no ``self`` even though the
        function is indented like a method -- confirm how it is bound.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        loader = ProductLoader(item=Product(), selector=selector)
        loader.add_xpath('identifier', '//input[@name="sku"]/@value')
        loader.add_xpath(
            'name',
            '//div[attribute::id="cat-product-detail-info"]/h1[1]/text()')
        loader.add_value('brand', 'Lego')
        loader.add_value('url', response.url)

        loader.add_xpath('price', '//*[@id="cat-prod-det-reg-price"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath(
                'price',
                '//*[@id="cat-product-details-sale-price"]/span/text()')
        if not loader.get_output_value('price'):
            # Neither price variant is present: nothing to report.
            return

        images = selector.select(
            '//div[attribute::id="cat-product-detail-img"]/div[1]/a[1]/img[1]/@src'
        ).extract()
        if images:
            loader.add_value('image_url', urljoin_rfc(page_base, images[0]))

        breadcrumb_links = selector.select(
            '//div[@id="cat-product-detail"]/div[@id="bc"]/div[@class="fl"]/a/text()'
        )
        # The first breadcrumb entry is skipped.
        for crumb in breadcrumb_links[1:].extract():
            loader.add_value('category', crumb)

        yield loader.load_item()
コード例 #4
0
ファイル: applejack_crawler.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product when the page's item number matches the searched SKU.

        The product name is the page title followed by the bottle size when
        one is listed.  The price is taken from the first of the card, sale
        and regular price blocks that yields a value.  The item is emitted
        only if the item number shown on the page equals
        ``response.meta['sku']``.

        Pages that are not HTML, or that lack a title, produce nothing.
        A missing bottle size, item number or numeric product id in the URL
        no longer raises; the corresponding piece is simply left out.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        title = hxs.select(u'//h1[@class="pagetitle"]/text()').extract()
        if not title:
            # Not a product page (or the layout changed): nothing to emit.
            return
        name = title[0].strip()

        bottle_size = hxs.select(u'//div[child::strong[contains(text(), "Bottle Size") or contains(text(), "Size of Bottle")]]/span/text()').extract()
        if not bottle_size:
            bottle_size = hxs.select(u'//div[contains(text(),"Size of Bottle")]/span/text()').extract()
        if bottle_size:
            name += ' ' + bottle_size[0].strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        # Raw string: "\d" is a regex class, not a string escape.
        sku_match = re.search(r'product/(\d+)', response.url)
        if sku_match:
            loader.add_value('sku', sku_match.groups())
        loader.add_value('name', name)

        # Try the price blocks in order of preference and stop at the first
        # one that produces a value.
        loader.add_xpath('price', u'//div[@class="cardPrice"]/text()')
        for price_xpath in (u'//div[@class="salePrice"]/text()',
                            u'//div[@class="regularPrice"]/text()',
                            u'//div[@class="regularprice"]/text()'):
            if loader.get_output_value('price'):
                break
            loader.add_xpath('price', price_xpath)

        site_sku = hxs.select(u'//span[@class="itemnumber"]/text()').re(u'- (.*)')
        search_sku = response.meta['sku'].strip()
        # Without an item number on the page the match cannot be confirmed.
        if site_sku and site_sku[0].strip() == search_sku:
            yield loader.load_item()
コード例 #5
0
    def parse(self, response):
        """Yield the cheapest acceptable product from a search-results page.

        Each result block is loaded into its own ``ProductLoader``.  A result
        replaces the current best one when it has a price, that price is
        lower than the best so far, and ``valid_price`` accepts it against
        ``response.meta['price']``.  Nothing is yielded when no result
        qualifies.
        """
        title_link = './/*[contains(@class, "Title") or contains(@class, "title")]//a'
        results = HtmlXPathSelector(response).select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

        best = None
        for result in results:
            candidate = ProductLoader(item=Product(), selector=result)
            candidate.add_xpath('name', title_link + '/text()')
            candidate.add_xpath('url', title_link + '/@href')
            candidate.add_xpath('price', './/*[@class="newPrice"]//span/text()')
            candidate.add_value('sku', response.meta['sku'])
            candidate.add_value('identifier', response.meta['sku'])

            price = candidate.get_output_value('price')
            if not price:
                continue
            if best is not None and not (best.get_output_value('price') > price):
                continue
            if not valid_price(response.meta['price'], price):
                continue
            best = candidate

        if best:
            yield best.load_item()
コード例 #6
0
    def parse(self, response):
        """Collect priced search results and request the first one's page.

        Every result block under ``atfResults`` is loaded into its own
        ``ProductLoader``.  A result is kept when it has a price and is
        either the first priced result or cheaper than the last one kept, so
        the kept list is in strictly decreasing price order.  The URL of the
        first kept result is then requested with ``self.parse_mfrgids`` as
        callback; the kept loaders travel in the request meta as
        ``cur_prod`` / ``next_prods`` next to ``mfrgid`` and ``name`` copied
        from ``response.meta``.

        Nothing is yielded when no result has a price.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
        pr = None
        search_results = []
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './/h3/a/span/text()')
            if not loader.get_output_value('name'):
                loader.add_xpath('name', './/h3/a/text()')
            loader.add_xpath('url', './/h3/a/@href')
            # Raw string: "\$" is a regex escape, not a valid string escape.
            loader.add_xpath('price', './/ul/li/a/span/text()', re=r'\$(.*)')
            if not loader.get_output_value('price'):
                loader.add_xpath('price', './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())

            price = loader.get_output_value('price')
            if price and (pr is None or pr.get_output_value('price') > price):
                pr = loader
                search_results.append(pr)

        if search_results:
            cur_prod = search_results[0]
            next_prods = search_results[1:]
            # dont_filter: the same product URL may be requested again for a
            # different search.
            yield Request(cur_prod.get_output_value('url'),
                          callback=self.parse_mfrgids,
                          meta={'mfrgid': response.meta['mfrgid'],
                                'name': response.meta['name'],
                                'cur_prod': cur_prod,
                                'next_prods': next_prods},
                          dont_filter=True)
コード例 #7
0
    def parse_product(self, response):
        """Yield the product on the page, or one item per bundle option.

        The price comes from the first of the special-price block, the
        regular-price block and the ``og:price:amount`` meta tag that has a
        value, and is 0 when none does.  Stock is set to 0 for out-of-stock
        or price-less products, and a shipping cost of 2.95 is added when
        the price is below 50.

        When the page has bundle options and no swatches block, a copy of
        the item is yielded for every option with the option value appended
        to the identifier and the option label appended to the name;
        otherwise the single item is yielded.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(response=response, item=Product())

        loader.add_value('url', response.url)
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_xpath('sku', '//span[@itemprop="mpn"]/text()')
        loader.add_xpath('name', '//h1/span[@itemprop="name"]/text()')

        # Price sources in order of preference.
        price_xpaths = (
            '//form//p[@class="special-price"]/span[@class="price"]/text()',
            '//form//span[@class="regular-price"]/span[@class="price"]/text()',
            '//meta[@property="og:price:amount"]/@content',
        )
        found = []
        for price_xpath in price_xpaths:
            found = hxs.select(price_xpath).extract()
            if found:
                break
        loader.add_value('price', found[0] if found else 0)

        breadcrumbs = hxs.select(
            '//div[@class="breadcrumbs"]//li[not(@class="home")]/a/text()'
        ).extract()
        loader.add_value('category', breadcrumbs)

        images = hxs.select(
            '//meta[@property="og:image"]/@content').extract()
        if images:
            loader.add_value('image_url', images[0])

        loader.add_value(
            'brand',
            hxs.select('//a[@class="brand-link"]/text()').re(
                'View All (.*) Prod'))

        sold_out = hxs.select(
            '//form//p[@class="availability out-of-stock"]')
        if sold_out or not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', 2.95)

        item = loader.load_item()

        swatches = hxs.select('//div[@class="product-swatches"]')
        bundle_options = hxs.select(
            '//select[contains(@class, "bundle-option")]/option')
        if swatches or not bundle_options:
            yield item
            return

        for option in bundle_options:
            variant = deepcopy(item)
            option_value = option.select('@value').extract()[0]
            variant['identifier'] += '-' + option_value
            # Only the part of the label before " - " is used in the name.
            option_label = option.select('text()').extract()[0]
            variant['name'] += ' ' + option_label.split(' - ')[0]
            yield variant
コード例 #8
0
    def parse_product(self, response):
        """Yield the product when the page's item number matches the searched SKU.

        The product name is the page title followed by the bottle size when
        one is listed.  The price is taken from the first of the card, sale
        and regular price blocks that yields a value.  The item is emitted
        only if the item number shown on the page equals
        ``response.meta['sku']``.

        Pages that are not HTML, or that lack a title, produce nothing.
        A missing bottle size, item number or numeric product id in the URL
        no longer raises; the corresponding piece is simply left out.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        title = hxs.select(
            u'//h1[@class="pagetitle"]/text()').extract()
        if not title:
            # Not a product page (or the layout changed): nothing to emit.
            return
        name = title[0].strip()

        bottle_size = hxs.select(
            u'//div[child::strong[contains(text(), "Bottle Size") or contains(text(), "Size of Bottle")]]/span/text()'
        ).extract()
        if not bottle_size:
            bottle_size = hxs.select(
                u'//div[contains(text(),"Size of Bottle")]/span/text()'
            ).extract()
        if bottle_size:
            name += ' ' + bottle_size[0].strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        # Raw string: "\d" is a regex class, not a string escape.
        sku_match = re.search(r'product/(\d+)', response.url)
        if sku_match:
            loader.add_value('sku', sku_match.groups())
        loader.add_value('name', name)

        # Try the price blocks in order of preference and stop at the first
        # one that produces a value.
        loader.add_xpath('price', u'//div[@class="cardPrice"]/text()')
        fallback_price_xpaths = (
            u'//div[@class="salePrice"]/text()',
            u'//div[@class="regularPrice"]/text()',
            u'//div[@class="regularprice"]/text()',
        )
        for price_xpath in fallback_price_xpaths:
            if loader.get_output_value('price'):
                break
            loader.add_xpath('price', price_xpath)

        site_sku = hxs.select(u'//span[@class="itemnumber"]/text()').re(
            u'- (.*)')
        search_sku = response.meta['sku'].strip()
        # Without an item number on the page the match cannot be confirmed.
        if site_sku and site_sku[0].strip() == search_sku:
            yield loader.load_item()
コード例 #9
0
ファイル: amazon_spider.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Emit the lowest-priced search result that passes ``valid_price``.

        All result blocks under ``atfResults`` are scanned.  A result becomes
        the current pick when it has a price, is cheaper than the previous
        pick (or is the first one), and its price is accepted by
        ``valid_price`` against ``response.meta["price"]``.  If no result
        qualifies, nothing is yielded.
        """
        hxs = HtmlXPathSelector(response)
        title_anchor = './/*[contains(@class, "Title") or contains(@class, "title")]//a'

        pr = None
        for product in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath("name", title_anchor + "/text()")
            loader.add_xpath("url", title_anchor + "/@href")
            loader.add_xpath("price", './/*[@class="newPrice"]//span/text()')
            loader.add_value("sku", response.meta["sku"])
            loader.add_value("identifier", response.meta["sku"])

            price = loader.get_output_value("price")
            is_cheaper = price and (
                pr is None or pr.get_output_value("price") > price)
            if is_cheaper and valid_price(response.meta["price"], price):
                pr = loader

        if pr:
            yield pr.load_item()
コード例 #10
0
    def parse_node(self, response, node):
        """Turn one feed node into a ``Product`` item.

        The identifier (also used as SKU) is the path segment following
        ``product/`` in the node's ``product-url``.  The price has its comma
        replaced by a dot before it is loaded.  Shipping is free above 399
        and costs 25 otherwise.

        :param response: the feed response; anything that is not an
            ``XmlResponse`` is ignored.
        :param node: selector for a single product node.
        :returns: the loaded item, or an empty ``Product`` when the node has
            no usable price, product URL or identifier.
        """
        if not isinstance(response, XmlResponse):
            return

        product_url = node.select(u'./product-url/text()')
        identifier = product_url.re(r'product/([^/]+)/')
        url = product_url.extract()
        price = node.select(u'./price/text()').extract()
        if not (identifier and url and price):
            # Incomplete node: handled like a node without a price instead
            # of failing with an IndexError.
            return Product()
        identifier = identifier[0]

        loader = ProductLoader(item=Product(), selector=node)
        loader.add_value('url', url[0])
        loader.add_xpath('name', u'./title/text()')
        loader.add_value('price', price[0].replace(',', '.'))
        loader.add_xpath('category', u'merchant-category/text()')
        loader.add_xpath('image_url', u'image-url/text()')
        loader.add_value('sku', identifier)
        loader.add_value('identifier', identifier)

        # Check that a price was actually loaded before comparing it, so the
        # shipping rule never compares against a missing value.
        loaded_price = loader.get_output_value('price')
        if not loaded_price:
            return Product()

        if loaded_price > 399:
            loader.add_value('shipping_cost', '0')
        else:
            loader.add_value('shipping_cost', '25')
        return loader.load_item()
コード例 #11
0
ファイル: fonq_nl.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield a Lego ``Product`` read from a product detail page.

        The identifier is the ``product_id`` value found in the raw page
        body.  The price text is converted with ``extract_price_eu``.  Stock
        is '1' when a price was loaded and '0' otherwise, and a shipping cost
        of 2.95 is added when the price is below 20.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        product_ids = re.findall("product_id = '(\d+)'", response.body)
        loader.add_value('identifier', product_ids[0])
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@class="page-header"]/h1/text()')

        price_parts = hxs.select(
            '//div[@class="price price-large"]/div[@class="price"]/span[@itemprop="price"]/text()'
        ).extract()
        loader.add_value('price', extract_price_eu(''.join(price_parts)))

        loader.add_xpath(
            'sku',
            '//tr/td[contains(strong/text(), "Bestelcode")]/../td[2]/text()')
        loader.add_value('category', 'Lego')

        gallery_images = hxs.select(
            '//div[@id="productgallery-image-display"]//img/@src').extract()
        if gallery_images:
            absolute_image = urljoin_rfc(get_base_url(response),
                                         gallery_images[0])
            loader.add_value('image_url', absolute_image)

        loader.add_value('brand', 'lego')

        # A product counts as in stock exactly when a price was loaded.
        in_stock = '1' if loader.get_output_value('price') else '0'
        loader.add_value('stock', in_stock)

        if loader.get_output_value('price') < 20:
            loader.add_value('shipping_cost', 2.95)

        yield loader.load_item()
コード例 #12
0
ファイル: japanese-koi.py プロジェクト: ontiyonke/lib
    def parse_product(self, response):
        """Yield the product, or one item per selectable option.

        The base price is the special price when present and the regular
        price otherwise.  For pages with an options list, each option yields
        an item named "<name> <option label>" whose price is the base price
        plus the option's surcharge (0.00 when the option shows none), with
        pound signs stripped before the amounts are summed.  Items without a
        loaded price are not yielded.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0]
        price_texts = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
        if not price_texts:
            price_texts = hxs.select(u'//span[@class="regular-price"]/span[@class="price"]/text()').extract()
        base_price = price_texts[0]

        product_options = hxs.select(u'//ul[@class="options-list"]/li')
        if not product_options:
            # Plain product: a single item at the base price.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', base_price)
            if loader.get_output_value('price'):
                yield loader.load_item()
            return

        # The pound sign has to go before Decimal can parse the amount.
        base_amount_text = base_price.replace(u'\xa3', u'')
        for option in product_options:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)

            option_label = option.select(u'./span[@class="label"]/label/text()').extract()[0]
            loader.add_value('name', name + u' %s' % option_label)

            surcharge = option.select(u'./span[@class="label"]/label/span/span/text()').extract()
            surcharge_text = surcharge[0].replace(u'\xa3', u'') if surcharge else u''
            total = Decimal(base_amount_text) + (
                Decimal(surcharge_text) if surcharge_text else Decimal('0.00'))
            loader.add_value('price', total)

            if loader.get_output_value('price'):
                yield loader.load_item()
コード例 #13
0
    def parse_product(self, response):
        """Yield a Lego ``Product`` read from a product detail page.

        The SKU is the first run of three or more digits in the "Artikelkod"
        cell; when there is none a log line is written and the item is
        emitted without SKU.  The identifier is derived from the og:image
        file name when the page has one and from the last URL segment
        otherwise.  Shipping is free above 1500 and costs 49 otherwise;
        stock is '1' when the buy button has an ``onclick`` handler.
        """
        import re
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1//text()')
        loader.add_xpath('price', '//div[@class="buybutton"]//nobr//text()')

        sku = ''.join(hxs.select('//td[contains(text(), "Artikelkod")]/text()').extract())
        # Test the match explicitly instead of wrapping it in a bare
        # ``except``, which would also hide unrelated errors.
        sku_match = re.search(r'(\d{3}\d*)', sku)
        if sku_match:
            loader.add_value('sku', sku_match.group(1))
        else:
            self.log('No SKU for %s' % (response.url))
        loader.add_value('category', response.meta.get('category'))

        img = hxs.select('//meta[@property="og:image"]/@content').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
            # Image file names start with the product id, up to the first "-".
            # NOTE(review): depends on the site's image naming -- confirm.
            loader.add_value('identifier', loader.get_output_value('image_url').split('/')[-1].split('-')[0])
        else:
            loader.add_value('identifier', loader.get_output_value('url').split('/')[-1])

        loader.add_value('brand', 'lego')

        # A missing price is charged shipping; check it explicitly rather
        # than comparing ``None`` with a number.
        price = loader.get_output_value('price')
        if price and price > 1500:
            loader.add_value('shipping_cost', '0')
        else:
            loader.add_value('shipping_cost', '49')

        if hxs.select('//div[@class="buybutton" and @onclick]'):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        yield loader.load_item()
コード例 #14
0
ファイル: angelfishaquatics.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product, or one item per entry of its options dropdown.

        Without an options dropdown a single item is built from the page
        name and main price.  With one, every ``<option>`` yields an item
        named "<name> <option text>".  When a ``<select>`` with an
        ``onchange`` attribute is found next to the dropdown, the option's
        price is looked up in the page script (the ``_EKM_PRODUCTPRICE``
        assignment that follows the option value); otherwise the main price
        is used for every option.  Items without a loaded price are not
        yielded.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="product-right"]//div[@class="pp-name"]/h1/text()').extract()[0].strip()
        main_price = hxs.select(u'//div[@class="product-right"]//div[@class="pp-price"]/span/span/text()').extract()[0]
        product_options = hxs.select(u'//select[@class="ekm-productoptions-dropdown-option"]')
        if product_options:
            body = response.body.replace('\xc2', ' ')
            # Always bind the flag.  It used to be assigned only when the
            # onchange select existed, so pages without it failed with an
            # unbound local variable as soon as the flag was read.
            set_option_price = bool(
                product_options.select(u'../select[@onchange]'))
            for option in product_options.select(u'./option'):
                name_with_option = name + u' %s' % option.select(u'./text()').extract()[0].strip()
                if set_option_price:
                    option_value = option.select(u'./@value').extract()[0]
                    # Escape the value so regex metacharacters in it are
                    # matched literally.
                    pattern = (r"== '%s'.*?_EKM_PRODUCTPRICE.*?= '([\d\.]+?)'"
                               % re.escape(option_value))
                    # NOTE(review): raises AttributeError when the script has
                    # no price entry for this option value.
                    price = re.search(pattern, body, re.DOTALL).group(1)
                else:
                    price = main_price

                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('name', name_with_option)
                loader.add_value('price', price)
                loader.add_value('url', response.url)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', main_price)
            if loader.get_output_value('price'):
                yield loader.load_item()
コード例 #15
0
ファイル: lego.py プロジェクト: oceancloud82/scraping
    def parse(self, response):
        """Yield one LEGO product per row of ``legodk_products.csv``.

        The response body is read as CSV and indexed by "Product No." to
        serve as the source of previous prices.  For each row of the local
        file the "RRP price DKK" column is used as price; when it is empty,
        the price and item number of the matching previous row are used
        instead, or '0.0' with the row's own item number when there is no
        previous row.  Identifiers already in ``self.seen_ids`` are skipped.
        """
        previous_rows = {}
        for old_row in csv.DictReader(StringIO(response.body)):
            previous_rows[old_row['Product No.']] = old_row

        with open(os.path.join(HERE, 'legodk_products.csv')) as f:
            for row in csv.DictReader(f):
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('sku', row['Product No.'])
                loader.add_value('category', row['Theme'])
                loader.add_value('brand', 'LEGO')
                loader.add_value(
                    'name', row['Item Description English'].decode('utf8'))

                price = row.get('RRP price DKK')
                identifier = row['Item no'].lower()
                if not price:
                    price = '0.0'
                    previous = previous_rows.get(row['Product No.'])
                    if previous:
                        # Fall back to what the previous export recorded.
                        price = previous.get('RRP price DKK')
                        identifier = previous['Item no'].lower()
                loader.add_value('price', price)
                loader.add_value('identifier', identifier)

                loaded_id = loader.get_output_value('identifier')
                if loaded_id not in self.seen_ids:
                    self.seen_ids.add(loader.get_output_value('identifier'))
                    yield loader.load_item()
コード例 #16
0
ファイル: ebay.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the first search result when it is an acceptable match.

        The first result is the ``td`` (or, failing that, the ``table``)
        carrying ``r="1"``.  When the page has none, the same URL is
        requested again, up to three retries tracked in
        ``response.meta['_retries']``; after that the page is given up on.

        The item is yielded only when its name does not contain
        "apparelsave" and ``valid_price`` accepts its price against
        ``response.meta['price']``.  A result without a name is skipped.
        """
        hxs = HtmlXPathSelector(response)

        product = hxs.select('//td[@r="1"]') or hxs.select('//table[@r="1"]')

        if not product:
            retries = response.meta.get('_retries', 0)
            if retries < 3:
                # dont_filter: the URL has already been seen by the
                # duplicate filter and would otherwise be dropped.
                yield Request(response.url,
                              meta={'sku': response.meta['sku'],
                                    '_retries': retries + 1},
                              dont_filter=True)
            return

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
        loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
        # Several page layouts exist; every known price location is fed in.
        loader.add_xpath('price', './/div[@class="prices"]//span[@class="amt"]/text()')
        loader.add_xpath('price', './/div[@class="prices"]//span[@class="g-b amt"]/text()')
        loader.add_xpath('price', './/td[@class="prc"]//div[@class="g-b"]/text()')
        loader.add_xpath('price', './/*[@itemprop="price"]/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])

        name = loader.get_output_value('name')
        if name is None:
            # No title link found: there is nothing to check or report.
            return

        if 'apparelsave' not in name.lower() \
           and valid_price(response.meta['price'], loader.get_output_value('price')):
            yield loader.load_item()
コード例 #17
0
    def parse_product(self, response):
        """Yield the product shown on a detail page.

        The image URL is read with XPath first; when that finds nothing the
        body is re-parsed with BeautifulSoup and the first ``img`` inside the
        ``prodImageContainer`` row is used.  The price is taken from the
        first of three known price elements that yields a value.  SKU and
        (lower-cased) identifier come from ``response.meta['sku']``.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
        loader.add_value('url', response.url)

        loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
        if not loader.get_output_value(u'image_url'):
            def inside_image_row(tag):
                return tag.name == u'img' and tag.findParent(u'tr', id=u'prodImageContainer')

            image_tag = BeautifulSoup(response.body).find(inside_image_row)
            if image_tag:
                loader.add_value('image_url', image_tag.get(u'src'))

        loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')

        loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
        for fallback_xpath in (u'//span[@class="priceLarge"]/text()',
                               u'//span[@class="price"]/text()'):
            if loader.get_output_value('price'):
                break
            loader.add_xpath('price', fallback_xpath)

        sku = response.meta['sku']
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku.lower())
        yield loader.load_item()
コード例 #18
0
ファイル: portonaquapet.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product for the currently selected option.

        On the first visit of a page with an options dropdown (no
        ``requested`` key in ``response.meta``), the ``aspNetForm`` form is
        re-submitted once per option with this method as callback, so every
        option's page is visited.  In all cases an item is then built for
        the option currently marked as selected: its text is appended to the
        name, and the offer price is preferred over the regular one.  Items
        without a loaded price are not yielded.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
        multiple_options = hxs.select(u'//select[@class="mpv_itemalst"]//option')
        if multiple_options:
            if u'requested' not in response.meta:
                for option in multiple_options:
                    option_value = option.select(u'./@value').extract()[0]
                    postback = {
                        u'ctl00$MainContent$ItemAList': option_value,
                        u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                        u'__EVENTARGUMENT': u'',
                    }
                    # The "requested" flag stops the follow-up pages from
                    # fanning out again.
                    yield FormRequest.from_response(
                        response,
                        formname=u'aspNetForm',
                        formdata=postback,
                        meta={u'requested': True},
                        dont_click=True,
                        callback=self.parse_product)
            selected_text = multiple_options.select(u'../option[@selected]/text()').extract()[0]
            name += u' %s' % selected_text.strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
コード例 #19
0
ファイル: fragrancenet.py プロジェクト: 0--key/lib
    def parse_products(self, hxs, response):
        """Yield the products listed in the page's promo cells.

        For each cell the name is the cell's ``para1`` text with whitespace
        collapsed.  A cell without an "Our Price" value is followed up by
        requesting its URL with ``self.parse_products2``.  When the price
        text does not start with "$" and the page has been retried fewer
        than three times (``response.meta['ret']``), the whole page is
        requested again and processing of the remaining cells stops.
        """
        cells = hxs.select('//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]')

        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)

            name_parts = cell.select('.//p[@class="para1"]//text()').extract()
            name = re.sub(' +', ' ', ' '.join([part.strip() for part in name_parts]))

            loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
            loader.add_value('name', name)
            loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
            loader.add_xpath('price', './/p[@class="para3"]/text()', re='Our Price: (.*)')

            if not loader.get_output_value('price'):
                # No price in the listing: look it up on the product page.
                yield Request(loader.get_output_value('url'), callback=self.parse_products2)
                continue

            shown_price = cell.select('.//p[@class="para3"]/text()').re('Our Price: (.*)')[0]
            attempts = response.meta.get('ret', 0)
            if not shown_price.startswith('$') and attempts < 3:
                # Price is not shown in dollars: fetch the page again.
                yield Request(response.url, dont_filter=True, meta={'ret': attempts + 1})
                return

            yield loader.load_item()
コード例 #20
0
    def parse_product(self, response):
        """Yield a Lego ``Product`` read from a product detail page.

        The identifier is the number embedded in the ``id`` attribute of the
        product name heading (``product_name_<n>``); the price comes from the
        ``itemprop="price"`` meta tag and is converted with
        ``extract_price``.  Pages lacking either one are logged and
        produce no item.  The SKU is the last run of digits in the product
        name, when there is one.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        identifier = hxs.select('//h1[@itemprop="name"]/@id').re(
            "product_name_([0-9]+)")
        if not identifier:
            log.msg('Product without identifier: ' + response.url)
            return

        prices = hxs.select(
            '//div[@id="productdetail"]/div/span/meta[@itemprop="price"]/@content'
        ).extract()
        if not prices:
            # Previously an IndexError from pop() on the empty list.
            log.msg('Product without price: ' + response.url)
            return

        loader.add_value('identifier', identifier[0])
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('price', extract_price(prices.pop()))

        try:
            sku = re.findall(r'(\d+)', loader.get_output_value('name'))[-1]
        except (TypeError, IndexError):
            # TypeError: no name was loaded; IndexError: the name contains
            # no digits.  Either way the item simply has no SKU.
            sku = None
        if sku is not None:
            loader.add_value('sku', sku)

        loader.add_xpath(
            'category',
            '//div[@id="widget_breadcrumb"]/ul/li[last() - 1]/a/text()')
        loader.add_xpath('image_url', '//a[@id="PD_image_zoom"]/@href')
        loader.add_value('brand', 'lego')
        if loader.get_output_value('identifier'):
            yield loader.load_item()
コード例 #21
0
ファイル: amazon_spider.py プロジェクト: ontiyonke/lib
    def parse(self, response):
        """Yield the cheapest acceptable product from a search-results page.

        Each result block is re-parsed with BeautifulSoup: name and URL come
        from the ``h3.newaps`` heading, the price from the first ``span`` in
        the ``ul.rsltL`` list.  A result replaces the current pick when it
        has a price, is cheaper than the pick so far, and ``valid_price``
        accepts it against ``response.meta['price']``.
        """
        hxs = HtmlXPathSelector(response)
        result_blocks = hxs.select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

        cheapest = None
        for block in result_blocks:
            loader = ProductLoader(item=Product(), selector=block)
            soup = BeautifulSoup(block.extract())

            heading = soup.find('h3', attrs={'class': 'newaps'})
            loader.add_value('name', heading.findAll('span')[0].string)
            loader.add_value('url', heading.findAll('a')[0]['href'])

            price_list = soup.find('ul', attrs={'class': 'rsltL'})
            loader.add_value('price', price_list.findAll('span')[0].string)

            price = loader.get_output_value('price')
            if not price:
                continue
            if cheapest is not None and not (
                    cheapest.get_output_value('price') > price):
                continue
            if valid_price(response.meta['price'], price):
                cheapest = loader

        if cheapest:
            yield cheapest.load_item()
コード例 #22
0
ファイル: testequipmentdepot.py プロジェクト: 0--key/lib
    def parse_products(self, hxs, response):
        """Yield one ``Product`` per data row of the page's order-info table.

        The column positions of the "Model", "Description" and "Price"
        header cells are computed with XPath ``count()`` expressions (number
        of preceding siblings plus one).  Every row whose cell at the model
        position is not the header itself is then read by position: the
        description cell gives the name and the price cell the price.  The
        item URL is the link in the model cell when there is one and the
        page URL otherwise.  Rows without a price or without a non-blank
        name are skipped.

        :param hxs: selector over the page.
        :param response: the page response, used for URL resolution.
        """
        model_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                               ' and text()="Model"]/preceding-sibling::*) + 1').extract()
        description_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                     ' and text()="Description"]/preceding-sibling::*) + 1').extract()
        price_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                ' and text()="Price"]/preceding-sibling::*) + 1').extract()

        if model_pos and description_pos and price_pos:
            # count() results are extracted as float text ("3.0"); keep only
            # the integer part for use in position() tests.
            model_pos = model_pos[0].split('.')[0]
            description_pos = description_pos[0].split('.')[0]
            price_pos = price_pos[0].split('.')[0]

            products = hxs.select('//td[starts-with(@class, "orderinfo") and position()=%s \
                                   and not(text()="Model")]/..' % model_pos)
            for product in products:
                loader = ProductLoader(selector=product, item=Product())
                url = response.url
                model_url = product.select('.//td[starts-with(@class, "orderinfo") \
                                            and position()=%s]//a/@href' % model_pos).extract()
                if model_url:
                    url = urljoin_rfc(get_base_url(response), model_url[0])

                loader.add_value('url', url)
                loader.add_xpath('name', './/td[starts-with(@class, "orderinfo") and position()=%s]/text()' % description_pos)
                loader.add_xpath('price', './/td[starts-with(@class, "orderinfo") and position()=%s]//text()' % price_pos)

                # A row may have no description at all; check for that
                # before calling strip() on it.
                name = loader.get_output_value('name')
                if not loader.get_output_value('price') or not name or not name.strip():
                    continue

                yield loader.load_item()
コード例 #23
0
ファイル: japanese-koi.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product on this page, one item per selectable option.

        The price is read from the "special-price" block, falling back to the
        "regular-price" block.  When the page has an options list, one item is
        yielded per option, named "<name> <option label>" and priced at the
        base price plus the option's surcharge (if any).  Otherwise a single
        item is yielded.  Items whose loader produces no price are dropped.

        Pages without a product name or price, and options without a label,
        are skipped instead of raising IndexError.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()
        base_price = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
        if not base_price:
            base_price = hxs.select(u'//span[@class="regular-price"]/span[@class="price"]/text()').extract()
        if not name or not base_price:
            return
        name = name[0]
        base_price = base_price[0]

        product_options = hxs.select(u'//ul[@class="options-list"]/li')
        if not product_options:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("name", name)
            loader.add_value("price", base_price)
            if loader.get_output_value("price"):
                yield loader.load_item()
            return

        # Strip the pound sign once; the value is the same for every option.
        base_amount = Decimal(base_price.replace(u"\xa3", u""))
        for option in product_options:
            label = option.select(u'./span[@class="label"]/label/text()').extract()
            if not label:
                continue
            extra_price = option.select(u'./span[@class="label"]/label/span/span/text()').extract()
            if extra_price:
                extra_amount = Decimal(extra_price[0].replace(u"\xa3", u""))
            else:
                extra_amount = Decimal("0.00")

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("name", name + u" %s" % label[0])
            loader.add_value("price", base_amount + extra_amount)
            if loader.get_output_value("price"):
                yield loader.load_item()
0
ファイル: 6pm.py プロジェクト: oceancloud82/scraping
 def parse(self, response):
     """Yield the product matching the search carried in ``response.meta``.

     Two page layouts are handled:

     * a product page (``div.product-page``): the name is built from the
       heading link text plus the heading's own text;
     * a search results page: the first result whose brand + product name
       contains every word of ``response.meta['name']`` is yielded and the
       search stops.

     Items whose name contains "apparelsave" are never yielded.  On a
     product page both name parts are now required, matching the check
     already done for search results, so a heading without its own text no
     longer raises IndexError.
     """
     base_url = get_base_url(response)
     hxs = HtmlXPathSelector(response)
     product = hxs.select('//div[@class="product-page"]')
     if product:
         loader = ProductLoader(item=Product(), selector=product)
         name = product.select(
             './/h1[@class="main-heading standard-header"]/a/text()'
         ).extract()
         name2 = product.select(
             './/h1[@class="main-heading standard-header"]/text()').extract()
         if name and name2:
             price = "".join(
                 product.select('.//span[@id="price"]/text()').re(
                     r'([0-9\,\. ]+)')).strip()
             loader.add_value('name',
                              name[0].strip() + ' ' + name2[0].strip())
             loader.add_value('url', response.url)
             loader.add_value('price', price)
             loader.add_value('sku', response.meta['sku'])
             if 'apparelsave' not in loader.get_output_value('name').lower():
                 yield loader.load_item()
         return

     for product in hxs.select('.//div[@id="searchResults"]/a'):
         name = product.select(
             './span[@class="brandName"]/text()').extract()
         name2 = product.select(
             './span[@class="productName"]/text()').extract()
         if not (name and name2):
             continue

         product_name = name[0].strip() + ' ' + name2[0].strip()
         product_words = product_name.lower().strip().split(' ')
         search_words = response.meta['name'].lower().replace(
             '+', ' ').split(' ')
         # Every searched word must appear in the result's name.
         if any(w not in product_words for w in search_words):
             continue

         price = "".join(
             product.select('./span[@class="price-6pm"]/text()').re(
                 r'([0-9\,\. ]+)')).strip()
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_value('name', product_name)
         loader.add_value(
             'url',
             urljoin_rfc(base_url,
                         product.select('.//@href').extract()[0]))
         loader.add_value('price', price)
         loader.add_value('sku', response.meta['sku'])
         if 'apparelsave' not in loader.get_output_value('name').lower():
             yield loader.load_item()
             # Only the first matching result is wanted.
             break
コード例 #25
0
    def parse_product(self, response):
        """Yield the base product, its priced option variants and size pages.

        The base item is built from the page's hidden inputs, heading, net
        price, breadcrumb, image and brand logo.  ``adjustment(a,b,c)`` calls
        found in the response body are read as price adjustments keyed by
        ``(a, b)``; for every ``<select>`` (other than the size jump list and
        the quantity box) the options with a non-zero adjustment are
        collected, and one extra item is yielded per combination returned by
        ``multiply`` (defined outside this block; assumed to yield
        ``(price, name, id)`` tuples - confirm against its definition).

        Option variants are skipped when the URL is in ``self.SKIP_OPTIONS``
        or no price was found.  Finally a request is issued for every entry of
        the size jump list.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('sku', '//input[@id="txtEMLNEID"]/@value')
        loader.add_value('identifier', ':'.join(hxs.select('//input[@id="txtEMLNEID"]/@value').extract() + hxs.select('//input[@id="txtEMSZEID"]/@value').extract()))
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')
        loader.add_xpath('price', '//div[@id="net"]/text()')
        loader.add_xpath('category', '//div[@id="bread_crumb"]/a[3]/text()')

        img = hxs.select('//img[@id="product_image"]/@src').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))

        # Brand is derived from the logo file name: ".../brands/foo-bar.png" -> "foo bar".
        brand = ''.join(hxs.select('//img[contains(@src, "/brands/")]/@src').extract())
        loader.add_value('brand', brand.split('/')[-1].split('.')[0].replace('-', ' '))

        if loader.get_output_value('price'):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        # The selected size label is appended to the name, minus its last
        # dash-separated part.
        size = ''.join(hxs.select('normalize-space(//select[@onchange="jump(this.value)"]//option[@selected="selected"]/text())').extract())
        loader.add_value('name', '-'.join(size.split('-')[:-1]))

        price_adj = {}
        for group_id, option_id, amount in re.findall(r'adjustment\((\d+),(\d+),([\d.,]+)\)', response.body):
            price_adj[(group_id, option_id)] = float(amount)

        # Include only options that change the price
        opt_groups = []
        for sel in hxs.select('//select[@onchange!="jump(this.value)" and @id!="quantity"]'):
            select_ids = sel.select('./@id').re(r'\d+')
            if not select_ids:
                # A select without a numeric id cannot be matched to adjustments.
                continue
            select_id = select_ids[0]
            opts = []
            for opt in sel.select('.//option'):
                value = opt.select('./@value').extract()[0]
                text = opt.select('normalize-space(./text())').extract()[0]
                adjustment = price_adj.get((select_id, value))
                if adjustment is not None and adjustment != 0.0:
                    opts.append((adjustment, text, value))
            if opts:
                opt_groups.append(opts)

        prod = loader.load_item()
        if prod.get('identifier'):
            yield prod
            if response.url not in self.SKIP_OPTIONS and loader.get_output_value('price'):
                for opt_price, opt_name, opt_id in multiply(opt_groups):
                    p = Product(prod)
                    p['name'] = p['name'] + ' ' + opt_name
                    p['price'] = p['price'] + Decimal(opt_price).quantize(Decimal('1.00'))
                    if opt_id:
                        p['identifier'] = p['identifier'] + ':' + opt_id
                    else:
                        p['identifier'] = p['identifier'] + '-'
                    yield p

        for url in hxs.select('//select[@onchange="jump(this.value)"]//option/@value').extract():
            yield Request(urljoin_rfc(get_base_url(response), url), callback=self.parse_product)
コード例 #26
0
    def parse_product(self, response):
        """Yield one item per product option, followed by the base item.

        The base item carries name, URL, brand (from ``response.meta``),
        breadcrumb categories after the first two, the hidden ``product_id``
        as identifier, image, price, shipping cost and stock.  Shipping is
        '2.99' for prices below 30.00 and '0.00' otherwise.  Each option of
        the options ``<select>`` produces a copy whose identifier is
        "<product_id>-<option value>" and whose name has the option text
        appended.

        Pages without a ``product_id`` or without a price are logged and
        skipped (previously they raised IndexError/TypeError).
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@class="pro-name"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand'))
        categories = hxs.select(
            '//div[contains(@class,"breadcrumbs")]/a/text()').extract()
        for category in categories[2:]:
            loader.add_value('category', category)

        identifiers = hxs.select(
            './/input[@type="hidden" and @name="product_id"]/@value'
        ).extract()
        if not identifiers:
            self.log('No product_id found on %s' % response.url)
            return
        identifier = identifiers[0]
        loader.add_value('identifier', identifier)

        image_url = hxs.select('//div[@class="image"]/a/@href').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))

        price = hxs.select(
            '//div[@class="total-price"]/span[@class="price-total"]/text()'
        ).extract()
        loader.add_value('price', price)

        price_value = loader.get_output_value('price')
        self.log(price_value)
        if price_value is None:
            self.log('No price found on %s' % response.url)
            return

        # Add the shipping cost exactly once.  Adding '0.00' first and '2.99'
        # afterwards leaves two collected values, and a loader that outputs
        # the first value would always report '0.00'.
        if Decimal(price_value) < Decimal('30.00'):
            loader.add_value('shipping_cost', '2.99')
        else:
            loader.add_value('shipping_cost', '0.00')

        stock = hxs.select(
            './/div[@class="stock-level"]/span[contains(text(),"In Stock")]')
        if not stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        options = hxs.select(
            '//div[@class="options"]/div/select/option[not(contains(text(),"Select"))]'
        )
        for option in options:
            option_name = option.select('./text()')[0].extract().strip()
            option_item = deepcopy(item)
            option_item['identifier'] = '{}-{}'.format(
                identifier,
                option.select('./@value')[0].extract())
            option_item['name'] += ' ' + option_name
            yield option_item

        # NOTE(review): the original used ``for ... else`` without a ``break``,
        # so the base item was always yielded, even when options exist.  That
        # behaviour is kept here; confirm whether the base item should only be
        # emitted when there are no options.
        yield item
コード例 #27
0
ファイル: aolcookshop.py プロジェクト: oceancloud82/scraping
    def parse(self, response):
        """Follow product-list links and, on a product page, yield the item.

        Every link inside the product list is requested.  If the page has a
        product heading, a Le Creuset item is built from it: name, URL,
        breadcrumb categories, the "Ref:" code as both SKU and identifier,
        image, price, shipping cost ('4.95' below 50, else '0'), stock flag
        and the special-offer text stored as ``metadata['promotion']``.
        """
        hxs = HtmlXPathSelector(response)
        listing_links = hxs.select(
            '//div[@class="product_list"]//a/@href').extract()
        for href in listing_links:
            yield Request(urljoin_rfc(get_base_url(response), href))

        title_xpath = '//span[@class="product"]/h1/text()'
        if not hxs.select(title_xpath):
            # Not a product page; only the links above are of interest.
            return

        # The reference code doubles as SKU and identifier.
        ref_xpath = 'substring-after(//font[@size="1" and contains(text(), "Ref:")]/text(), ": ")'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', title_xpath)
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        loader.add_xpath(
            'category',
            '//div[@class="text_breadcrumbs"]/a[position()>1]//text()')
        loader.add_xpath('sku', ref_xpath)
        loader.add_xpath('identifier', ref_xpath)

        images = hxs.select('//img[@class="fullimage1"]/@src').extract()
        if images:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), images[0]))

        loader.add_xpath('price',
                         '//h3[@class="product_price"]/prices/span[2]/text()')
        if not loader.get_output_value('price'):
            # Fall back to any text inside the price heading.
            loader.add_xpath('price', '//h3[@class="product_price"]//text()')

        cheap = loader.get_output_value('price') < 50
        loader.add_value('shipping_cost', '4.95' if cheap else '0')

        in_stock = hxs.select(
            '//div[@class="stock-message"]/span[contains(.//text(), "In stock") or contains(.//text(), "plenty of stock in")]'
        )
        loader.add_value('stock', '1' if in_stock else '0')

        item = loader.load_item()
        promotion_parts = hxs.select(
            '//div[@class="special-offer-message"]/span/text()').extract()
        metadata = LeCreusetMeta()
        metadata['promotion'] = ''.join(promotion_parts)
        item['metadata'] = metadata

        yield item
0
    def parse(self, response):
        """Yield the cheapest acceptable result of a search results page.

        For each result div the name, price and URL are read.  Results
        without a name or price are skipped; a missing name is only logged for
        the first result, and a missing URL is logged but does not discard the
        result.  SKU and identifier both come from ``response.meta['sku']``.
        The result with the lowest price accepted by ``valid_price`` against
        ``response.meta['price']`` is yielded, if any.
        """
        hxs = HtmlXPathSelector(response)
        results = hxs.select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

        best = None
        for position, result in enumerate(results, 1):
            candidate = ProductLoader(item=Product(), selector=result)

            names = result.select(
                './/h3[@class="newaps"]/a/span/text()').extract()
            if not names:
                if position == 1:
                    self.log("ERROR name not found")
                continue
            candidate.add_value('name', names[0])

            # Try the list layout first, then the grid layout.
            prices = (
                result.select(
                    './/ul[@class="rsltL"]//span[1]/text()').extract()
                or result.select(
                    './/ul[contains(@class,"rsltGridList grey")]//span[1]/text()'
                ).extract()
            )
            if not prices:
                self.log("ERROR price not found2")
                continue
            candidate.add_value('price', prices[0])

            links = result.select('.//h3[@class="newaps"]/a/@href').extract()
            if links:
                candidate.add_value('url', links[0])
            else:
                self.log("ERROR url not found")

            candidate.add_value('sku', response.meta['sku'])
            candidate.add_value('identifier', response.meta['sku'])

            price = candidate.get_output_value('price')
            if not price:
                continue
            if best is not None and not (best.get_output_value('price') > price):
                continue
            if valid_price(response.meta['price'], price):
                best = candidate

        if best:
            yield best.load_item()
コード例 #29
0
ファイル: gitarhuset.py プロジェクト: 0--key/lib
    def parse_node(self, response, node):
        """Build a product from one XML feed node.

        URL and name come from the node's ``product-url`` and ``title``
        children; the price text has its decimal comma replaced by a dot.
        The name and price are logged as JSON.

        :returns: ``None`` for non-XML responses, the loaded item when the
            loader produced a price, and an empty ``Product`` otherwise
            (including when the node has no ``price`` text, which previously
            raised IndexError).
        """
        if not isinstance(response, XmlResponse):
            return

        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('url', u'./product-url/text()')
        loader.add_xpath('name', u'./title/text()')

        raw_price = node.select(u'./price/text()').extract()
        if not raw_price:
            return Product()

        price = raw_price[0].replace(',', '.')
        loader.add_value('price', price)
        log.msg(json.dumps({'name': loader.get_output_value('name'), 'price': price}))
        if loader.get_output_value('price'):
            return loader.load_item()
        else:
            return Product()
コード例 #30
0
    def parse_product(self, response):
        """Yield the product on this page when it has both a name and a price.

        The price is assembled from the text of the "priceXL" element and the
        digits found in its ``<sup>`` child, joined with a dot.  When there
        are no ``<sup>`` digits the main text is used on its own instead of
        raising IndexError.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1[contains(@class,"fpProdutTitle")]/text()')

        price = hxs.select(u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceXL")]/text()').extract()
        if price:
            fraction = hxs.select(u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceXL")]/sup/text()').re(u'(\\d+)')
            if fraction:
                price = price[0] + '.' + fraction[0]
            else:
                price = price[0]
            product_loader.add_value('price', price)

        if product_loader.get_output_value('name') and product_loader.get_output_value('price'):
            yield product_loader.load_item()
コード例 #31
0
ファイル: fun.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield the Lego product described on this page.

        Collects name, URL, price (special price preferred over regular
        price, 0 when neither is present), image, "Thema" category, brand,
        identifier and "Artikelnummer" SKU.  Pages without an identifier are
        logged and dropped.  Stock is set to 0 when an out-of-stock message is
        present or the price is not positive; shipping of 2.99 is added for
        prices below 50.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(selector=hxs, item=Product())

        title = hxs.select(
            '//div[@class="product-name"]/span/text()').extract()[0]
        loader.add_value('name', title.strip())
        loader.add_value('url', response.url)

        price_texts = (
            hxs.select(
                '//div[@class="buy-container"]//p[@class="special-price"]/span[@class="price"]/text()'
            ).extract()
            or hxs.select(
                '//div[@class="buy-container"]//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        )
        raw_price = price_texts[0] if price_texts else 0
        loader.add_value('price', extract_price(raw_price))

        images = hxs.select('//img[@id="image-0"]/@src').extract()
        if images:
            loader.add_value('image_url', urljoin(base_url, images[0]))

        loader.add_xpath(
            'category', '//li[span/text()="Thema"]/span[@class="data"]/text()')
        loader.add_value('brand', 'Lego')

        product_ids = hxs.select('//input[@name="product"]/@value').extract()
        if not product_ids:
            log.msg('ERROR >>> Product without identifier: ' + response.url)
            return
        loader.add_value('identifier', product_ids[0])

        loader.add_xpath(
            'sku',
            '//li[span/text()="Artikelnummer"]/span[@class="data"]/text()')

        sold_out = hxs.select('//span[@class="out-of-stock-msg"]')
        if sold_out or loader.get_output_value('price') <= 0:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', 2.99)

        yield loader.load_item()
コード例 #32
0
ファイル: shoebacca.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the first entry of the finder result list, if it is usable.

        Only the first ``<li>`` of the result list is looked at.  Its name is
        the heading's span text followed by the heading's own text; the price
        is the plain price text, falling back to the sale-price span.  The SKU
        comes from ``response.meta['sku']``.  Nothing is yielded when there
        are no results, the first result has no name, or the name contains
        "apparelsave".
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        results = hxs.select('//ul[@id="finder-data"]/li')
        if not results:
            return
        first = results[0]

        loader = ProductLoader(item=Product(), selector=first)
        title = "".join(first.select('./a/div/h5/span/text()').extract())
        if not title:
            return

        subtitle = "".join(first.select('./a/div/h5/text()').extract())
        link = first.select('./a/@href').extract()[0]

        price_pattern = r'([0-9\,\. ]+)'
        amount = "".join(
            first.select('./a/div[@class="p-price"]/text()').re(price_pattern)
        ).strip()
        if not amount:
            amount = "".join(
                first.select('./a/div[@class="p-price"]/span[@class="sale-price"]/text()').re(price_pattern)
            ).strip()

        loader.add_value('name', title.strip() + ' ' + subtitle.strip())
        loader.add_value('url', urljoin_rfc(base_url, link))
        loader.add_value('price', amount)
        loader.add_value('sku', response.meta['sku'])

        if 'apparelsave' in loader.get_output_value('name').lower():
            return
        yield loader.load_item()
コード例 #33
0
ファイル: outletbuy.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the buy-box product when its name covers the searched words.

        The style name found in the buy box is compared, word by word, with
        ``response.meta['name']`` (with ``+`` treated as a space); every
        searched word must appear among the product's words.  The SKU comes
        from ``response.meta['sku']``.  Names containing "apparelsave" are
        never yielded.
        """
        hxs = HtmlXPathSelector(response)

        buybox = hxs.select('//table[@class="buybox"]')
        if not buybox:
            return

        loader = ProductLoader(item=Product(), selector=buybox)
        titles = buybox.select('.//h1[@class="stylename"]/text()').extract()
        if not titles:
            return

        title = titles[0]
        wanted = response.meta['name'].lower().replace('+', ' ')
        log.msg(title.lower() + ' - ' + wanted)

        found_words = title.lower().strip().split(' ')
        wanted_words = wanted.split(' ')
        if not all(word in found_words for word in wanted_words):
            return

        amount = "".join(
            buybox.select('.//span[@class="price"]/span/text()').re(r'([0-9\,\. ]+)')
        ).strip()
        loader.add_value('name', title)
        loader.add_value('url', response.url)
        loader.add_value('price', amount)
        loader.add_value('sku', response.meta['sku'])

        if 'apparelsave' in loader.get_output_value('name').lower():
            return
        yield loader.load_item()
コード例 #34
0
ファイル: shoesteal.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the first product cell whose name covers the searched words.

        Each cell's name is "<brand> <name> <style>", built from three spans
        of its title link.  A cell matches when every whitespace-separated
        word of ``response.meta['name']`` appears among the name's words.
        The sale price is preferred over the plain price text.  Iteration
        stops after the first matching cell that is yielded; matching cells
        whose name contains "apparelsave" are passed over.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        span_xpath = './/div[@class="productBrandTitleColor"]/a/span[@class="%s"]/text()'
        price_pattern = r'([0-9\,\. ]+)'

        for cell in hxs.select('//div[@class="productCellWrapper"]'):
            loader = ProductLoader(item=Product(), selector=cell)

            brand = "".join(cell.select(span_xpath % 'brand').extract()).strip()
            style = "".join(cell.select(span_xpath % 'styleName color').extract()).strip()
            name = "".join(cell.select(span_xpath % 'styleName name').extract()).strip()
            name = brand + ' ' + name + ' ' + style

            found_words = name.lower().split(' ')
            wanted_words = response.meta['name'].lower().split()
            if not all(word in found_words for word in wanted_words):
                continue

            link = cell.select('.//div[@class="productBrandTitleColor"]/a/@href').extract()[0]
            amount = "".join(
                cell.select('.//div[@class="price"]/span[@class="salePrice"]/text()').re(price_pattern)
            ).strip()
            if not amount:
                amount = "".join(
                    cell.select('.//div[@class="price"]/text()').re(price_pattern)
                ).strip()

            loader.add_value('name', name)
            loader.add_value('url', urljoin_rfc(base_url, link))
            loader.add_value('price', amount)
            loader.add_value('sku', response.meta['sku'])

            if 'apparelsave' not in loader.get_output_value('name').lower():
                yield loader.load_item()
                break
            """
コード例 #35
0
ファイル: vitalitymedical.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the sub-product rows that match the searched SKU.

        Rows are taken from the "super-product-table" (minus its first row)
        and from any "inner-table".  For each row the non-empty cell texts,
        excluding the last text node, are collected: the first is treated as
        the row's SKU, the last as the price when it starts with ``$``, and
        everything but the last is appended to the page's product name.  A
        row is yielded when its SKU is contained in ``response.meta['sku']``
        and the loader produced a price.

        Pages without a product name and rows without any text are skipped
        instead of raising IndexError.
        """
        search_sku = response.meta['sku']
        hxs = HtmlXPathSelector(response)

        main_name = hxs.select(u'//h3[@class="product-name"]/text()').extract()
        if not main_name:
            return
        main_name = main_name[0].strip()

        subproducts = hxs.select(u'//table[@id="super-product-table"]//tr')[1:]
        subproducts += hxs.select(u'//table[@class="inner-table"]//tr')
        for p in subproducts:
            product_data = [s.strip() for s in p.select(u'.//td//text()').extract()[:-1] if s.strip() != '']
            if not product_data:
                # Spacer/empty row: nothing to read a SKU or price from.
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', main_name + ' ' + ' '.join(product_data[0:-1]).strip())
            if product_data[-1].startswith('$'):
                loader.add_value('price', product_data[-1])
            loader.add_value('sku', search_sku)

            sku = product_data[0]
            if sku in search_sku and loader.get_output_value('price'):
                yield loader.load_item()
コード例 #36
0
ファイル: tomleemusic_ca.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product described on a product detail page.

        Collects URL, name, selling price (an empty price is added when the
        selling-price span is absent), SKU from the hidden product id input,
        category and image.  The brand is resolved in this order:

        1. the value after ``=`` in a link whose href contains "BrandName";
        2. the first side-bar brand that the product name starts with;
        3. the middle part of the page title, between " - " separators.

        A missing product name is treated as an empty string during brand
        matching instead of raising AttributeError.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1[@class="productDetailHeader"]/text()')
        if hxs.select(u'//span[@class="productDetailSelling"]/text()'):
            product_loader.add_xpath('price', u'//span[@class="productDetailSelling"]/text()')
        else:
            product_loader.add_value('price', '')
        product_loader.add_xpath('sku', u'//input[@type="hidden" and (@name="hidProductId" or @name="inv")]/@value')
        product_loader.add_xpath('category', u'//td[@class="smallPrint"]/a[position()=2 and contains(text(),"Products")]/../a[3]/text()')

        img = hxs.select(u'//a[@class="smallPrint" and @rel="lightbox"]/@href').extract()
        if img:
            img = urljoin_rfc(get_base_url(response), img[0])
            product_loader.add_value('image_url', img)

        if hxs.select(u'//a[contains(@href,"BrandName")]/@href'):
            product_loader.add_xpath('brand', u'substring-after(//a[contains(@href,"BrandName")]/@href,"=")')
        else:
            # The name does not change while matching, so read it once.
            name = product_loader.get_output_value('name') or u''
            brands = hxs.select(u'//strong[@class="sideBarText"]/text()').extract()
            for brand in (b.strip() for b in brands):
                if name.startswith(brand):
                    product_loader.add_value('brand', brand)
                    break
            else:
                # No side-bar brand matched: fall back to the page title.
                product_loader.add_xpath('brand', u'normalize-space(substring-before(substring-after(//title/text(), " - "), " - "))')

        yield product_loader.load_item()
コード例 #37
0
ファイル: visiondirect.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield the product parsed from the page with BeautifulSoup.

        When the "price ours" span is missing the URL is logged and nothing is
        yielded.  Otherwise the item gets image, price, name, category (from
        ``response.meta``), brand (the text after "by" in the manufacturer
        span), URL, identifier (the path segment after "product/" in the
        add-to-cart form action), an optional SKU from the ``eye`` input, and
        a shipping cost of '9.98' for prices up to 59, else '5.98'.
        """
        loader = ProductLoader(item=Product(), response=response)
        soup = BeautifulSoup(response.body)

        try:
            price = soup.find('span', {'class': 'price ours'}).text
        except AttributeError:
            self.log('price not found {}'.format(response.url))
            return

        image_url = soup.find('img', itemprop='image')['src']
        cart_form = soup.find('form', id='product_addtocart_form')
        identifier = cart_form['action'].split('product/')[-1].split('/')[0]
        name = soup.find('h1', itemprop='name').text.strip()
        maker_text = soup.find('span', itemprop='manufacturer').text
        brand = maker_text.replace('&nbsp;', '').split('by', 1)[1].strip()
        sku_input = soup.find('input', id='eye')

        loader.add_value('image_url', image_url)
        loader.add_value('price', price)
        loader.add_value('name', name)
        loader.add_value('category', response.meta.get('category', ''))
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('identifier', identifier)
        if sku_input:
            loader.add_value('sku', sku_input['value'])

        low_value_order = loader.get_output_value('price') <= Decimal(59)
        loader.add_value('shipping_cost', '9.98' if low_value_order else '5.98')
        yield loader.load_item()
コード例 #38
0
    def parse_product(self, response):
        """Yield the Lego product on this page, or record a warning.

        The price is read from the bold price span, falling back to the
        "prodPrcNowCatgLister" span, and converted with ``extract_price_eu``.
        When no price is found a warning is appended to ``self.errors`` and
        nothing is yielded.  Shipping cost '0' is added only for prices above
        20; otherwise no shipping cost is set.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        pprice = hxs.select('//div[@class="price_bottom_bg"]/span[@class="fontBold125emR"]/text()').extract()
        if not pprice:
            pprice = hxs.select('//div[@class="price_bottom_bg"]//span[contains(@class, "prodPrcNowCatgLister")]/text()').extract()

        if not pprice:
            self.errors.append('WARNING: No price in %s' % response.url)
            return
        price = extract_price_eu(pprice[0])

        loader.add_xpath('identifier', '//b[contains(text(), "SKU:")]/../text()')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@class="product-name"]/text()')
        loader.add_value('price', price)
        loader.add_xpath('sku', '//b[contains(text(), "Artikelnummer:")]/../text()')
        loader.add_value('category', response.meta.get('category'))

        img = hxs.select('//div[@id="product-view-media-main-image"]//img/@src').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))

        loader.add_value('brand', 'lego')
        if loader.get_output_value('price') > 20:
            loader.add_value('shipping_cost', '0')

        yield loader.load_item()
コード例 #39
0
    def parse_product(self, response):
        """Yield the Lego product described on this page.

        The identifier is the text after the first space in the "code" div,
        falling back to a ``data-code`` attribute.  The SKU is the first run
        of three or more digits in the ``<h1>`` text; when there is none the
        URL is logged and the item is yielded without a SKU.  Shipping cost is
        always '49' and stock is '1' only when a quantity ``<select>`` named
        "num" is present.
        """
        import re
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        loader.add_xpath('identifier',
                         'substring-after(//div[@class="code"]/text(), " ")')
        if not loader.get_output_value('identifier'):
            loader.add_xpath('identifier',
                             'substring-after(//*/@data-code, " ")')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')

        # Only a failed match is an expected condition here; anything else
        # should surface instead of being swallowed by a bare except.
        title = ''.join(hxs.select('//h1/text()').extract())
        sku_match = re.search(r'(\d{3}\d*)', title)
        if sku_match:
            loader.add_value('sku', sku_match.group(1))
        else:
            self.log('No SKU for %s' % (response.url))

        loader.add_xpath('price', '//span[@itemprop="price"]/text()')
        loader.add_xpath(
            'category',
            '//div[@class="paths"]/ul/li[1]/span[last()]//a/text()')

        img = hxs.select('//div[@class="images"]//img/@src').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

        loader.add_value('brand', 'lego')
        loader.add_value('shipping_cost', '49')
        if hxs.select('//select[@name="num"]'):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')

        yield loader.load_item()
コード例 #40
0
    def parse_product(self, response):
        """Yield the Lego product described on this page.

        Identifier and category come from the table cells next to the
        "Artikelnr" and "Categorie" labels; the price is the joined text of
        the price cell converted with ``extract_price_eu``.  The SKU is the
        first run of three or more digits in the ``<h1>`` text; when there is
        none the URL is logged and the item is yielded without a SKU.
        Shipping is '0' for prices above 75 and '4.95' otherwise.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        loader.add_xpath('identifier', '//td/span[contains(text(), "Artikelnr")]/../../td[2]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')
        loader.add_value('price', extract_price_eu(''.join(hxs.select('//td[@class="myshp_info_price_value"]//text()').extract())))

        # Only a failed match is an expected condition here; anything else
        # should surface instead of being swallowed by a bare except.
        title = ''.join(hxs.select('//h1/text()').extract())
        sku_match = re.search(r'(\d{3}\d*)', title)
        if sku_match:
            loader.add_value('sku', sku_match.group(1))
        else:
            self.log('No SKU for %s' % (response.url))

        loader.add_xpath('category', '//td/span[contains(text(), "Categorie")]/../../td[2]/text()')

        img = hxs.select('//div[@id="myshp_info_image_large"]//a/@href').extract()
        if img:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))

        loader.add_value('brand', 'lego')
        if loader.get_output_value('price') > 75:
            loader.add_value('shipping_cost', '0')
        else:
            loader.add_value('shipping_cost', '4.95')

        yield loader.load_item()
コード例 #41
0
    def parse_product(self, response):
        """Scrape a product page into an item carrying ``LeCreusetMeta`` metadata.

        Brand and category are both the constant ``'Le Creuset'``; SKU and
        identifier are both taken from the hidden ``product`` input. Shipping
        is ``'4.75'`` for prices below 50 and ``'0'`` otherwise.
        """
        selector = HtmlXPathSelector(response)
        product_id_xpath = '//input[@name="product"]/@value'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
        loader.add_value('url', response.url)
        for constant_field in ('brand', 'category'):
            loader.add_value(constant_field, 'Le Creuset')
        for id_field in ('sku', 'identifier'):
            loader.add_xpath(id_field, product_id_xpath)

        image_links = selector.select(
            '//div[@class="product-img-box"]/a/@href').extract()
        if image_links:
            loader.add_value('image_url', image_links[0])

        loader.add_xpath('price', '//div[@class="prodPriceWrap"]/h2/text()')
        shipping = '4.75' if loader.get_output_value('price') < 50 else '0'
        loader.add_value('shipping_cost', shipping)

        loader.add_value('stock', '1')

        product = loader.load_item()
        product['metadata'] = LeCreusetMeta()

        yield product
コード例 #42
0
ファイル: lekmer_nl.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Build one product item from a product detail page.

        The campaign price is preferred over the regular ``itemprop="price"``
        value, and decimal commas are converted to dots. The category is the
        second-to-last breadcrumb link. A shipping cost of 2.90 is added only
        when the loaded price is below 100.

        :param response: the product page response.
        :returns: generator yielding a single loaded item.
        :raises IndexError: if the page has no product name or no ``id`` input.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        product_loader = ProductLoader(item=Product(), selector=hxs)

        product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
        product_loader.add_value('name', product_name)

        # A missing zoom image should not discard the whole product.
        image_url = hxs.select('//*[@id="zoom1"]/@src').extract()
        if image_url:
            product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        product_loader.add_value('url', response.url)

        identifier = hxs.select('//input[@name="id"]/@value').extract()[0]
        product_loader.add_value('identifier', identifier)

        sku = hxs.select('//div[@class="product_band"]/p/span/text()').re(r'(\d+)')
        sku = sku[0] if sku else ''
        product_loader.add_value('sku', sku)

        price = hxs.select('//span[@class="campaignprice-value"]/text()').extract()
        if not price:
            price = hxs.select('//span[@itemprop="price"]/text()').extract()
        if price:
            price = price[0].strip().replace(',', '.')
        product_loader.add_value('price', price)

        # ``[-2]`` needs at least two breadcrumb links; with fewer there is no
        # parent category to report, so fall back to an empty string.
        category = hxs.select('//ul[@class="breadcrumbs"]/li/a/text()').extract()
        category = category[-2] if len(category) >= 2 else ''
        product_loader.add_value('category', category)

        if product_loader.get_output_value('price') < 100:
            product_loader.add_value('shipping_cost', 2.90)

        yield product_loader.load_item()
コード例 #43
0
 def parse(self, response):
     '''Follow pagination and request every product found on a listing page.

     The "Next" link, when present, is requested first (default callback).
     For each listed product with a name and a link, a partially filled
     loader (name, url, price) is handed to ``parse_product`` through
     ``meta['loader']``.

     The listed price is divided by 1.2 and rounded to two decimals
     (presumably to strip 20% VAT -- TODO confirm).
     '''
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     next_page = hxs.select('//div[@class="page-navigation"]/a[contains(text(),"Next")]/@href').extract()
     if next_page:
         yield Request(urljoin_rfc(base_url, next_page[0]))

     products = hxs.select('//div[@id="list-product-list"]//div[contains(@class,"list-product-item")]')
     for product in products:
         name = ''.join(product.select('.//div[@class="name"]/a/text()').extract())
         if not name:
             continue

         url = ''.join(product.select('.//div[@class="name"]/a/@href').extract())
         if not url:
             # Without a link there is nothing to request; skip this product
             # instead of failing the whole page on an empty Request URL.
             continue
         # Drop anything after ';' in the link before resolving it.
         url = urljoin_rfc(base_url, url.split(';')[0])

         loader = ProductLoader(item=Product(), selector=product)
         loader.add_value('name', name)
         loader.add_value('url', url)

         price = product.select('.//div[@class="price-info"]//span[@class="current-price"]/text()').extract()
         if price:
             # Escaped, optional decimal part: the previous pattern used a
             # bare '.' (any character) and raised IndexError on no match.
             amounts = re.findall(r"\d+(?:\.\d+)?", price[0].replace(',', ''))
             price = round(float(amounts[0]) / 1.2, 2) if amounts else []
         loader.add_value('price', price)

         yield Request(loader.get_output_value('url'), meta={'loader': loader}, callback=self.parse_product)
コード例 #44
0
ファイル: eymundsson.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Build one product item from a product detail page.

        The same extracted value list is used for both ``sku`` and
        ``identifier``. Stock is set to 0 when no price ends up on the loader.

        :param response: the product page response.
        :returns: generator yielding a single loaded item.
        :raises IndexError: if the title or the "kr." price is not found.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_name = hxs.select(
            '//h1/span[@class="title"]/text()')[0].extract()
        product_price = hxs.select('//div[@class="price"]/span/p/strong/text()'
                                   ).re(r'([\d\.]+) kr.')[0]
        product_code = sku = hxs.select(
            '//div[@class="moreItem"]/span[@class="title" and contains(text(),"mer:")]/following-sibling::span/text()'
        ).extract()
        image_url = hxs.select('//a[@class="jqzoom"]/img/@src').extract()
        category = hxs.select(
            '//div[@class="moreItem"]/span[@class="title" and text()="Form:"]/following-sibling::span/text()'
        ).extract()
        category = category[0] if category else ''

        # The loader is created once, here; the earlier duplicate instance
        # was never used and has been removed.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('sku', sku)
        loader.add_value('identifier', product_code)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('category', category)
        # Dots are removed and commas become dots before parsing (dots look
        # like thousands separators here -- TODO confirm).
        product_price = extract_price(
            product_price.replace('.', '').replace(',', '.'))
        loader.add_value('price', product_price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)

        yield loader.load_item()
コード例 #45
0
ファイル: aquariumsdelivered.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield bundle variants (if any) and then the plain product.

        When the page embeds a ``Product.Bundle(...)`` JSON config, the
        hidden-input (mandatory) selections are folded into a base name and
        base price. One item is then yielded per selection of every remaining
        option. After that the plain product is yielded from the page markup.
        Items without a price are never yielded.

        :param response: the product page response; non-HTML is ignored.
        :returns: generator of loaded items.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        options = hxs.select(u'//script').re(r'Product\.Bundle\((.*)\)')

        if options:
            options = json.loads(options[0])
            bundle_options = options['options']
            mandatory_options = hxs.select(u'//div[@class="input-box"]//input[@type="hidden"]')

            name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0].strip()
            price = Decimal(0)

            exclude = set()
            for mandatory_option in mandatory_options:
                option_id = mandatory_option.select(u'./@name').re(r'bundle_option\[(.*)\]')[0]
                selection_id = mandatory_option.select(u'./@value').extract()[0]
                selection = bundle_options[option_id]['selections'][selection_id]
                name += u' %s' % selection['name'].strip()
                price += Decimal(selection['price']).quantize(Decimal('0.01'))
                # Store the option key, not the selector object: the set is
                # subtracted from the option keys below, so only keys can
                # actually exclude anything.
                exclude.add(option_id)

            for option_id in set(bundle_options).difference(exclude):
                for selection in bundle_options[option_id]['selections'].values():
                    selection_price = Decimal(selection['price']).quantize(Decimal('0.01'))

                    loader = ProductLoader(item=Product(), selector=hxs)
                    loader.add_value('url', response.url)
                    loader.add_value('name', name + u' %s' % selection['name'].strip())
                    loader.add_value('price', price + selection_price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product-name"]/h1/text()')
        loader.add_xpath('price', u'//span[@class="regular-price"]/span[@class="price"]/text()')
        if not loader.get_output_value('price'):
            # Fall back to the "minimal" / "from" price when no regular price
            # is shown.
            loader.add_xpath('price', u'//div[@class="price-box"]//p[@class="minimal-price" or @class="price-from"]/span[@class="price"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
コード例 #46
0
ファイル: amazon_spider.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the cheapest search result whose price passes ``valid_price``.

        Each result block is re-parsed with BeautifulSoup to read its name,
        link and price. ``response.meta['sku']`` is used for both ``sku`` and
        ``identifier``.
        """
        hxs = HtmlXPathSelector(response)

        best = None
        for result in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            loader = ProductLoader(item=Product(), selector=result)
            soup = BeautifulSoup(result.extract())

            heading = soup.find('h3', attrs={'class': 'newaps'})
            loader.add_value('name', heading.findAll('span')[0].string)
            loader.add_value('url', heading.findAll('a')[0]['href'])
            price_list = soup.find('ul', attrs={'class': 'rsltL'})
            loader.add_value('price', price_list.findAll('span')[0].string)
            for field in ('sku', 'identifier'):
                loader.add_value(field, response.meta['sku'])

            price = loader.get_output_value('price')
            if not price:
                continue
            # Only a strictly cheaper candidate is checked for validity.
            is_cheaper = best is None or best.get_output_value('price') > price
            if is_cheaper and valid_price(response.meta['price'], price):
                best = loader

        if best:
            yield best.load_item()
コード例 #47
0
ファイル: thepetexpress.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield a product (url, name, price) when the page shows a price.

        :param response: the product page response; non-HTML is ignored.
        :returns: generator yielding at most one loaded item.
        """
        if not isinstance(response, HtmlResponse):
            return

        # The loader builds its own selector from the response, so the
        # separate, unused HtmlXPathSelector has been removed.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@id="product"]/h1/text()')
        loader.add_xpath('price', u'//p[@class="price"]/span[@class="our_price"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
コード例 #48
0
ファイル: kalahari_amazon.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the lowest-priced search result, if any result has a price."""
        hxs = HtmlXPathSelector(response)

        cheapest = None
        for row in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            candidate = ProductLoader(item=Product(), selector=row)
            candidate.add_xpath("name", './/h3[@class="title"]/a/text()')
            candidate.add_xpath("url", './/h3[@class="title"]/a/@href')
            candidate.add_xpath("price", './/td[@class="toeOurPrice"]/a/text()')
            candidate.add_value("sku", response.meta["sku"])

            # Results without a price never compete.
            if not candidate.get_output_value("price"):
                continue
            if cheapest is None:
                cheapest = candidate
            elif cheapest.get_output_value("price") > candidate.get_output_value("price"):
                cheapest = candidate

        if cheapest:
            yield cheapest.load_item()
コード例 #49
0
ファイル: amazon.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield the lowest-priced search result, if any result has a price.

        ``response.meta['sku']`` is used for both ``sku`` and ``identifier``.
        """
        hxs = HtmlXPathSelector(response)
        title_link = './/*[contains(@class, "Title") or contains(@class, "title")]//a'
        new_offer_price = ('.//*[@class="subPrice"]/a[contains(text(), "new")]'
                           '/following-sibling::*[@class="price"]/text()')

        cheapest = None
        for row in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            candidate = ProductLoader(item=Product(), selector=row)
            candidate.add_xpath('name', title_link + '/text()')
            candidate.add_xpath('url', title_link + '/@href')
            # Both price locations feed the same field.
            candidate.add_xpath('price', new_offer_price)
            candidate.add_xpath('price', './/*[@class="newPrice"]//span/text()')
            for field in ('sku', 'identifier'):
                candidate.add_value(field, response.meta['sku'])

            if not candidate.get_output_value('price'):
                continue
            if cheapest is None:
                cheapest = candidate
            elif cheapest.get_output_value('price') > candidate.get_output_value('price'):
                cheapest = candidate

        if cheapest:
            yield cheapest.load_item()
コード例 #50
0
ファイル: vseinstrumenti.py プロジェクト: 0--key/lib
	def parse_product(self, response):
		"""Return a product built from the page price plus name/sku from ``response.meta``.

		Returns ``None`` when no price ends up on the loader.
		"""
		selector = HtmlXPathSelector(response)
		price_fragments = selector.select(u'//div[contains(@class, "goods_price")]/text()').extract()
		# Surrounding whitespace and inner spaces are stripped from the joined text.
		cleaned_price = join(price_fragments).strip().replace(" ","")

		product_loader = ProductLoader(item=Product(), selector=selector)
		product_loader.add_value('name', response.meta["name"])
		product_loader.add_value('url', response.url)
		product_loader.add_value('price', cleaned_price)
		product_loader.add_value('sku', response.meta["sku"])

		if not product_loader.get_output_value('price'):
			return None
		return product_loader.load_item()
コード例 #51
0
ファイル: lookandlisten.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product if its barcode matches the searched SKU, else try the next candidate.

        ``response.meta['products']`` holds the remaining candidate links;
        the first one is requested with the rest passed along.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="CB_box_prodview"]//h2/text()')
        loader.add_value('url', response.url)
        price_text = ''.join(
            hxs.select('//div[@class="viewprod_price"]//text()').extract())
        loader.add_value('price', price_text)
        loader.add_xpath('sku', '//div[@class="viewprod_right"]//div/text()', re='Barcode: (.*)')

        log.msg(loader.get_output_value('sku'))
        wanted_sku = response.meta['sku']
        log.msg(wanted_sku)

        if loader.get_output_value('sku') == wanted_sku:
            yield loader.load_item()
            return

        remaining = response.meta['products']
        if remaining:
            next_url = urljoin_rfc(get_base_url(response), remaining[0])
            yield Request(next_url,
                          callback=self.parse_product,
                          meta={'sku': wanted_sku, 'products': remaining[1:]})
コード例 #52
0
ファイル: rubart.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one product; the SKU comes from ``response.meta``.

        The name is read from the short-info heading, falling back to the
        ``itemprop="name"`` heading. The price's decimal comma becomes a dot.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)

        loader.add_xpath('name', u'//div[@class="productShortInfo"]/h3/a/text()')
        name_found = loader.get_output_value('name')
        if not name_found:
            loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')

        price_nodes = hxs.select(u'//div[@class="price"]/strong/text()').extract()
        loader.add_value('price', price_nodes[0].replace(',', '.'))

        loader.add_value('sku', response.meta['sku'])
        yield loader.load_item()
コード例 #53
0
ファイル: cocopanda_spider.py プロジェクト: 0--key/lib
 def parse_node(self, response, node):
     """Turn one XML feed node into a product.

     Returns ``None`` for non-XML responses and an empty ``Product`` when
     no price is loaded. The SKU is the second-to-last path segment of the
     product URL.
     """
     if not isinstance(response, XmlResponse):
         return
     loader = ProductLoader(item=Product(), selector=node)

     product_url = node.select(u'./product-url/text()').extract()[0]
     url_segments = product_url.split('/')
     loader.add_value('sku', url_segments[-2])
     loader.add_value('url', product_url)
     loader.add_xpath('name', u'./title/text()')

     raw_price = node.select(u'./price/text()').extract()[0]
     loader.add_value('price', raw_price.replace(',', '.'))

     return loader.load_item() if loader.get_output_value('price') else Product()
コード例 #54
0
ファイル: 6pm.py プロジェクト: 0--key/lib
 def parse(self, response):
     """Yield the product matching the search, from a product or results page.

     On a product page the item is built from the heading and price. On a
     results page, the first result whose name contains every word of
     ``response.meta['name']`` is yielded and the search stops. Names
     containing "apparelsave" are never yielded.

     :param response: product page or search results page.
     :returns: generator yielding at most one loaded item.
     """
     base_url = get_base_url(response)
     hxs = HtmlXPathSelector(response)
     product = hxs.select('//div[@class="product-page"]')
     if product:
         loader = ProductLoader(item=Product(), selector=product)
         name = product.select('.//h1[@class="main-heading standard-header"]/a/text()').extract()
         name2 = product.select('.//h1[@class="main-heading standard-header"]/text()').extract()
         if name:
             price = "".join(product.select('.//span[@id="price"]/text()').re(r'([0-9\,\. ]+)')).strip()
             # The heading's own text may be absent; append it only when
             # present instead of raising IndexError on ``name2[0]``.
             full_name = name[0].strip()
             if name2:
                 full_name += ' ' + name2[0].strip()
             loader.add_value('name', full_name)
             loader.add_value('url', response.url)
             loader.add_value('price', price)
             loader.add_value('sku', response.meta['sku'])
             if 'apparelsave' not in loader.get_output_value('name').lower():
                 yield loader.load_item()
     else:
         for product in hxs.select('.//div[@id="searchResults"]/a'):
             name = product.select('./span[@class="brandName"]/text()').extract()
             name2 = product.select('./span[@class="productName"]/text()').extract()
             if not (name and name2):
                 continue
             product_name = name[0].strip() + ' ' + name2[0].strip()
             product_words = product_name.lower().strip().split(' ')
             search_words = response.meta['name'].lower().replace('+', ' ').split(' ')
             # Every searched word must appear in the result's name.
             missing = [w for w in search_words if w not in product_words]
             if missing:
                 continue
             price = "".join(product.select('./span[@class="price-6pm"]/text()').re(r'([0-9\,\. ]+)')).strip()
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_value('name', product_name)
             loader.add_value('url', urljoin_rfc(base_url, product.select('.//@href').extract()[0]))
             loader.add_value('price', price)
             loader.add_value('sku', response.meta['sku'])
             if 'apparelsave' not in loader.get_output_value('name').lower():
                 yield loader.load_item()
                 break
コード例 #55
0
ファイル: newark.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product only if its part number matches the searched SKU.

        Nothing is yielded for "no results" pages, for pages without a
        manufacturer part number, or when the part number differs
        (case-insensitively) from ``response.meta['sku']``.

        :param response: the product page response.
        :returns: generator yielding at most one loaded item.
        """
        hxs = HtmlXPathSelector(response)
        if hxs.select('//span[@id="totalNoResultsSlotAtTop"]'):
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@id="headerContainer"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('price', '//span[contains(@class, "mfProductDescriptionAndPrice")]/text()')
        loader.add_xpath('sku', '//dt[text()="Manufacturer Part No:"]/following-sibling::dd/text()')
        sku = loader.get_output_value('sku')
        # An empty SKU cannot match; checking it first avoids calling
        # ``.lower()`` on a missing value.
        if not sku or sku.lower() != response.meta['sku'].lower():
            return

        yield loader.load_item()
コード例 #56
0
ファイル: petgoods4u.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield a product (url, name, price) when the page shows a price.

        :param response: the product page response; non-HTML is ignored.
        :returns: generator yielding at most one loaded item.
        """
        if not isinstance(response, HtmlResponse):
            return

        # The loader builds its own selector from the response; the unused
        # HtmlXPathSelector and the commented-out legacy name logic are gone.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//h1[@class="product-name"]/text()')
        loader.add_xpath('price', u'//div[@class="p-prod-price"]/span/span[@class="price-alt"]/span/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
コード例 #57
0
ファイル: pickyourshoes.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield the product unless it has no heading or its name contains "apparelsave".

        The SKU is taken from ``response.meta['sku']``.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//td[@style="padding-left:10px;"]/h1/text()').extract()

        loader = ProductLoader(item=Product(), response=response)

        if not titles:
            return

        price_pieces = hxs.select('.//p[@class="productDesc"]/span[@class="price"]/text()').re(r'([0-9\,\. ]+)')
        price_text = "".join(price_pieces).strip()

        loader.add_value('name', titles[0].strip())
        loader.add_value('url', response.url)
        loader.add_value('price', price_text)
        loader.add_value('sku', response.meta['sku'])

        loaded_name = loader.get_output_value('name').lower()
        if 'apparelsave' not in loaded_name:
            yield loader.load_item()
コード例 #58
0
ファイル: tooled_up_com.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one product; the price defaults to 0 when none is found.

     The SKU is taken from ``response.meta['sku']``.
     """
     selector = HtmlXPathSelector(response)

     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath('name', '//div[@class="headingbox"]/h1/text()')

     price_nodes = selector.select('//span[@class="ourpricefeat"]/text()')
     # Only a number with a decimal part is accepted by this pattern.
     price_matches = price_nodes.re('(\d+(?:\.\d+))') if price_nodes else []
     if price_matches:
         product_loader.add_value('price', price_matches[0])

     has_price = product_loader.get_output_value('price')
     if not has_price:
         product_loader.add_value('price', 0)

     product_loader.add_value('sku', response.meta['sku'])
     product_loader.add_value('url', response.url)
     yield product_loader.load_item()