Esempi in Python per ProductLoaderWithNameStrip.add_value, esempi in Python per product_spiders.items.ProductLoaderWithNameStrip.add_value

Esempio n. 1

0

Mostra file

File: amazon_americanrv.py Progetto: 0--key/lib

    def parse(self, response):
        """Collect priced search results and follow the first one kept.

        Reads ``sku``, ``mfrgid`` and ``name`` from ``response.meta``.  A
        result is kept only when it has a price and that price is lower than
        the price of the previously kept result, so ``search_results`` is a
        chain of strictly decreasing prices whose first entry is the first
        priced result on the page.  That first entry is requested with
        ``parse_mfrgids`` as callback; the remaining loaders travel along in
        the request meta as ``next_prods``.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
        pr = None  # loader kept most recently, i.e. the cheapest one so far
        search_results = []
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './/h3/a/span/text()')
            if not loader.get_output_value('name'):
                # Fall back to the link text when the title has no <span>.
                loader.add_xpath('name', './/h3/a/text()')
            loader.add_xpath('url', './/h3/a/@href')
            # Raw string: '\$' is not a valid escape in a plain string literal.
            loader.add_xpath('price', './/ul/li/a/span/text()', re=r'\$(.*)')
            if not loader.get_output_value('price'):
                loader.add_xpath('price', './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())
            if loader.get_output_value('price') and (pr is None or pr.get_output_value('price') >
                                                                   loader.get_output_value('price')):
                pr = loader
                search_results.append(pr)

        if search_results:
            cur_prod = search_results[0]
            next_prods = search_results[1:]
            yield Request(cur_prod.get_output_value('url'), callback=self.parse_mfrgids,
                          meta={'mfrgid': response.meta['mfrgid'], 'name': response.meta['name'], 'cur_prod':cur_prod, 'next_prods':next_prods}, dont_filter=True)

Esempio n. 2

0

Mostra file

File: kikkertpriser_dk.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield the products listed on the page, then descend one menu level.

        The current menu depth is read from ``response.meta['level']``
        (default 1) and the follow-up requests carry ``level + 1``.
        """
        hxs = HtmlXPathSelector(response)

        for cell in hxs.select(u'//td[@class="Description_ProductList"]'):
            loader = ProductLoader(item=Product(), selector=cell)
            loader.add_xpath('name', u'.//a/@title')

            raw_price = cell.select(u'../..//span[@class="Price_Productlist"]/text()').extract()[0]
            raw_price = raw_price.strip().rstrip(' DKK')
            # Danish number format: '.' groups thousands, ',' marks decimals.
            price = raw_price.replace('.', '').replace(',', '.')
            if price == u'Ring for pris!':
                # Placeholder text shown instead of a number; store zero.
                price = 0
            loader.add_value('price', price)

            href = cell.select(u'.//a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            yield loader.load_item()

        depth = response.meta.get('level', 1)
        nesting = u'/'.join([u'table/tr/td'] * depth)
        links_xpath = u'//table[@id="ProductMenu_Table"]/../' + nesting + '/a/@href'
        links = hxs.select(links_xpath).extract()

        for link in links:
            target = urljoin_rfc(get_base_url(response), link)
            yield Request(target, meta={'level': depth + 1})

Esempio n. 3

0

Mostra file

File: ebay.py Progetto: 0--key/lib

    def parse(self, response):
        """Load the first search result, re-requesting the page up to 3 times.

        When no result node is found the same URL is requested again with an
        incremented ``_retries`` counter; once the counter has reached 3 the
        page is dropped.  The item is yielded only if its name does not
        contain 'apparelsave' and ``valid_price`` accepts its price against
        ``response.meta['price']``.
        """
        hxs = HtmlXPathSelector(response)

        # The result is either a <td> or a <table> carrying r="1".
        product = hxs.select('//td[@r="1"]') or hxs.select('//table[@r="1"]')

        if not product:
            attempts = response.meta.get('_retries', 0)
            if attempts >= 3:
                return
            yield Request(response.url,
                          meta={'sku': response.meta['sku'], '_retries': attempts + 1},
                          dont_filter=True)
            return

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
        loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
        price_xpaths = (
            './/div[@class="prices"]//span[@class="amt"]/text()',
            './/div[@class="prices"]//span[@class="g-b amt"]/text()',
            './/td[@class="prc"]//div[@class="g-b"]/text()',
            './/*[@itemprop="price"]/text()',
        )
        for price_xpath in price_xpaths:
            loader.add_xpath('price', price_xpath)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])

        if 'apparelsave' in loader.get_output_value('name').lower():
            return
        if valid_price(response.meta['price'], loader.get_output_value('price')):
            yield loader.load_item()

Esempio n. 4

0

Mostra file

File: lystfiskeren_dk.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield products from a listing page and follow the next menu level.

        Pages whose URL is in ``self.junk_urls`` are ignored.  The menu depth
        comes from ``response.meta['level']`` (default 1).
        """
        if response.url in self.junk_urls:
            return

        hxs = HtmlXPathSelector(response)

        for wrapper in hxs.select(u'//div[@class="item_wrapper"]'):
            loader = ProductLoader(item=Product(), selector=wrapper)
            loader.add_xpath('name', u'.//div[@class="name"]/a/text()')

            raw_price = wrapper.select(u'.//div[@class="price"]/text()[last()]').extract()[0]
            raw_price = raw_price.strip().lstrip('Kr. ')
            # Danish number format: '.' groups thousands, ',' marks decimals.
            loader.add_value('price', raw_price.replace('.', '').replace(',', '.'))

            href = wrapper.select(u'.//div[@class="name"]/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            yield loader.load_item()

        depth = response.meta.get('level', 1)
        nesting = u'/'.join([u'ul/li'] * depth)
        links = hxs.select(u'//div[@id="shopnav"]/' + nesting + '/a/@href').extract()

        for link in links:
            target = urljoin_rfc(get_base_url(response), link)
            yield Request(target, meta={'level': depth + 1})

Esempio n. 5

0

Mostra file

File: dv247_spider.py Progetto: 0--key/lib

    def parse_pagination(self, response):
        """Yield the products of a listing page and follow its pager links.

        Numbered page links are requested first, then the "Next 10" link if
        the pager has one.  All follow-up requests come back to this method.
        """
        URL_BASE = 'http://www.dv247.com/'

        hxs = HtmlXPathSelector(response)
        for entry in hxs.select('//div[@class="listItem clearfix"]'):
            loader = ProductLoader(item=Product(), selector=entry)
            loader.add_value('name', ''.join(entry.select('.//a//text()').extract()))
            href = entry.select('.//a/@href')[0].extract()
            loader.add_value('url', urljoin_rfc(URL_BASE, href))
            loader.add_xpath('price', './/li[@class="price"]/text()')
            yield loader.load_item()

        # Pagination: nothing more to do when the page has no pager.
        pagers = hxs.select('//div[@class="listPaging"]')
        if not pagers:
            return

        pager = pagers[0]
        next_ten = pager.select('.//a[text()="Next 10"]/@href').extract()
        page_links = pager.select('.//a[not(@class="selectedpage") and not(text()="Next 10") and not(text()="Previous 10")]/@href').extract()

        for page_link in page_links:
            yield Request(urljoin_rfc(URL_BASE, page_link), callback=self.parse_pagination)

        if next_ten:
            yield Request(urljoin_rfc(URL_BASE, next_ten[0]), callback=self.parse_pagination)

Esempio n. 6

0

Mostra file

File: frankonia.py Progetto: 0--key/lib

    def parse_page(self, response):
        """Follow category and next-page links, then yield the listed products.

        Every follow-up request is handled by this same method.  ``first`` is
        a processor defined outside this block (presumably it keeps the first
        extracted value - confirm against its definition).
        """
        base_url = get_base_url(response)
        base_url_func = functools.partial(urljoin_rfc, base_url)

        # One selector serves all three sections; the response does not need
        # to be wrapped a second time.
        hxs = HtmlXPathSelector(response)
        cats = hxs.select("//ul[@id='nav']//a/@href").extract()
        for url in cats:
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

        # next page
        url = hxs.select("//div[@class='pagerLine']//a[@class='next']/@data-query").extract()
        if url:
            yield Request(urljoin_rfc(base_url, url[0]), callback=self.parse_page)

        # products
        for z in hxs.select("//div[@class='products']//li"):
            loader = ProductLoader(selector=z, item=Product())
            # Raw strings: '\d' is not a valid escape in a plain string literal.
            loader.add_xpath('identifier', "@data-product-url", first, re=r"articleNumber=(\d+)")
            loader.add_xpath('sku', "@data-product-url", first, re=r"articleNumber=(\d+)")
            loader.add_xpath('url', "@data-product-url", first, base_url_func)
            # Brand and product name are both collected into the 'name' field.
            loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/span[@class='brand']/text()")
            loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/a[starts-with(@class, 'name')]/text()")
            # Try <ins> first, then any price text, then <del>.
            price = z.select(".//p[@class='price']/ins//text()") \
                    or z.select(".//p[@class='price']//text()") \
                    or z.select(".//p[@class='price']/del//text()")

            # Decimal comma -> dot, and drop non-breaking spaces.
            price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
            loader.add_value('price', price)

            yield loader.load_item()

Esempio n. 7

0

Mostra file

File: teds_spider.py Progetto: 0--key/lib

 def parse_products(self, response):
     """Yield the products of a listing page and follow the navigation link.

     The price node is looked up in three page layouts, tried in order.
     """
     hxs = HtmlXPathSelector(response)
     for entry in hxs.select('//*[@id="products-list"]/li'):
         loader = ProductLoader(item=Product(), selector=entry)
         loader.add_xpath('name', 'div[@class="product-details left"]/h2/a/text()')

         # First two layouts: take the first matching node.
         matches = (entry.select('div[@class="product-shop left"]/div/div/p/span/span/text()')
                    or entry.select('div[@class="product-shop left"]/div/div/span/text()'))
         if matches:
             price = matches[0]
         else:
             # Last layout: a lone node is the price, otherwise the second one is.
             matches = entry.select('div[@class="product-shop left"]/div/div/p/span/text()')
             price = matches[0] if len(matches) == 1 else matches[1]

         loader.add_value('price', price)
         loader.add_xpath('url', 'div[@class="product-details left"]/h2/a/@href')
         yield loader.load_item()

     nav_links = hxs.select('//div[@class="right-nav right"]/a/@href').extract()
     if nav_links:
         yield Request(nav_links[0], callback=self.parse_products)

Esempio n. 8

0

Mostra file

File: midwestunlimited_com.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Load a product page and yield either the item or per-option requests.

        When the page has selectable options, one request per option is sent
        to the site's ``remote.php`` endpoint with ``self.parse_price`` as
        callback, carrying a copy of the base product whose name has the
        option label appended.  Without options the product is yielded as is.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h2/text()')
        product_loader.add_xpath('price', u'//em[contains(@class,"ProductPrice")]/text()')
        product_loader.add_xpath('sku', u'//span[@class="VariationProductSKU"]/text()')
        product_loader.add_xpath('category', u'//div[@id="ProductBreadcrumb"]/ul/ul/li[2]/a/text()')
        product_loader.add_xpath('image_url', u'//div[@class="ProductThumbImage"]/a/img/@src')
        product_loader.add_xpath('brand', u'//div[@class="Value"]/a/text()')
        product_loader.add_value('shipping_cost', '')


        # Each match is the <li> that wraps one option's <label>/<input>.
        options = hxs.select(u'//div[@class="DetailRow"]//ul/li/label/input/../..')
        if options:
            product_id = hxs.select(u'//input[@name="product_id"]/@value').extract()[0]
            product_orig = product_loader.load_item()
            for opt in options:
                # Option label: second text node of the label, or, failing
                # that, the first two <span> texts joined together.
                name = opt.select(u'.//input/../text()[2]').extract()
                if not name:
                    name = opt.select(u'concat(.//input/../span[1]/text(),.//input/../span[2]/text())').extract()
                var = opt.select(u'.//input/@value').extract()

                # Work on a copy so every option starts from the base product.
                product = Product(product_orig)
                product['name'] = (product['name'] + ' ' + name[0].strip()).strip()
                yield Request('http://www.midwestunlimited.com/remote.php' +
                        '?w=GetVariationOptions&productId=' + product_id + '&options=' + var[0],
                        meta={'product': product}, callback=self.parse_price)
        else:
            yield product_loader.load_item()

Esempio n. 9

0

Mostra file

File: testequipmentdepot.py Progetto: 0--key/lib

    def parse_products(self, hxs, response):
        """Yield products from an order table whose column order varies.

        The positions of the "Model", "Description" and "Price" columns are
        derived from the header cells, and each data row is then read by
        column position.  Rows without a price or with a blank name are
        skipped.
        """
        # NOTE(review): Python 2 print statement; looks like leftover debug output.
        print response.encoding
        # count(preceding siblings) + 1 is the header cell's 1-based position.
        model_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                               ' and text()="Model"]/preceding-sibling::*) + 1').extract()
        description_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                     ' and text()="Description"]/preceding-sibling::*) + 1').extract()
        price_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                ' and text()="Price"]/preceding-sibling::*) + 1').extract()

        if model_pos and description_pos and price_pos:
            # Keep only the part before the decimal point of the extracted number.
            model_pos = model_pos[0].split('.')[0]
            description_pos = description_pos[0].split('.')[0]
            price_pos = price_pos[0].split('.')[0]

            # Rows holding a model cell, excluding the header row itself.
            products = hxs.select('//td[starts-with(@class, "orderinfo") and position()=%s \
                                   and not(text()="Model")]/..' % model_pos)
            for product in products:
                loader = ProductLoader(selector=product, item=Product())
                # Default to the page URL; use the model link when the row has one.
                url = response.url
                model_url = product.select('.//td[starts-with(@class, "orderinfo") \
                                            and position()=%s]//a/@href' % model_pos).extract()
                if model_url:
                    url = urljoin_rfc(get_base_url(response), model_url[0])

                loader.add_value('url', url)
                loader.add_xpath('name', './/td[starts-with(@class, "orderinfo") and position()=%s]/text()' % description_pos)
                loader.add_xpath('price', './/td[starts-with(@class, "orderinfo") and position()=%s]//text()' % price_pos)
                if not loader.get_output_value('price') or not loader.get_output_value('name').strip():
                    continue

                yield loader.load_item()

Esempio n. 10

0

Mostra file

File: justvitamins.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield one item per variant block on a product page.

        Each variant is named after the page title followed by the block's
        own direct text, when it has any.  Non-HTML responses and pages
        without a title are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        titles = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
        if not titles:
            return

        title = titles[0].strip()
        page_url = urljoin_rfc(get_base_url(response), response.url)
        for variant in hxs.select('//div[@class="Item"]'):
            loader = ProductLoader(item=Product(), selector=variant)
            loader.add_value('url', page_url)

            suffix = ''.join(variant.select('./text()').extract())
            full_name = title + ' ' + suffix.strip() if suffix else title

            loader.add_value('name', full_name)
            loader.add_xpath('price', './/span[@class="price"]/text()')
            loader.add_xpath('price', './div[@class="price"]/span/text()')

            yield loader.load_item()

Esempio n. 11

0

Mostra file

File: naturbutikken_dk.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield products from a listing page and follow the next menu level.

        The menu depth comes from ``response.meta['level']`` (default 1).
        """
        hxs = HtmlXPathSelector(response)

        for element in hxs.select(u'//div[@class="prelement"]'):
            loader = ProductLoader(item=Product(), selector=element)
            loader.add_xpath('name', u'.//a/text()')

            raw_price = element.select(u'.//p[@class="prpri"]/text()').extract()[0]
            raw_price = raw_price.strip().lstrip('Pris: DKK ')
            # Danish number format: '.' groups thousands, ',' marks decimals.
            loader.add_value('price', raw_price.replace('.', '').replace(',', '.'))

            href = element.select(u'.//a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            yield loader.load_item()

        depth = response.meta.get('level', 1)
        nesting = u'/'.join([u'ul/li'] * depth)
        links = hxs.select(u'//ul[@id="pMenul0"]/../' + nesting + '/a/@href').extract()

        for link in links:
            target = urljoin_rfc(get_base_url(response), link)
            yield Request(target, meta={'level': depth + 1})

Esempio n. 12

0

Mostra file

File: fragrancenet.py Progetto: 0--key/lib

    def parse_products(self, hxs, response):
        """Yield the products of a listing page.

        Cells without a listed price are followed to their own page with
        ``parse_products2`` as callback.  If a listed price does not start
        with '$', the whole page is requested again (tracked through
        ``response.meta['ret']``, fewer than 3 prior retries) and processing
        of the current response stops.
        """
        cells = hxs.select('//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]')

        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)

            fragments = cell.select('.//p[@class="para1"]//text()').extract()
            title = ' '.join(fragment.strip() for fragment in fragments)
            # Collapse the runs of spaces left behind by blank fragments.
            title = re.sub(' +', ' ', title)

            loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
            loader.add_value('name', title)
            loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
            loader.add_xpath('price', './/p[@class="para3"]/text()', re='Our Price: (.*)')

            if not loader.get_output_value('price'):
                yield Request(loader.get_output_value('url'), callback=self.parse_products2)
                continue

            listed_price = cell.select('.//p[@class="para3"]/text()').re('Our Price: (.*)')[0]
            if not listed_price.startswith('$') and response.meta.get('ret', 0) < 3:
                retry_meta = {'ret': response.meta.get('ret', 0) + 1}
                yield Request(response.url, dont_filter=True, meta=retry_meta)
                return

            yield loader.load_item()

Esempio n. 13

0

Mostra file

File: jagtdirekt_dk.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield products, then follow sub-category and next-page links.

        Rows without a quantity input are followed to their own page with
        ``parse_sub`` as callback instead of being yielded.  The menu depth
        comes from ``response.meta['level']`` (default 1); the next-page
        request keeps the same level.
        """
        hxs = HtmlXPathSelector(response)

        for row in hxs.select(u'//tr[contains(@class,"product-item")]'):
            loader = ProductLoader(item=Product(), selector=row)
            loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')

            raw_price = row.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
            # Danish number format: '.' groups thousands, ',' marks decimals.
            loader.add_value('price', raw_price.strip().replace('.', '').replace(',', '.'))

            href = row.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
            product_url = urljoin_rfc(get_base_url(response), href)
            loader.add_value('url', product_url)

            # No quantity field on the row means the product has subproducts.
            if row.select(u'.//input[@name="products_qty"]').extract():
                yield loader.load_item()
            else:
                yield Request(product_url, callback=self.parse_sub)

        depth = response.meta.get('level', 1)
        nesting = u'/'.join([u'ul/li'] * depth)
        links = hxs.select(u'//div[@class="box-content"]/' + nesting + '/a/@href').extract()

        for link in links:
            target = urljoin_rfc(get_base_url(response), link)
            yield Request(target, meta={'level': depth + 1})

        following = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
        if following:
            target = urljoin_rfc(get_base_url(response), following[0])
            yield Request(target, meta={'level': depth})

Esempio n. 14

0

Mostra file

File: pondsuperstores_spider.py Progetto: 0--key/lib

 def parse_product(self, response):
     """Yield the product described inside the page's ProductDetails block."""
     details = '//div[@id="ProductDetails"]'

     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath("name", details + '//h2/text()')
     loader.add_value("url", response.url)
     loader.add_xpath("price", details + '//em[contains(@class,"ProductPrice")]/text()')
     loader.add_xpath("sku", details + '//span[contains(@class,"VariationProductSKU")]/text()')
     yield loader.load_item()

Esempio n. 15

0

Mostra file

File: virginmobile_spider.py Progetto: 0--key/lib

 def parse(self, response):
     """Yield one item per spotlight block on the shop home page.

     Two nesting depths are tried in order; the first one whose name node
     exists supplies name, price and URL.  An item is yielded for every
     block, even when neither depth matches.
     """
     BASE_URL = 'http://www.virginmobile.com/vm/'
     # (name xpath, price xpath, link xpath) for each known nesting depth.
     layouts = (
         ('div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
          'div/div/div/div/div/div/p/span/text()',
          'div/div/div/div/div/p/a/@href'),
         ('div/div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
          'div/div/div/div/div/div/div/p/span/text()',
          'div/div/div/div/div/div/p/a/@href'),
     )

     hxs = HtmlXPathSelector(response)
     for block in hxs.select('//div[@class="webapp_shophome_3col_spotlight"]'):
         loader = ProductLoader(item=Product(), selector=block)
         for name_xpath, price_xpath, link_xpath in layouts:
             if not block.select(name_xpath):
                 continue
             loader.add_xpath('name', name_xpath)
             loader.add_xpath('price', price_xpath)
             link = block.select(link_xpath)
             if link:
                 loader.add_value('url', urljoin_rfc(BASE_URL, link.extract()[0],
                                                     response.encoding))
             break
         yield loader.load_item()

Esempio n. 16

0

Mostra file

File: amazon_spider.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield the cheapest acceptable search result, if there is one.

        A result qualifies when it has a price, that price is lower than the
        best one found so far, and ``valid_price`` accepts it against
        ``response.meta["price"]``.
        """
        hxs = HtmlXPathSelector(response)

        best = None
        for result in hxs.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            loader = ProductLoader(item=Product(), selector=result)
            loader.add_xpath("name", './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
            loader.add_xpath("url", './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
            loader.add_xpath("price", './/*[@class="newPrice"]//span/text()')
            loader.add_value("sku", response.meta["sku"])
            loader.add_value("identifier", response.meta["sku"])

            price = loader.get_output_value("price")
            if not price:
                continue
            if best is not None and not (best.get_output_value("price") > price):
                continue
            if valid_price(response.meta["price"], price):
                best = loader

        if best:
            yield best.load_item()

Esempio n. 17

0

Mostra file

File: portonaquapet.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield the product on the page and request each of its options.

        On the first visit (no ``requested`` key in ``response.meta``) the
        form is re-submitted once per entry of the option dropdown, with this
        method as callback.  The selected option's label is appended to the
        product name.  The item is only yielded when a price was found.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
        choices = hxs.select(u'//select[@class="mpv_itemalst"]//option')
        if choices:
            if u'requested' not in response.meta:
                for choice in choices:
                    formdata = {
                        u'ctl00$MainContent$ItemAList': choice.select(u'./@value').extract()[0],
                        u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                        u'__EVENTARGUMENT': u'',
                    }
                    yield FormRequest.from_response(response, formname=u'aspNetForm',
                                                    formdata=formdata,
                                                    meta={u'requested': True},
                                                    dont_click=True, callback=self.parse_product)
            selected = choices.select(u'../option[@selected]/text()').extract()[0]
            name += u' %s' % selected.strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
        if not loader.get_output_value('price'):
            # No offer price on the page: fall back to the other price node.
            loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()

Esempio n. 18

0

Mostra file

File: surreypetsupplies_spider.py Progetto: 0--key/lib

    def parse_products(self, response):
        """Request the next listing page, then yield the products on this one.

        Products without a price node are yielded with a price of "0.0".
        """
        hxs = HtmlXPathSelector(response)

        next_links = hxs.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
        if next_links:
            next_url = self._get_products_url(response, next_links[0].extract())
            yield Request(next_url, callback=self.parse_products)

        for details in hxs.select('//div[@id="center-main"]//div[@class="details"]'):
            loader = ProductLoader(item=Product(), selector=details)

            loader.add_xpath("name", "a/text()")
            loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

            # Some listings have no price node at this location.
            price_nodes = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
            price = price_nodes[0].extract() if price_nodes else "0.0"
            loader.add_value("price", price)

            href = details.select("a/@href")[0].extract()
            loader.add_value("url", urljoin_rfc(get_base_url(response), href))

            yield loader.load_item()

Esempio n. 19

0

Mostra file

File: dolphinmusic_spider.py Progetto: 0--key/lib

 def parse_page(self, response):
     """Yield the products on the page, then paginate or refine.

     When the page has pagination links, the last one is followed if
     ``self._is_next`` accepts it.  Otherwise every "refine category" link
     from the sidebar is followed.
     """
     site = 'http://www.dolphinmusic.co.uk/'

     hxs = HtmlXPathSelector(response)
     for entry in hxs.select('//div[@class="item"]'):
         loader = ProductLoader(item=Product(), selector=entry)
         loader.add_xpath('name', 'h2/a/text()')
         href = entry.select('h2/a/@href').extract()[0]
         loader.add_value('url', urljoin_rfc(site, href, response.encoding))
         loader.add_xpath('price', 'div[@class="pricing"]/p[@class="price"]/text()')
         yield loader.load_item()

     pager_links = hxs.select('//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
     if pager_links:
         last_link = pager_links[-1]
         if self._is_next(last_link):
             yield Request(urljoin_rfc(site, last_link, response.encoding),
                           callback=self.parse_page)
     else:
         refine_links = hxs.select('//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract()
         for refine_link in refine_links:
             yield Request(urljoin_rfc(site, refine_link, response.encoding),
                           callback=self.parse_page)
Esempio n. 20

0

Mostra file

File: gmesupply_com.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield one product per combination of the page's dropdown options.

        Every ``<select>`` in an input box contributes a group of
        ``(label, surcharge)`` pairs.  ``multiply`` (defined outside this
        block; presumably it yields a combined name and price for each
        combination - confirm) drives the loop; the option name is appended to
        the product name and the surcharge added to the base price.
        """
        hxs = HtmlXPathSelector(response)

        opt_groups = []

        def fix_options(o):
            # ``o`` is an option label split on '$': [label] or [label, amount, ...].
            try:
                return (o[0], o[1].replace(',', ''))
            except IndexError:
                # The label had no '$', so the option carries no surcharge.
                return (o[0], '0')

        for option in hxs.select(u'//div[@class="input-box"]//select'):
            # Skip the first <option> of each dropdown.
            opt_list = option.select(u'./option[position() != 1]/text()').extract()
            opt_list = [o.replace('+$', '$').split('$') for o in opt_list]
            opt_groups.append([fix_options(o) for o in opt_list])

        for opt_name, opt_price in multiply(opt_groups):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('url', response.url)
            product_loader.add_xpath('name', u'//h2[@class="title"]/text()')
            product_loader.add_xpath('price', u'//span[contains(@class,"sale-price")]/text()')
            product_loader.add_xpath('sku', u'substring-after(//span[contains(@class,"meta-sku")]/text(),":")')
            product_loader.add_xpath('category', u'//ul[@class="breadcrumb"]/li[2]/a/@title')
            product_loader.add_xpath('image_url', u'//div[@class="teaser-large"]/img/@src')
            product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
            product_loader.add_value('shipping_cost', '')

            product = product_loader.load_item()
            product['name'] = (product['name'] + ' ' + opt_name).strip()
            product['price'] = product['price'] + Decimal(opt_price)
            yield product

Esempio n. 21

0

Mostra file

File: pixmania_spider.py Progetto: 0--key/lib

 def parse_products(self, response):
     """Yield the products of a listing page in grid or table layout.

     The grid layout is tried first; when it matches nothing the table
     layout is parsed instead.  Table rows for which no price can be found
     are skipped, so a row can never reuse the price of the previous row.
     """
     hxs = HtmlXPathSelector(response)
     products = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
     if products:
         for product in products:
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_xpath('url', 'div/h3/a/@href')
             # Prefer the <abbr> title over the link text when it is present.
             if product.select('div/h3/a/abbr/@title'):
                 loader.add_xpath('name', 'div/h3/a/abbr/@title')
             else:
                 loader.add_xpath('name','div/h3/a/text()')
             price = product.select('div/div/p[@class="prd-amount"]/strong/text()').extract()[0]
             loader.add_value('price', self._encode_price(price))
             yield loader.load_item()
     else:
         products = hxs.select('//*[@id="area-2"]//tr[@class="prd first"]')
         for product in products:
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_xpath('url', 'td/h3/a/@href')
             loader.add_xpath('name', 'td/h3/a/text()')
             # The price sits either directly in the cell or inside a <div>.
             prices = (product.select('td/p/strong/text()').extract()
                       or product.select('td/div/p/strong/text()').extract())
             if not prices:
                 # Previously ``price`` stayed unbound (first row) or kept the
                 # previous row's value here; skip the row instead.
                 continue
             loader.add_value('price', self._encode_price(prices[0]))
             yield loader.load_item()

Esempio n. 22

0

Mostra file

File: screwfix.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield the product described by the page's itemprop markup."""
        loader = ProductLoader(item=Product(), selector=HtmlXPathSelector(response))
        loader.add_value('url', response.url)
        for field, xpath in (('name', '//h1[@itemprop="name"]/text()'),
                             ('price', '//span[@itemprop="price"]/text()')):
            loader.add_xpath(field, xpath)

        yield loader.load_item()

Esempio n. 23

0

Mostra file

File: instawares.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield the product on the page; the SKU is its "Manufacturer ID"."""
        sku_xpath = ('//div[starts-with(@class, "specificationContent")]'
                     '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
        loader.add_xpath('price', '//li[@class="price"]//text()')
        loader.add_xpath('sku', sku_xpath)

        yield loader.load_item()

Esempio n. 24

0

Mostra file

File: beautycos_spider.py Progetto: 0--key/lib

 def parse_product(self, response):
     """Yield the product on the page, normalising its price text.

     Thousands dots are removed, the decimal comma becomes a dot, and only
     the text after the last ':' is kept.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('name', '//*[@id="header"]/text()')
     loader.add_value('url', response.url)

     price_text = ''.join(hxs.select('//*[@id="productdesc"]/font/font/text()').extract())
     price_text = price_text.replace('.','').replace(',','.')
     price = price_text.split(':')[-1] if price_text else price_text
     loader.add_value('price', price)
     yield loader.load_item()

Esempio n. 25

0

Mostra file

File: wasserstrom.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield the product on a detail page, including its model number.

        The SKU is the joined, stripped text found next to the "Model #:"
        label.
        """
        selector = HtmlXPathSelector(response)
        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')
        model_text = selector.select('//b[contains(text(), "Model #:")]/../text()').extract()
        item_loader.add_value('sku', ''.join(model_text).strip())

        yield item_loader.load_item()

Esempio n. 26

0

Mostra file

File: myflukestore.py Progetto: 0--key/lib

 def parse_products(self, hxs, response):
     """Yield one Product per listing entry found under ``hxs``.

     Each entry's link is made absolute against the response's base URL.
     """
     link_xpath = './/h3[@class="product_name"]/a/@href'
     for node in hxs.select('//h3[@class="product_name"]/../..'):
         item_loader = ProductLoader(selector=node, item=Product())
         item_loader.add_xpath('name', './/h3[@class="product_name"]/a/text()')
         relative_url = node.select(link_xpath).extract()[0]
         absolute_url = urljoin_rfc(get_base_url(response), relative_url)
         item_loader.add_value('url', absolute_url)
         item_loader.add_xpath('price', './/p[@class="price"]/text()')
         yield item_loader.load_item()

Esempio n. 27

0

Mostra file

File: wyeomans_spider.py Progetto: 0--key/lib

 def parse(self, response):
     """Return the Product for a known product URL.

     ``self.products`` is looked up by the response URL and yields the SKU
     for that page. Responses whose URL is not in that mapping produce no
     item (``None`` is returned).
     """
     # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
     if response.url not in self.products:
         return None
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('sku', self.products[response.url])
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
     loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
     return loader.load_item()

Esempio n. 28

0

Mostra file

File: beautyencounter.py Progetto: 0--key/lib

    def parse(self, response):
        """Yield one Product built from the page's ``itemprop`` name and price.

        The unused selector and base-URL locals were dropped: the loader
        works directly from ``response``.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//*[@itemprop="price"]/text()')
        loader.add_value('url', response.url)

        yield loader.load_item()

Esempio n. 29

0

Mostra file

File: axminster_co_uk.py Progetto: 0--key/lib

 def parse_product(self, response):
     """Yield the product on a detail page.

     The SKU is taken from ``response.meta['sku']``; name and price are
     read from the page. The unused selector and base-URL locals were
     dropped: the loader works directly from ``response``.
     """
     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath('name', '//div[@id="prodTITLE"]//h1/text()')
     product_loader.add_xpath('price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')
     product_loader.add_value('sku', response.meta['sku'])
     product_loader.add_value('url', response.url)
     yield product_loader.load_item()

Esempio n. 30

0

Mostra file

File: procamerashop_spider.py Progetto: 0--key/lib

    def parse_product(self, response):
        """Yield the product on a detail page, with its reference code as SKU.

        The SKU is the number following "Ref. Code:" in the first node
        matched by ``//div[@id="productDetail"]//p[1]``. It is left unset
        when no such node exists or the node carries no code.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@id="productDetail"]//h1[@class="productDetailTitle"]/text()')
        loader.add_xpath('price', '//div[@id="productDetail"]//span[contains(@class,"price")]/text()')
        # Guard the [0]: a page without the paragraph used to raise
        # IndexError and lose the whole product.
        ref_paragraphs = hxs.select('//div[@id="productDetail"]//p[1]')
        if ref_paragraphs:
            # Raw string so the regex escapes are not parsed as (invalid)
            # string escapes; the pattern itself is unchanged.
            loader.add_value('sku', ref_paragraphs[0].re(r'Ref\. Code: (\d+)'))

        yield loader.load_item()

Esempio n. 31

0

Mostra file

File: bmstores_spider.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product on a detail page.

        Identifier and SKU both come from ``response.meta['row']``
        (``PRODUCT_NUMBER``); brand, categories, name, price and image are
        read from the page.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        row = response.meta['row']

        item_loader = ProductLoader(item=Product(), response=response)
        for field in ('identifier', 'sku'):
            item_loader.add_value(field, row['PRODUCT_NUMBER'])
        item_loader.add_xpath(
            'brand', '//div[@class="product-detail-logo"]/a/img/@alt')
        # Breadcrumb entries without the first and the last one.
        crumbs = selector.select('//ul[@id="breadcrumbs"]/li/a/text()').extract()
        item_loader.add_value('category', crumbs[1:-1])
        item_loader.add_xpath('name', '//h1[@class="content-title"]/text()')

        # Prefer the price inside the <span>; fall back to the bare <h2> text.
        price_values = selector.select(
            '//article[@class="aside-content"]/h2/span/text()').extract()
        if not price_values:
            price_values = selector.select(
                '//article[@class="aside-content"]/h2/text()').extract()
        item_loader.add_value('price', price_values)

        item_loader.add_value('url', response.url)
        image_sources = selector.select(
            '//div[@class="product-detail-feature-img"]/a/img/@src').extract()
        if image_sources:
            image_url = urljoin_rfc(page_base, image_sources[0])
        else:
            image_url = ''
        item_loader.add_value('image_url', image_url)
        yield item_loader.load_item()

Esempio n. 32

0

Mostra file

File: thebedwarehousedirect.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the base product plus one product per priced option.

        With several option ``<select>`` boxes, every combination of their
        options (one choice per box) becomes a candidate; with a single box,
        each of its options is a candidate. A candidate whose surcharge is
        falsy (0) is skipped. Each remaining one is yielded as a copy of the
        base product with the option's identifier suffix, description and
        price added on.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select(
            '//div[@class="product-name"]/span/text()').extract()[0].strip()
        identifier = hxs.select('//input[@name="product"]/@value').extract()[0]
        price = hxs.select(
            '//form[@id="product_addtocart_form"]//span[@class="price"]/text()'
        ).extract()
        price = extract_price(price[0])

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        image_url = hxs.select('//img[@id="image-main"]/@src').extract()
        image_url = image_url[0] if image_url else ''
        loader.add_value('image_url', image_url)
        categories = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[1:]
        loader.add_value('category', categories)
        loader.add_value('url', response.url)

        product = loader.load_item()

        options_containers = hxs.select(
            '//select[contains(@class, "product-custom-option")]')

        if options_containers:
            options = []
            if len(options_containers) > 1:
                # Collect (id, name, surcharge) tuples per <select> box.
                per_select_options = []
                for options_container in options_containers:
                    element_options = []
                    for option in options_container.select(
                            'option[@value!=""]'):
                        option_id = option.select('@value').extract()[0]
                        option_name = option.select(
                            'text()').extract()[0].split(u'+\xa3')[0].strip()
                        option_price = option.select('text()').re(r'(\d+.\d+)')
                        option_price = extract_price(
                            option_price[0]) if option_price else 0
                        element_options.append(
                            (option_id, option_name, option_price))
                    per_select_options.append(element_options)

                for combined_option in itertools.product(*per_select_options):
                    final_option = {}
                    for option in combined_option:
                        final_option['desc'] = final_option.get(
                            'desc', '') + ' ' + option[1]
                        final_option['identifier'] = final_option.get(
                            'identifier', '') + '-' + option[0]
                        final_option['price'] = final_option.get(
                            'price', 0) + extract_price(option[2])
                    # Append once per combination. The append used to sit
                    # inside the loop above, which added the same dict once
                    # per select box and produced duplicate products.
                    options.append(final_option)
            else:
                for option in options_containers.select('option[@value!=""]'):
                    final_option = {}
                    final_option['desc'] = ' ' + option.select(
                        'text()').extract()[0].split('(+')[0].strip()
                    final_option['identifier'] = '-' + option.select(
                        '@value').extract()[0]
                    option_price = option.select('text()').re(r'\(\+(.*)\)')
                    final_option['price'] = extract_price(
                        option_price[0]) if option_price else 0
                    options.append(final_option)

            yield product
            for option in options:
                if not option['price']:
                    continue
                option_product = deepcopy(product)
                option_product['identifier'] = option_product[
                    'identifier'] + option['identifier']
                option_product[
                    'name'] = option_product['name'] + option['desc']
                option_product[
                    'price'] = option_product['price'] + option['price']
                option_product['sku'] = option_product['identifier']
                yield option_product
        else:
            yield product

Esempio n. 33

0

Mostra file

File: klossbutiken_se.py Progetto: oceancloud82/scraping

    def parse_product_list(self, response):
        """Yield every product on a listing page, then requests for its pages.

        The SKU is the longest run of digits, commas and dots in the product
        name (the first such run wins ties). A shipping cost of '49' is
        added when the collected price is below 1500, and stock is set to 0
        when the entry has no buy button.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        for node in selector.select('//div[@class="product-wrapper"]'):
            item_loader = ProductLoader(selector=node, item=Product())
            title = node.select('.//h3//text()').extract()[0]
            item_loader.add_value('name', title)

            longest = ''
            for candidate in re.findall(r"([\d,\.]+)", title):
                if len(candidate) > len(longest):
                    longest = candidate
            item_loader.add_value('sku', longest)

            images = node.select(
                './div[@class="product-image"]//img/@data-original').extract()
            if images:
                item_loader.add_value('image_url',
                                      urljoin_rfc(page_base, images[0]))

            raw_price = node.select(
                './div[@class="product-price"]//span[@class="price-amount"]/text()'
            ).extract()[0]
            raw_price = raw_price.strip().strip(' Kr').replace('.', '')
            item_loader.add_value('price', extract_price(raw_price))
            collected_prices = item_loader.get_collected_values('price')
            if collected_prices and collected_prices[0] < 1500:
                item_loader.add_value('shipping_cost', '49')

            if not node.select(
                    './div[@class="product-buttons"]/a[@class="buy-button"]'):
                item_loader.add_value('stock', 0)

            info_href = node.select(
                './div[@class="product-buttons"]/a[@class="button-info"]/@href'
            ).extract()[0]
            item_loader.add_value('url', urljoin_rfc(page_base, info_href))
            product_id = node.select(
                './div[@class="product-name"]//@data-productid').extract()[0]
            item_loader.add_value('identifier', product_id)
            yield item_loader.load_item()

        for href in selector.select(
                '//a[@class="paging-link-box"]/@href').extract():
            yield Request(urljoin_rfc(page_base, href),
                          callback=self.parse_product_list)

Esempio n. 34

0

Mostra file

    def parse_product(self, response):
        """Yield the product on a detail page (no shipping cost is available).

        The price is read from the "price-to" box when present, otherwise
        from any tax-inclusive price in the product box. When no price is
        loaded the item gets price '0.0' and stock '0'. The category comes
        from ``response.meta`` (empty string when absent).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # A missing "Manufacturer" row used to raise IndexError and drop the
        # whole product; fall back to an empty brand instead.
        brand = hxs.select(
            '//table[@id="product-attribute-specs-table"]//th[@class="label" and contains(text(), "Manufacturer")]/following-sibling::*/text()'
        ).extract()
        brand = brand[0] if brand else ''

        loader = ProductLoader(response=response, item=Product())

        price = hxs.select(
            '//div[@class="productBox"]//div[@class="price-box"]/p[@class="price-to"]/span[@class="price-including-tax"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//div[@class="productBox"]//div[@class="price-box"]//span[@class="price-including-tax"]/span[@class="price"]/text()'
            ).extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('price', '0.0')
            loader.add_value('stock', '0')

        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('brand', brand)
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

        image_url = hxs.select('//img[@id="zoom"]/@src').extract()
        if not image_url:
            image_url = hxs.select('//a[@id="ma-zoom1"]/@href').extract()
        loader.add_value('image_url', image_url)
        loader.add_value('category', response.meta.get('category', ''))
        yield loader.load_item()

Esempio n. 35

0

Mostra file

File: pixmania_spider.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product on a detail page.

        When the response turns out to be a result list (product title links
        are present) it is handed to ``self.parse`` instead. The identifier,
        also used as SKU, is the part of the last URL path segment before
        the first '-'. A missing price becomes "0.0"; missing name or brand
        become empty strings.
        """
        hxs = HtmlXPathSelector(response)

        products = hxs.select(
            '//form//div[contains(@class, "resultList")]/article'
            '//*[contains(@class, "productTitle")]/a/@href').extract()
        if products:
            for x in self.parse(response):
                yield x
            return

        price = hxs.select(
            '//div[@class="row"]//span[@class="currentPrice"]/ins[@itemprop="price"]/text()'
        ).extract()
        # Last matching price node, as before.
        price = price[-1] if price else "0.0"

        identifier = response.url.split('/')[-1].split('-')[0]

        # Explicit emptiness checks replace the bare ``except:`` clauses,
        # which also hid unrelated errors.
        main_name = hxs.select('//span[@itemprop="name"]/text()').extract()
        main_name = main_name[0].strip() if main_name else ''
        brand = hxs.select('//span[@itemprop="brand"]/text()').extract()
        brand = brand[0].strip() if brand else ''

        product_name = brand + ' ' + main_name
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()

        stock = hxs.select(
            '//div[contains(@class, "availability")]/div/strong[contains(@class, "available")]/i[@class="icon-ok"]'
        )

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('price', extract_price(price))
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('image_url', image_url)
        categories = hxs.select(
            '//div[@class="breadcrumb"]/ul/li/a/span/text()').extract()[1:]
        for category in categories:
            loader.add_value('category', category.encode(response.encoding))
        if not stock:
            loader.add_value('stock', 0)
        shipping_cost = hxs.select(
            '//div/strong[@class="weee"]/text()').extract()
        if shipping_cost:
            shipping_cost = extract_price(shipping_cost[0])
            loader.add_value('shipping_cost', shipping_cost)
        yield loader.load_item()

Esempio n. 36

0

Mostra file

    def parse_product(self, response):
        """Yield one Product per entry of the page's option selector.

        Each option's identifier is "<product_id>_<option value>". Its name
        is the page title plus the option text with "(Hors stock)" removed;
        the name is left empty when that option text is empty. Stock is set
        to 0 when ``data-quantity`` is '0', and a shipping cost of 19 is
        added for prices below 300.
        """
        hxs = HtmlXPathSelector(response)

        brand = hxs.select("//div[@id='fiche-produit-description-inspiration']/text()").extract()
        if brand:
            # The brand follows " par"; tolerate text without that marker
            # instead of raising IndexError.
            brand_parts = brand[0].split(' par')
            brand = brand_parts[1].strip() if len(brand_parts) > 1 else ''
        else:
            brand = ''

        product_name = ''.join(hxs.select('//*[@id="fiche-produit-description-titre1"]/text()').extract()).strip()
        img = hxs.select('//*[@id="product-main-image"]/@src').extract()
        # The image URL is the same for every option, so build it once.
        image_url = urljoin_rfc(get_base_url(response), img[0]) if img else None
        category = hxs.select('//ul[@class="breadcrumb"]//span[@itemprop="title"]/text()').extract()[:-1]
        product_identifier = hxs.select('//input[@name="product_id"]/@value').extract()[0]

        for option in hxs.select('//*[@id="product-option-selector"]//option'):
            loader = ProductLoader(item=Product(), selector=hxs)
            name = option.select('./text()').extract()[0].strip()
            name = ' '.join(s.strip() for s in name.split('\n'))
            name = name.replace('(Hors stock)', '').strip()
            if name != '':
                name = product_name + ' - ' + name
            price = option.select('./@data-price').extract()[0].replace(u'\u20ac', '').strip()
            price = extract_price_eu(price)
            identifier = option.select('./@value').extract()[0]
            loader.add_value('identifier', product_identifier + '_' + identifier)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('brand', brand)
            loader.add_value('price', price)
            stock = option.select('./@data-quantity').extract()[0]
            if stock == '0':
                loader.add_value('stock', 0)
            if img:
                loader.add_value('image_url', image_url)
            if price < 300:
                loader.add_value('shipping_cost', 19)
            loader.add_value('category', category)
            yield loader.load_item()

Esempio n. 37

0

Mostra file

File: stickersandgadgets.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product on a page, or one product per variant.

        Common fields (name, url, image, brand, categories, sku) are loaded
        once. If the page scripts contain ``ekmProductVariantData``, each
        variant with a non-empty ``selector`` is yielded with its own name
        parts, identifier ("<PRODUCT_NUMBER>-<item1 value>") and price.
        Otherwise a single product is yielded whose price is the page price
        multiplied by 1.2, or 0.00 when no price is found.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        name_xpath = '//div[contains(@class,"product-info")]//h1[@id="product-name"]/span[@itemprop="name"]/text()'

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('name', name_xpath)
        loader.add_value('url', response.url)
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])
            loader.add_value('image_url', image_url)
        loader.add_value('brand', 'Stickers & Gadgets')
        for category in hxs.select(
                '//span[contains(@itemtype,"Breadcrumb")]/a/span/text()'
        )[1:].extract():
            loader.add_value('category', category)
        product_number = response.meta.get('row').get('PRODUCT_NUMBER')
        loader.add_value('sku', product_number)

        item = loader.load_item()

        reg = re.compile(r'ekmProductVariantData.+?(\{.+\})', re.DOTALL)
        options = hxs.select('//script/text()').re(reg)
        if options:
            options = options[0].replace('\r\n', '')
            options = re.sub(".'item8.+?}}}}", "}}", options)
            # SECURITY(review): eval() executes text scraped from the remote
            # page. Left as is because the payload format is not visible
            # here; consider a safe literal/JSON parser.
            options = eval(options)
            for option in options['items']:
                if not option['selector']:
                    continue
                # Work on a copy so each yielded variant is its own item.
                # Reusing ``item`` made every variant the same mutated object.
                loader = ProductLoader(item=Product(item), selector=hxs)
                loader.add_xpath('name', name_xpath)
                for attr in option['selector']:
                    loader.add_value('name', attr['value'])
                identifier = product_number + '-' + option['properties']['item1']['value']
                loader.add_value('identifier', identifier)
                loader.add_value('price',
                                 option['properties']['item3']['innerHTML'])
                yield loader.load_item()
        else:
            loader.add_value('identifier', product_number)

            price = hxs.select(
                '//div[contains(@class,"product-info")]//span[@itemprop="price"]/@content'
            ).extract()
            if price:
                price = format_price(Decimal(price[0]) * Decimal('1.2'))
            else:
                price = Decimal('0.00')
            loader.add_value('price', price)

            yield loader.load_item()

Esempio n. 38

0

Mostra file

File: phones4u_spider.py Progetto: oceancloud82/scraping

    def parse_operator(self, response):
        """Yield one product per tariff row, then follow the next page link.

        Every product carries a ``TelecomsMeta`` record holding the device
        name, monthly cost (with the pound sign removed), tariff name, a
        contract duration of u'24', operator, channel and network
        generation ('4G' when the URL contains 'generation=4G', else '3G').
        """
        hxs = HtmlXPathSelector(response)
        meta = response.meta
        tariff_rows = hxs.select(
            '//table[contains(@class, "price-plans")]/tr[td[contains(@class, "col")]]'
        )
        handset = hxs.select('//h3[@class="handset-name"]/text()').extract()[0]
        handset = ' '.join(handset.split())
        for row in tariff_rows:
            item_loader = ProductLoader(item=Product(), selector=row)
            name_cells = row.select(
                'td[@class="col1" or @class="col2" or @class="col3" or @class="col4"]/child::*/text()'
            ).extract()
            # Collapse all whitespace runs in the joined cell texts.
            tariff_name = ' '.join(' '.join(name_cells).split())
            monthly_cost = row.select(
                'td[contains(@class, "col7")]/h4/text()').extract()[0]
            if 'generation=4G' in response.url:
                net_gen = '4G'
            else:
                net_gen = '3G'
            tariff_code = row.select(
                'td[contains(@class, "col7")]/div/form/input[@name="packageCode"]/@value'
            ).extract()[0]
            item_loader.add_value('identifier', tariff_code)
            item_loader.add_value(
                'name', response.meta['device_name'] + ' - ' + tariff_name)
            item_loader.add_value('url', response.url)
            item_loader.add_value('brand', handset.split()[0])
            item_loader.add_value(
                'price',
                row.select('td[contains(@class, "col6")]/h4/text()').extract())
            images = hxs.select(
                '//span[@class="handset-image"]/img/@src').extract()
            if images:
                item_loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response), images[0]))

            product = item_loader.load_item()
            metadata = TelecomsMeta()
            metadata['device_name'] = meta['device_name']
            metadata['monthly_cost'] = monthly_cost.replace(u'\xa3', '')
            metadata['tariff_name'] = tariff_name
            metadata['contract_duration'] = u'24'
            metadata['operator'] = meta['operator']
            # ``channel`` is not defined in this method; it must come from
            # an enclosing scope.
            metadata['channel'] = channel
            metadata['network_generation'] = net_gen
            product['metadata'] = metadata

            yield product

        next_links = hxs.select(
            '//a[i[contains(@class, "i-right-arrow-white")] and contains(@href, "page")]/@href'
        ).extract()
        if next_links:
            next_url = urljoin_rfc(get_base_url(response), next_links[0])
            yield Request(next_url, callback=self.parse_operator, meta=meta)

Esempio n. 39

0

Mostra file

File: bikediscount.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page and yield the product or its option variants.

        Nothing is yielded when brand or name cannot be read, when the page
        shows the "currently not available" notice, or when no price is
        found. When option blocks are present, one product per option is
        yielded; otherwise the base product is yielded. In both cases an
        item is only yielded if its identifier is not yet in
        ``self.identifiers``, and the identifier is then recorded there.
        """
        try:
            brand_name = response.xpath(
                '//span[@class="manufacturer"]/text()').extract()[0]
            name = response.xpath(
                '//div[@id="product-box"]//div[@class="title"]/text()'
            ).extract()[0].strip()
        except:
            # NOTE(review): bare except also hides errors other than a
            # missing node; consider narrowing it.
            self.log('No brand or name found: %s' % response.url)
            return

        # Skip products flagged as unavailable.
        if response.xpath(
                '//div[@class="no-valid-variants" and contains(text(), "this item is currently not available")]'
        ):
            return

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', brand_name + ' ' + name)
        # The manufacturer item number is stored obfuscated in the
        # data-xencoded attribute as "<key>:<data>".
        sku = response.xpath(
            '////div[@class="additional-product-no"]/@data-xencoded').extract(
            )
        if sku:
            sku = sku[0]
            h = HTMLParser.HTMLParser()
            key, data = sku.split(':', 1)
            key = int(key)
            data = h.unescape(data)
            # XOR decoding
            data = [ord(c) ^ key for c in data]
            data = ''.join([chr(c) for c in data])
            sku = re.search('Manufacturer Item no\. (.*)', data)
            if sku:
                sku = sku.group(1)
                # 'Hersteller Artikelnr: 20050/20051'
                product_loader.add_value('sku', sku)
        # product_loader.add_xpath('sku', u'//div[@class="additional-product-no" and contains(text(), "Manufacturer Item no.")]', re=r'Manufacturer Item no\. (.*)')
        identifier = response.xpath(
            '//input[@name="vw_id"]/@value').extract()[0]
        product_loader.add_value('identifier', identifier)

        # Prefer the "current-price" block; fall back to the price table.
        price = response.xpath(
            '//div[@class="current-price"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = response.xpath(
                '//table[@class="product-price"]//tr[@class="price"]/td/text()'
            ).extract()
        if price:
            price = price[0]
            product_loader.add_value('price', extract_price_eu(price))
        else:
            self.log('No product price found: %s' % response.url)
            return

        # Last breadcrumb link is used as the category.
        category = response.css('.uk-breadcrumb a::text').extract()[-1]

        product_loader.add_value('category', category)

        product_loader.add_value('brand', brand_name.strip())

        try:
            image_url = response.urljoin(
                response.xpath('//img[@itemprop="image"]/@src').extract()[0])
            product_loader.add_value('image_url', image_url)
        except:
            # Image is optional: a page without one is still yielded.
            # NOTE(review): bare except; consider narrowing it.
            pass
        product = product_loader.load_item()

        # NOTE(review): rrp is computed here but never used afterwards in
        # this method.
        rrp = extract_price_eu(''.join(
            response.xpath('//span[@class="retail-value"]/text()').extract()))
        rrp = str(rrp) if rrp > extract_price_eu(price) else ''

        options = response.xpath(
            '//div[contains(@id,"artikel_element_prices")]')
        if options:
            for opt in options:
                # Copy of the base product, overridden per option.
                p = Product(product)
                optname = opt.xpath(
                    './/meta[@itemprop="name"]/@content').extract()[0]
                p['name'] = optname
                p['price'] = extract_price(
                    opt.xpath('.//meta[@itemprop="price"]/@content').extract()
                    [0])
                # Option identifier: base identifier plus the suffix of the
                # option element's id.
                p['identifier'] = p['identifier'] + '-' + opt.xpath('@id').re(
                    'artikel_element_prices(.*)')[0]
                if p['identifier'] not in self.identifiers:
                    self.identifiers.append(p['identifier'])
                    yield p
        else:
            if product['identifier'] not in self.identifiers:
                self.identifiers.append(product['identifier'])
                yield product

Esempio n. 40

0

Mostra file

File: tedcarter.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page and yield one item, or one item per option.

        Without an "Options" select the page is treated as a single product
        identified by the ``sku`` input; nothing is yielded when that input
        is missing. With options, each ``<option>`` carrying a ``data-sku``
        becomes its own item, and options whose ``data-stock`` is '0' are
        skipped. In both cases the first name and url seen for an identifier
        are kept in ``self._products`` and reused for later items with the
        same identifier.
        """
        image_url = response.xpath('//div[@class="main-image"]/img/@src').extract()
        if image_url:
            # Last matching image, made absolute.
            image_url = response.urljoin(image_url[-1])
        category = response.xpath(u'//ol[@id="breadcrumbs"]/li/a/span[@itemprop="title"]/text()').extract()
        category = category[0] if category else ''
        brand = response.meta['brand']

        multiple_prices = response.xpath('//label[text()="Options"]/../select/option')
        if not multiple_prices:
            identifier = response.xpath('//input[@name="sku"]/@value').extract()
            if not identifier:
                return
            else:
                identifier = identifier[0]
            # Direct text of the price span first, then any descendant text.
            price = response.xpath('//div[@class="price"]/span[@class="text"]/text()').re(r'[\d\.,]+')
            if not price:
                price = response.xpath('//div[@class="price"]/span[@class="text"]//text()').re(r'[\d\.,]+')
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_xpath('name', '//div[@class="name"]/h1/text()')
            if image_url:
                product_loader.add_value('image_url', image_url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product_loader.add_value('url', response.url)
            product_loader.add_value('identifier', identifier)
            product_loader.add_value('price', price)
            item = product_loader.load_item()

            # Try to solve "same product" issue but different name, price and url
            # Will be collected the lower price
            if item['identifier'] in self._products:
                item['name'] = self._products[item['identifier']]['name']
                item['url'] = self._products[item['identifier']]['url']
            else:
                self._products[item['identifier']] = {
                    'name': item['name'],
                    'url': item['url'],
                }

            yield item
        else:
            for name_and_price in multiple_prices:
                product_loader = ProductLoader(item=Product(), selector=name_and_price)
                # Option name: page title followed by the option's own text.
                name = response.xpath('//div[@class="name"]/h1/text()').extract()[0]
                name += ' ' + name_and_price.select('text()').extract()[0].strip()
                try:
                    opt_id = name_and_price.select('@data-sku').extract()[0]
                except:
                    # Options without a data-sku are skipped.
                    # NOTE(review): bare except; consider narrowing it.
                    continue
                product_loader.add_value('name', name)
                if image_url:
                    product_loader.add_value('image_url', image_url)
                product_loader.add_value('brand', brand)
                product_loader.add_value('category', category)
                product_loader.add_value('url', response.url)
                product_loader.add_value('identifier', opt_id)
                # Price sources are tried in order until one yields a value.
                price = name_and_price.select('@data-price').extract()
                if not price:
                    price = name_and_price.select(u'./td[position()=2]/p[@class="now-table"]/text()').extract()
                if not price:
                    price = name_and_price.select(u'.//*[@itemprop="price"]/text()').extract()
                product_loader.add_value('price', price)

                # Options with data-stock '0' are not yielded.
                if name_and_price.select('@data-stock').extract() == ['0']:
                    continue
                item = product_loader.load_item()

                # Try to solve "same product" issue but different name, price and url
                # Will be collected the lower price
                if item['identifier'] in self._products:
                    item['name'] = self._products[item['identifier']]['name']
                    item['url'] = self._products[item['identifier']]['url']
                else:
                    self._products[item['identifier']] = {
                        'name': item['name'],
                        'url': item['url'],
                    }

                yield item

Esempio n. 41

0

Mostra file

 def parse_product(response):
     """Yield the page's product, or one product per variant table row.

     Base data comes from the ``nosto_product`` block. When the
     "basket-below" cart button is present, each row of the super-product
     table with an input name becomes a variant: its name is the base name
     plus every option text other than 'Yes', and its identifier is the
     input name without the 'super_group[' prefix and its last character.
     """
     nosto_field = '//div[@class="nosto_product"]/span[@class="%s"]/text()'
     identifier = response.xpath(nosto_field % 'product_id').extract_first()
     name = response.xpath(nosto_field % 'name').extract_first()
     price = response.xpath(nosto_field % 'price').extract_first()
     category = response.xpath(nosto_field % 'category').extract_first()
     # Drop the first path component of the category string.
     category = category.split('/')[1:]
     image_url = response.xpath(nosto_field % 'image_url').extract_first()

     has_variants = response.xpath('//a[@class="button btn-cart basket-below"]')
     if not has_variants:
         sku = response.xpath('//span[@class="product-ids"]/text()').extract_first()
         if sku:
             sku = sku.replace('Item code: ', '')
         single_loader = ProductLoader(item=Product(), response=response)
         single_loader.add_value('name', name)
         single_loader.add_value('identifier', identifier)
         single_loader.add_value('sku', sku)
         single_loader.add_value('category', category)
         single_loader.add_value('url', response.url)
         single_loader.add_value('image_url', response.urljoin(image_url))
         single_loader.add_value('price', price)
         yield single_loader.load_item()
         return

     for row in response.xpath('//*[@id="super-product-table"]/tbody/tr'):
         variant_name = name
         for option_text in row.xpath('./td[@fil-id!=""]/span/text()').extract():
             if option_text != 'Yes':
                 variant_name += ' ' + option_text
         variant_id = row.xpath('.//input/@name').extract_first()
         if not variant_id:
             continue
         variant_id = variant_id.replace('super_group[', '')[:-1]
         variant_sku = row.xpath('.//span[@class="sku"]/text()').extract_first()
         variant_price = row.xpath('.//span[@class="break-price"]/text()').extract_first()
         variant_loader = ProductLoader(item=Product(), response=response)
         variant_loader.add_value('name', variant_name)
         variant_loader.add_value('identifier', variant_id)
         variant_loader.add_value('sku', variant_sku)
         variant_loader.add_value('category', category)
         variant_loader.add_value('url', response.url)
         variant_loader.add_value('image_url', response.urljoin(image_url))
         variant_loader.add_value('price', variant_price)
         yield variant_loader.load_item()

Esempio n. 42

0

Mostra file

    def parse_product(self, response):
        """Yield a ``Product`` for every LEGO entry in the page's product form.

        Pages containing ``div#ProductContainer9`` are skipped entirely.
        Entries without an identifier, or whose name does not contain "lego"
        (case-insensitive), are skipped. The SKU is only set when a
        ``productID`` meta tag with digits is present.
        """
        hxs = HtmlXPathSelector(response)
        category = hxs.select(u'//div[contains(@class,"Breadcrumbs")]/nobr//span/text()').extract()
        category = u' > '.join(category)

        if hxs.select('//div[@id="ProductContainer9"]'):
            return

        for product in hxs.select(u'//form[@name="productForm"]//div[@itemscope="itemscope"]'):
            loader = ProductLoader(item=Product(), selector=product)

            product_ids = product.select(u'.//meta[@itemprop="productID"]/@content').extract()
            identifier = product.select(u'.//*[@itemprop="name"]/@id').re(u'ProductTitle-P(\d+)')
            if not identifier:
                identifier = product_ids
            if not identifier:
                continue
            loader.add_value('identifier', identifier[0])

            # The productID meta tag can be absent when the identifier came
            # from the title id, so index it only when it exists.
            sku = re.search(u'(\d+)', product_ids[0]) if product_ids else None
            if sku:
                loader.add_value('sku', sku.group(1))
            loader.add_value('url', response.url)

            name = ''.join(product.select(u'.//div[contains(@id,"ProductIntroduction-P")]//text()').extract()).strip()
            # An empty name never contains "lego", so this also covers the
            # missing-name case.
            if 'lego' not in name.lower():
                continue
            loader.add_value('name', name)

            price = product.select(u'.//meta[@itemprop="price"]/@content').extract()
            if price:
                # "1.234,56" -> "1234.56"
                price = price[0].strip().replace('.', '').replace(',', '.')
            else:
                price = '0.00'
            loader.add_value('price', price)
            loader.add_value('category', category)

            img = product.select('div/div//a[contains(@id, "ProductThumbnailImage")]/img/@src').extract()
            if not img:
                img = product.select(u'.//a[contains(@id,"ProductThumbnail")]/img/@src').extract()
            if img:
                loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))

            loader.add_value('brand', 'lego')
            yield loader.load_item()

Esempio n. 43

0

Mostra file

File: staples.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield one ``Product`` with ``StaplesMeta`` metadata for a product page.

        The SKU is taken from the "Fornecedor" list item (text after the
        colon) or, when that item is absent, from the ``lblRefereniaMBS``
        label. Prices are converted from "1.234,56" to "1234.56"; a missing
        price becomes '0.00'. A shipping cost of '3.00' is added when the
        loaded price is at most 48.99.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def after_colon(text):
            """Return the text after the first ':' (CR/LF/TAB removed), or None."""
            match = re.search(':(.*)', re.sub('[\r\n\t]', '', text))
            return match.group(1) if match else None

        loader = ProductLoader(item=Product(), response=response)
        identifier = hxs.select(
            '//div[contains(@class,"skuinfo")]//input[@id="hdfProduto"]/@value'
        ).extract()
        supplier = hxs.select(
            '//li[contains(text(),"Fornecedor")]//text()').extract()
        if supplier:
            # A supplier line without a colon used to raise AttributeError.
            sku = after_colon(supplier[0]) or ''
        else:
            sku = hxs.select(
                '//label[@id="lblRefereniaMBS"]/text()')[0].extract()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)

        name = hxs.select(
            '//label[@id="lblTituloProduto"]/text()').extract()[0].strip()
        try:
            loader.add_value('name', name)
        except Exception:
            # Fallback kept from the original: retry with a lossy UTF-8
            # decode. Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            loader.add_value('name', name.decode('utf-8', 'replace'))

        category = hxs.select('//div[@class="n03"]//a/text()').extract()
        loader.add_value('category', ' > '.join(category[:3]))

        image_url = hxs.select('//img[@id="productimage-0"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        brand_text = hxs.select(
            '//li[contains(text(),"Marca")]//text()').extract()
        if brand_text:
            brand = after_colon(brand_text[0])
            if brand is not None:
                loader.add_value('brand', brand)
        loader.add_value('url', response.url)

        price = hxs.select(
            '//label[@id="MainContent_ucPreco_lblProdutoPrecoFinal"]/text()'
        ).extract()
        price = price[0].replace('.', '').replace(
            ',', '.').strip() if price else '0.00'
        loader.add_value('price', price)

        if hxs.select('//span[@class="stock-red"]'):
            loader.add_value('stock', 0)

        price = loader.get_output_value('price')
        if price:
            # Compare Decimal with Decimal so the threshold is exact rather
            # than the nearest binary float to 48.99.
            if Decimal(price) <= Decimal('48.99'):
                loader.add_value('shipping_cost', '3.00')

        metadata = StaplesMeta()
        metadata['exclusive_online'] = 'Yes' if hxs.select(
            '//label[@id="lblTituloProduto"]/font[contains(text(),"Exclusivo Internet")]'
        ) else ''
        delivery_time = hxs.select(
            '//label[@id="lblEntregaPrevista"]/text()').extract()
        metadata['delivery_time'] = delivery_time[0] if delivery_time else ''
        promotion_price = hxs.select(
            '//label[@id="MainContent_ucPreco_lblPrecoProdutoAntes"]/text()'
        ).extract()
        # Same numeric normalisation as the main price, plus removal of the
        # euro sign.
        metadata['promotion_price'] = promotion_price[0].replace(
            '.', '').replace(',', '.').replace(u'\u20ac',
                                               '') if promotion_price else ''

        product = loader.load_item()
        product['metadata'] = metadata
        yield product

Esempio n. 44

0

Mostra file

    def parse_searchanise(self, response):
        """Turn one page of the Searchanise JSON feed into ``Product`` items.

        If the decoded body has no ``items`` key, the response is logged and
        the same URL is re-requested (up to 5 retries) after a 60 second
        ``time.sleep``, which blocks the calling thread. Entries with an empty
        ``product_code`` are skipped; prices containing a dot are cut to two
        decimals. When the page had items, the next page is requested with
        the offset advanced by 99.
        """
        payload = json.loads(response.body)
        entries = []
        try:
            entries = payload['items']
        except KeyError:
            self.log('Wrong response: {}'.format(str(payload)))

            attempt = response.meta.get('retries', 0)
            if attempt < 5:
                time.sleep(60)
                retry_meta = {
                    'offset': response.meta['offset'],
                    'retries': attempt + 1,
                }
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_searchanise,
                              meta=retry_meta)

        for entry in entries:
            code = entry['product_code']
            if not code:
                continue

            loader = ProductLoader(item=Product(),
                                   selector=HtmlXPathSelector())
            loader.add_value('identifier', code)
            loader.add_value('sku', code)

            amount = entry['price']
            if '.' in amount:
                # Keep the integer part and the first two digits that follow.
                pieces = amount.split('.')
                amount = '.'.join((pieces[0], pieces[1][:2]))
            loader.add_value('price', amount)

            loader.add_value('name', entry['title'])
            loader.add_value('url', entry['link'])
            loader.add_value('stock', '1')
            yield loader.load_item()

        if entries:
            next_offset = response.meta['offset'] + 99
            next_url = self.searchanise_url.format(self.searchanise_api,
                                                   next_offset)
            yield Request(next_url,
                          meta={'offset': next_offset},
                          callback=self.parse_searchanise)

Esempio n. 45

0

Mostra file

File: bikeinn_com.py Progetto: oceancloud82/scraping

    def parse_product(response):
        """Yield one ``Product`` per ``<option>`` of the ``talla_color`` select.

        Name, base identifier, image, category (last three breadcrumb entries)
        and price are read once from the page and shared by every option. The
        option's ``value`` is appended to the base identifier, and the option
        text is appended to the name unless it is the ``'- '`` placeholder.
        """
        name = response.xpath('//h1[@class="name"]/text()').extract()[0]
        identifier = response.xpath('//meta[@itemprop="sku"]/@content').extract()[0]
        image_url = response.xpath('//*[@id="zoom_01"]/@src').extract()
        category = response.xpath('//*[@id="wayProd"]//a/span/text()').extract()[-3:]
        price = response.xpath('//*[@id="total_dinamic"]/span/text()').extract()[0]
        price = extract_price(price)

        options = response.xpath('//*[@id="datesBuy"]//select[@name="talla_color"]/option')
        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            # Use .xpath() like the rest of this callback; .select() is only
            # a deprecated alias for it on Selector objects.
            p_name = option.xpath('./text()').extract()[0]
            p_name = name if p_name == '- ' else name + ' ' + p_name
            p_identifier = option.xpath('./@value').extract()[0]
            full_identifier = identifier + '_' + p_identifier
            product_loader.add_value('identifier', full_identifier)
            product_loader.add_value('name', p_name)
            product_loader.add_value('sku', full_identifier)
            if image_url:
                product_loader.add_value('image_url', response.urljoin(image_url[0]))
            product_loader.add_value('price', price)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', 'CamelBak')
            product_loader.add_value('url', response.url)
            # Yield directly rather than rebinding the loop variable.
            yield product_loader.load_item()

Esempio n. 46

0

Mostra file

    def parse_category(self, response):
        """Walk one listing page: emit items or follow-up requests, then paginate.

        For each listed product with a title:

        * textual price and a cache hit in ``self.product_info`` (by product
          id or URL): the item is completed from the cache and yielded;
        * textual price without a cache hit: the product page is requested
          with the partially filled loader in ``meta``;
        * price rendered as an image: the image URL is POSTed to
          ``self.ocr_url`` and handled by ``self.parse_price``.

        A product whose URL, product id or price image cannot be extracted
        (IndexError) is skipped. Afterwards either the next page is requested
        (a "next" link exists or more than 190 products were listed) or the
        current page is re-requested, at most 3 times.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        products = hxs.select(self.products_xpath)
        self.log('{} products found'.format(len(products)))

        for row in products:
            loader = ProductLoader(selector=row, item=Product())
            title = row.select(
                './/td[@class="product-title-wrap"]/a/text()').extract()
            if not title:
                continue
            loader.add_value('name', title)
            loader.add_value('stock', 1)

            try:
                href = row.select(
                    './/td[@class="product-title-wrap"]/a/@href').extract()[0]
                url = urljoin_rfc(base_url, href)
                loader.add_value('url', url)

                product_id = row.select(
                    './/input[contains(@name, "[product_id]")]/@value'
                ).extract()[0]
                price_num = row.select('.//span[@class="price-num"]/text()')

                if not price_num:
                    # The price is only available as an image: hand it to
                    # the OCR endpoint.
                    price_image = row.select(
                        './/span[@class="price"]//img/@src').extract()[0]
                    ocr_params = {
                        'url': price_image,
                        'resize': 200,
                        'blur': 1,
                        'mode': '7',
                        'format': 'float'
                    }
                    by_id = self.product_info.get(product_id, {}).get('price')
                    prev_price = by_id or self.product_info.get(url, {}).get('price')
                    yield Request(self.ocr_url,
                                  method="POST",
                                  body=urlencode(ocr_params),
                                  meta={
                                      'loader': loader,
                                      'product_id': product_id,
                                      'price_image': price_image,
                                      'prev_price': prev_price
                                  },
                                  callback=self.parse_price,
                                  dont_filter=True)
                    continue

                loader.add_value('price', ''.join(price_num.extract()))
                if product_id in self.product_info or url in self.product_info:
                    cached = (self.product_info.get(product_id)
                              or self.product_info.get(url))
                    loader.add_value('identifier', cached['sku'].upper())
                    loader.add_value('sku', cached['sku'])
                    self.products += 1
                    yield loader.load_item()
                else:
                    target = self.get_url(loader.get_output_value('url'))
                    yield Request(target,
                                  callback=self.parse_product,
                                  cookies={},
                                  meta={
                                      'proxy': self.get_proxy(),
                                      'loader': loader,
                                      'product_id': product_id,
                                      'dont_merge_cookies': True
                                  })
            except IndexError:
                continue

        next_links = hxs.select(
            '//div[@id="pagination_contents"]//a[@name="pagination"]'
            '[contains(@class, "next")]/@href').extract()

        retries = response.meta.get('retries', 0)
        if next_links or len(products) > 190:
            # Advance to the following page.
            page = int(response.meta.get('page', 1)) + 1
            extra_meta = {}
        elif retries < 3 and (response.status != 200 or not next_links
                              or len(products) == 0):
            # Re-request the same page, counting the attempt.
            page = int(response.meta.get('page', 1))
            extra_meta = {'retries': retries + 1}
        else:
            return

        next_url = add_or_replace_parameter(self.search_url, 'page', str(page))
        target = self.get_url(next_url)
        meta = {
            'proxy': self.get_proxy(),
            'dont_merge_cookies': True,
            'page': page
        }
        meta.update(extra_meta)
        yield Request(target,
                      callback=self.parse_category,
                      cookies={},
                      dont_filter=True,
                      meta=meta)

Esempio n. 47

0

Mostra file

File: blush_spider.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield one ``Product`` for a product page.

        The SKU doubles as identifier. The name is the ``h1`` text followed by
        the ``h2`` description when present. Shipping is 29 when the
        ``itemprop="price"`` value is below 295, otherwise 0. If a "price
        before" element exists, that value becomes the item price and the
        ``itemprop`` price is stored as ``metadata['SalesPrice']``.

        Categories come from the JSON in the breadcrumbs ``data-initobject``
        attribute (last three links, minus 'Forsiden'). When that attribute is
        missing the item is emitted without categories rather than failing.
        """
        loader = ProductLoader(item=Product(), response=response)
        sku = response.xpath('//span[@itemprop="sku"]/text()').extract()[0]
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)

        loader.add_xpath('brand', '//span[@itemprop="manufacturer"]/text()')

        name = response.xpath(
            '//h1[@itemprop="name"]/text()').extract()[0].strip()
        desc = ''.join(
            response.xpath(
                '//h2[@itemprop="description"]/text()').extract()).strip()
        if desc:
            name = name + ' ' + desc
        loader.add_value('name', name)
        loader.add_value('url', response.url)

        price = extract_price(
            response.xpath('//*[@itemprop="price"]/@content').extract_first())
        # Shipping is decided on the itemprop price, before it is possibly
        # replaced by the "before" price below.
        shipping_cost = 29 if price < 295 else 0

        price_before = response.css(
            '.product-main-info .product-price-before::text').extract_first()
        if price_before:
            sales_price = price
            price = extract_price_eu(price_before)
        else:
            sales_price = None
        loader.add_value('price', price)

        image_url = response.xpath(
            '//div[@class="swiper-slide"]/img/@data-src').extract()
        image_url = response.urljoin(image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)

        breadcrumbs = response.css(
            'nav.breadcrumbs::attr(data-initobject)').extract_first()
        categories = []
        # extract_first() returns None when the attribute is absent, and
        # json.loads(None) would raise TypeError.
        if breadcrumbs:
            links = json.loads(breadcrumbs)['model']['links'][-3:]
            categories = [link['title'] for link in links]
        if 'Forsiden' in categories:
            categories.remove('Forsiden')
        loader.add_value('category', categories)

        loader.add_value('shipping_cost', shipping_cost)

        item = loader.load_item()
        if sales_price:
            item['metadata'] = {'SalesPrice': sales_price}

        yield item

Esempio n. 48

0

Mostra file

File: lystfiskeren_dk.py Progetto: oceancloud82/scraping

 def parse_product(self, response):
     """Yield the base product, then one product per size option if any.

     The price is the first text found under the regular-price span, falling
     back to the special-price paragraph, converted from "1.234,56" to
     "1234.56". Size labels are pulled with regular expressions from the
     ``Product.Config(...)`` call embedded in the options wrapper; every
     variant shares the base URL and price.
     """
     hxs = HtmlXPathSelector(response)
     url = response.url
     name = hxs.select(
         '//div[@class="product-essential"]//div[@class="product-name"]/h1/text()'
     ).extract()[0]
     sku_parts = hxs.select(
         u'//div[@class="product-essential"]//div[@class="product-name"]//span[@class="sku"]/text()'
     ).extract()
     sku = ''.join(sku_parts).replace('Vare:', '').strip()

     regular = hxs.select(
         "//div[@class='product-essential']//span[@class='regular-price']/span[@class='price']//text()"
     ).extract()
     special = hxs.select(
         "//div[@class='product-essential']//p[@class='special-price']/span[@class='price']//text()"
     ).extract()
     # Regular price wins when both are present.
     price = (regular + special)[0]
     price = price.strip().replace('.', '').replace(',', '.')

     def make_item(item_name, item_sku, item_identifier):
         """Build an item that shares this page's URL and price."""
         loader = ProductLoader(item=Product(), selector=hxs)
         loader.add_value('name', item_name)
         loader.add_value('sku', item_sku)
         loader.add_value('identifier', item_identifier)
         loader.add_value('url', url)
         loader.add_value('price', price)
         return loader.load_item()

     yield make_item(name, sku, sku)

     opthtml = hxs.select(
         '//div[@class="product-essential"]//div[@id="product-options-wrapper"]'
     )
     if not opthtml:
         return
     config = re.search('Product.Config\((.+?)\);', opthtml.extract()[0])
     if not config:
         return
     size_block = re.search('rrelse","options":(.+?)]}],', config.group(1))
     if not size_block:
         return

     for sz in re.findall('label":"([^"]+)"', size_block.group(1)):
         # The identifier is the SKU variant with spaces removed from the
         # size label.
         yield make_item(name + ' - ' + sz,
                         sku + '-' + sz,
                         sku + '-' + sz.replace(' ', ''))

Esempio n. 49

0

Mostra file

    def parse(self, response):
        """Yield one ``Product`` per row of the CSV feed in the response body.

        Text columns are decoded from UTF-8 before being stored. The price is
        passed through ``float`` and back to ``str``. The shipping cost is
        stored exactly as read from the row.
        """
        # (item field, CSV column) pairs, in the order they are loaded.
        text_columns_before_price = (
            ('url', 'Product page URL'),
            ('name', 'Product name'),
            ('image_url', 'Image URL'),
            ('identifier', 'sku'),
            ('sku', 'Unique product code'),
        )
        text_columns_after_price = (
            ('category', 'Category'),
            ('brand', 'Brand'),
        )

        for row in csv.DictReader(StringIO(response.body)):
            product_loader = ProductLoader(item=Product(), response=response)

            for field, column in text_columns_before_price:
                product_loader.add_value(field, row[column].decode('utf-8'))

            amount = float(row['Price'].decode('utf-8'))
            product_loader.add_value('price', str(amount))

            for field, column in text_columns_after_price:
                product_loader.add_value(field, row[column].decode('utf-8'))

            product_loader.add_value('shipping_cost', row['Shipping cost'])
            yield product_loader.load_item()

Esempio n. 50

0

Mostra file

    def parse(self, response):
        """Follow category and pager links, then yield the listed products.

        A listed product is skipped when it has no name, is flagged as not
        buyable online, or lacks a URL, price or identifier (missing URL and
        price are logged as errors). Every loaded item's URL is recorded in
        ``self._urls``. The item is only yielded when its identifier is new
        or its price differs from the one stored in ``self.ids``.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Category links first, then pagination links.
        for link_xpath in ('//ul[@id="nav"]//li/a/@href',
                           '//div[@class="pager"]//a/@href'):
            for href in hxs.select(link_xpath).extract():
                yield Request(urljoin_rfc(base_url, href))

        products = hxs.select("//li[contains(@class, 'item')]")
        if not products:
            logging.error("ERROR!! NO PRODUCTS!! %s " % response.url)

        for product_el in products:
            names = product_el.select('.//h2[@class="product-name"]/a/text()').extract()
            if not names:
                continue

            not_buyable = product_el.select('div/div[@class="cant_buy_online"]/p/text()').extract()
            if not_buyable:
                continue

            name = names[0]

            hrefs = product_el.select('.//h2[@class="product-name"]/a/@href').extract()
            if not hrefs:
                logging.error("ERROR!! NO URL!! %s %s" % (response.url, name))
                continue
            url = urljoin_rfc(base_url, hrefs[0])

            prices = product_el.select('.//span[@class="price"]/text()').extract()
            if not prices:
                logging.error("ERROR!! NO PRICE!! %s %s" % (response.url, name))
                continue
            # The last price text on the element is the one used.
            price = extract_price(prices[-1])

            ids = product_el.select(u'.//div[@class="buy-now"]/a').re(r'/product/(\d+)/form_key')
            if not ids:
                ids = product_el.select(u'.//span[contains(@id, "product-price")]/@id').re(r'product-price-(\d+)')
            if not ids:
                continue
            identifier = ids[-1]

            loader = ProductLoader(item=Product(), selector=product_el)
            loader.add_value('identifier', identifier)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_xpath('image_url', u'.//a[contains(@class, "product-image")]//img/@src')
            item = loader.load_item()

            self._urls.append(item['url'])
            if identifier not in self.ids or price != self.ids[identifier]:
                self.ids[identifier] = price
                yield item

Esempio n. 51

0

Mostra file

File: worldsoccershop.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Build one item per option combination and request the stock check.

        Page-level fields (identifier, SKU, name, price, image, categories,
        brand) are extracted once. The attribute fieldset is expanded into
        every combination of attribute values with ``itertools.product`` and a
        ``Product`` is loaded per combination. The resulting
        ``{'item': ..., 'attributes': ...}`` dicts are passed in ``meta`` to an
        ``InventoryCheck.json`` request handled by ``self.parse_stock``.
        """
        identifier = response.xpath(
            "//div[@class='item-number']/text()").extract_first()
        # The raw item number is kept as the SKU; the identifier is the same
        # text with every 'a'/'A' character removed.
        sku = identifier
        identifier = re.sub(u'a', u'', identifier, flags=re.IGNORECASE)
        name = response.xpath(
            "//div[@class='product-title']/h1/text()").extract_first().strip()
        # Prefer the discounted price; fall back to the regular price.
        price = response.xpath(
            "//div[@class='price']//span[@class='disc-price']/text()").extract(
            )
        if not price:
            price = response.xpath(
                "//div[@class='price']/div[@class='regular-price']/span[@class]/text()"
            ).extract()
        if price:
            price = price[0].strip('$').replace(",", "")
        else:
            price = '0.00'
        price = Decimal(price)
        # convert using xe.com
        image_url = response.xpath(
            "//a[@id='mainImage']/img/@src").extract_first()
        # Breadcrumb texts without the first and last matched nodes.
        categories = response.xpath(
            '//div[@id="breadcrumbs-"]/ul/li/a//text()')[1:-1].extract()
        try:
            brand = response.xpath(
                '//b[contains(., "BRAND:")]/following-sibling::text()[1]'
            ).extract_first().title()
        except AttributeError:
            # extract_first() returned None: no brand text on the page.
            brand = ''

        attributes = response.xpath('//fieldset[@class="attributes"]//li')
        options = []
        # Maps an <option> value of the attrValue_1 select to its label text.
        option_names = {}
        for option in response.xpath(
                '//select[@name="attrValue_1"]/option[@value!=""]'):
            opt_val = option.xpath('./@value').extract()
            opt_name = option.xpath('./span/text()').extract()
            if opt_val and opt_name:
                option_names[opt_val[0]] = opt_name[0]
        # Collect, per attribute, the list of (attribute name, value) pairs.
        for attr in attributes:
            attr_name = attr.xpath(
                './/input[@name="attrName_1"]/@value').extract()
            if attr_name:
                attr_name = attr_name[0]
            else:
                continue
            attr_options = []
            attr_values = attr.xpath(
                './/select/option[@value!=""]/@value').extract()
            for attr_value in attr_values:
                attr_options.append((attr_name, attr_value))
            if not attr_values:
                # No select options: use the attrValue_1 input of this
                # attribute instead.
                attr_value = attr.xpath(
                    './/input[@name="attrValue_1"]/@value')[0].extract()
                attr_options.append((attr_name, attr_value))
            if attr_options:
                options.append(attr_options)
        # Every combination of one (name, value) pair per attribute.
        # NOTE(review): with no attributes collected, itertools.product()
        # still yields a single empty tuple, so the loop below runs once.
        options = itertools.product(*options)
        items = []
        for option in options:
            opt = [option_names.get(v, '') for _, v in option]
            opt = [o for o in opt if o]
            option_name = ' '.join(opt).strip()
            # Labels are mapped through SIZES_DICT (lower-cased key) for the
            # identifier suffix; unknown labels are kept as-is.
            opt = [SIZES_DICT.get(o.lower(), o) for o in opt if o]
            option_id = ':'.join(opt).strip()

            option_name = re.sub('size', '', option_name,
                                 flags=re.IGNORECASE).strip()
            # Size is the label of the last attribute's value in this
            # combination, with the word "size" removed.
            size = option_names.get(option[-1][-1],
                                    '') if option and option[-1] else ''
            size = re.sub('size', '', size, flags=re.IGNORECASE).strip()
            if option_name:
                product_name = name + ' (' + option_name + ')'
            else:
                product_name = name
            if option_id:
                product_identifier = identifier + u':' + option_id.strip(
                ).lower()
            else:
                product_identifier = identifier

            # NOTE(review): the combination tuple is passed as the loader's
            # second positional argument - confirm this is intended.
            loader = ProductLoader(Product(), option)
            loader.add_value('name', product_name)
            loader.add_value('url', response.url)
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', sku)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            for category in categories:
                loader.add_value('category', category)

            product = loader.load_item()
            product['metadata'] = {
                'size': SIZES_DICT.get(size.lower(), size).title()
            }
            # Players whose name (p[1]) occurs in the product name; the first
            # match supplies the player and number metadata.
            player = [
                p for p in self.players
                if p[1].lower() in product_name.lower()
            ]
            if player:
                product['metadata']['player'] = player[0][1].title()
                product['metadata']['number'] = player[0][2]

            item = {'item': product}
            item['attributes'] = ()
            for k, v in option:
                item['attributes'] += ((k, v), )
            items.append(item)

        # NOTE(review): `options` is an itertools.product object here, and
        # such objects are always truthy, so this branch appears to be
        # unreachable - confirm whether `if not items` was intended.
        if not options:
            loader = ProductLoader(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', sku)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            for category in categories:
                loader.add_value('category', category)

            product = loader.load_item()
            product['metadata'] = {}
            player = [p for p in self.players if p[1].lower() in name.lower()]
            if player:
                product['metadata']['player'] = player[0][1].title()
                product['metadata']['number'] = player[0][2]

            item = {'item': product}
            item['attributes'] = ()
            # First and second attrName_1/attrValue_1 input pairs on the page.
            item['attributes'] += ((
                response.xpath(
                    '//input[@name="attrName_1"]/@value')[0].extract(),
                response.xpath(
                    '//input[@name="attrValue_1"]/@value')[0].extract()), )
            item['attributes'] += ((
                response.xpath(
                    '//input[@name="attrName_1"]/@value')[1].extract(),
                response.xpath(
                    '//input[@name="attrValue_1"]/@value')[1].extract()), )
            items.append(item)
        product_id = response.xpath(
            '//input[@name="productId"]/@value')[0].extract()
        # The built items travel in meta to the stock-check callback.
        yield Request(
            'http://www.worldsoccershop.com/InventoryCheck.json?productId={}'.
            format(product_id),
            meta={'items': items},
            callback=self.parse_stock)

Esempio n. 52

0

Mostra file

    def get_products(self, hxs, url):
        """Return the products of a listing page, or the one product of a detail page.

        Listing entries are enriched with brand/category/SKU from
        ``self.prod_data`` and with ``sold_as`` metadata from ``self.sold_as``
        when their identifier is known there. If the listing produced nothing
        and the page has a ``productName fn`` heading, the page is parsed as a
        single product using ``url``; it is only returned when it has an
        identifier.

        :param hxs: selector for the page being parsed.
        :param url: URL of the page, used for the single-product case.
        :return: list of loaded items.
        """
        root_url = 'https://www.instawares.com'
        res = []
        products = hxs.select('//ol[starts-with(@class, "productListResultOL")]/li')
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './/div[@class="listResultsDescriptionDiv"]/a/text()')
            loader.add_xpath('identifier', './/div[@class="listResultsDescriptionDiv"]/dl/dd[1]/text()')
            loader.add_xpath('price', './/div[@class="listResultPrice"]/text()')
            loader.add_xpath('brand', './/div[@class="listResultsDescriptionDiv"]/dl/dt[contains(text(), "By")]/following-sibling::dd/text()')
            # Separate name so the ``url`` parameter is not overwritten.
            product_url = product.select('.//div[@class="listResultsDescriptionDiv"]/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(root_url, product_url))

            identifier = loader.get_output_value('identifier')
            if identifier in self.prod_data:
                row = self.prod_data[identifier]
                loader.add_value('brand', row['brand'].decode('utf8'))
                loader.add_value('category', row['category'].decode('utf8'))
                loader.add_value('sku', row['sku'].decode('utf8'))

            image_url = product.select('.//img[@class="productimagelarge"]/@src').extract()
            if image_url:
                loader.add_value('image_url', urljoin_rfc(root_url, image_url[0]))

            p = loader.load_item()
            if p['identifier'] in self.sold_as:
                metadata = TigerChefMeta()
                metadata['sold_as'] = self.sold_as[p['identifier']]
                p['metadata'] = metadata

            # Append the item that carries the metadata instead of calling
            # load_item() again and relying on it returning the same object.
            res.append(p)

        if not res and hxs.select('//h1[@class="productName fn"]/text()'):
            loader = ProductLoader(selector=hxs, item=Product(), spider_name=self.name)
            loader.add_value('url', url)
            loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
            loader.add_xpath('price', '//li[@class="price"]//text()')
            loader.add_xpath('sku', '//div[starts-with(@class, "specificationContent")]' +
                                    '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')
            loader.add_xpath('identifier', '//td[@itemprop="productID"]/text()')

            brand = hxs.select('//td[@class="brand"]/text()').extract()
            if not brand:
                self.log("ERROR brand not found")
            else:
                loader.add_value("brand", brand[0].strip())

            image_url = hxs.select('//div[@class="productImageDiv"]/a/img/@src').extract()
            if not image_url:
                self.log("ERROR image_url not found")
            else:
                loader.add_value("image_url", urljoin_rfc(root_url, image_url[0]))

            category = hxs.select('(//ol[@class="breadcrumbOL"]/a)[last()]/text()').extract()
            if not category:
                self.log("ERROR category not found")
            else:
                loader.add_value("category", category[0].strip())

            sold_as = hxs.select('//dl[@class="soldAsPackedAsDL"]/dd[1]/text()').extract()
            product = loader.load_item()

            metadata = TigerChefMeta()
            # Default packaging unit when the page does not state one.
            metadata['sold_as'] = sold_as[0].strip() if sold_as else '1 ea'
            product['metadata'] = metadata

            if product.get('identifier'):
                res.append(product)

        return res

Esempio n. 53

0

Mostra file

File: lekmer_dk.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Build a product item from a product page and yield it.

        The item starts from ``response.meta['product']`` when present,
        otherwise from a fresh ``Product``. The brand is always 'Lego', and
        stock is set to 0 when the success marker does not mention 'lager'.
        The loaded item is passed through ``self.add_shipping_cost`` before
        being yielded.
        """
        base_item = response.meta.get('product', Product())
        loader = ProductLoader(item=base_item, response=response)

        loader.add_xpath('identifier', '//input[@name="id"]/@value')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1//text()')

        # The SKU is the numeric part (three digits or more) of the
        # "Artikelnummer" entry.
        sku_xpath = (
            '//div[@class="basic-content-body"]//dt[contains(text(), "Artikelnummer")]'
            '/following-sibling::dd/text()')
        sku = response.xpath(sku_xpath).re(r'(\d{3}\d*)')
        if not sku:
            self.log('No SKU for %s' % (response.url))
        else:
            loader.add_value('sku', sku)

        # Category is the second-to-last breadcrumb entry.
        loader.add_xpath(
            'category',
            '//ul[contains(@class, "breadcrumbs")]/li[position()=last()-1]/a/text()'
        )

        image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_sources:
            loader.add_value('image_url', response.urljoin(image_sources[0]))

        price_fragments = response.xpath(
            'normalize-space(//*[@itemprop="price"]/text())').re(r'([\d.,]+)')
        loader.add_value('price', extract_price_eu(''.join(price_fragments)))
        loader.add_value('brand', 'Lego')

        stock_hits = response.xpath(
            '//div[@class="product-info"]//em[@class="mod-success"]//text()'
        ).re(r'lager')
        if not stock_hits:
            loader.add_value('stock', 0)

        yield self.add_shipping_cost(loader.load_item())

Esempio n. 54

0

Mostra file

    def parse_product(self, response):
        """Yield a product item scraped from a product page.

        Identifier and SKU are both the last URL path segment up to its
        first dot; the category comes from ``response.meta``.
        """
        hxs = HtmlXPathSelector(response)
        name = hxs.select(
            u'//span[@itemprop="name"]/text()').extract()[0].strip()
        product_loader = ProductLoader(item=Product(), selector=hxs)

        # Last path segment of the URL, without anything after its first dot.
        url_slug = response.url.rsplit('/', 1)[-1].partition('.')[0]

        product_loader.add_value('identifier', url_slug)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', name)
        product_loader.add_xpath('brand',
                                 u'//meta[@itemprop="brand"]/@content')
        product_loader.add_xpath('price', u'//span[@itemprop="price"]/text()')
        product_loader.add_value('sku', url_slug)
        product_loader.add_value('category', response.meta.get('category'))

        image_sources = hxs.select(
            u'//a/img[@class="product-img"]/@src').extract()
        if image_sources:
            absolute_image = urljoin_rfc(get_base_url(response),
                                         image_sources[0])
            product_loader.add_value('image_url', absolute_image)

        yield product_loader.load_item()

Esempio n. 55

0

Mostra file

File: leroymerlin.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Scrape a product page and request the first page of its reviews.

        The loaded product, carrying a ``KeterMeta`` with an empty review
        list, is handed to ``self.parse_review`` through the request meta
        rather than being yielded directly.
        """
        hxs = HtmlXPathSelector(response)

        base_url = get_base_url(response)

        product_id = hxs.select('//aside/span/span/text()')[0].extract()

        product_loader = ProductLoader(item=Product(), selector=hxs)

        name = hxs.select('//article/header/h1/text()').extract()
        product_loader.add_value('name', u'{}'.format(name[0].strip()))

        product_loader.add_value('url', response.url)

        product_loader.add_value('brand', response.meta.get('brand') or '')

        product_loader.add_value('identifier', '{}'.format(product_id))
        product_loader.add_value('sku', product_id)

        # Prefer the breadcrumb text nested in <i>; fall back to the plain
        # link text. Only a missing node (IndexError on the empty selector
        # list) triggers the fallback, so unrelated errors are not hidden.
        try:
            category = hxs.select(
                '//ul[@class="breadcrumb"]//a/i/text()')[-1].extract()
        except IndexError:
            category = hxs.select(
                '//ul[@class="breadcrumb"]//a/text()')[-1].extract()
        product_loader.add_value('category', category)

        image_url = hxs.select(
            '//img[@id="img-01"]/@data-zoom-image').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])
            product_loader.add_value('image_url', image_url)

        price = hxs.select(
            '//aside[contains(@class, "price-container")]/div/p[@class="price"]//text()'
        ).extract()
        product_loader.add_value('price',
                                 extract_price(price[0]) if price else '0.00')

        # No green call-to-action link in the checkout box: record stock 0.
        if not hxs.select(
                '//div[@class="infos-checkout"]/a[contains(@class,"cta green")]'
        ):
            product_loader.add_value('stock', 0)

        # Shipping cost is derived from the last "Poids" (weight) value.
        weight = hxs.select(
            '//section[@id="description-technique"]//th[@scope="row" and contains(text(),"Poids")]/following-sibling::td/text()'
        ).extract()
        if weight:
            product_loader.add_value('shipping_cost',
                                     self._get_shipping_cost(weight[-1]))

        product = product_loader.load_item()
        metadata = KeterMeta()
        metadata['reviews'] = []
        metadata['brand'] = response.meta.get('brand') or ''
        product['metadata'] = metadata

        reviews_url = 'http://www.leroymerlin.fr/v3/bazaarvoice/viewReviews.do?reflm={}&page={}&maxItems=4'
        yield Request(reviews_url.format(product_id, '1'),
                      meta={
                          'product': product,
                          'page': 1,
                          'product_url': response.url,
                          'product_id': product_id,
                          'reviews_url': reviews_url
                      },
                      callback=self.parse_review,
                      dont_filter=True)

Esempio n. 56

0

Mostra file

File: usn_feed.py Progetto: oceancloud82/scraping

    def parse(self, response):
        """Download the feed file over SFTP and request details for each row.

        The file chosen by ``get_last_file`` is saved to
        ``self.csv_file_path`` and read as CSV. Rows whose lower-cased
        'Item Code' is already in ``self.identifiers`` are skipped. Every
        other row becomes a product that is passed to ``self.parse_details``
        through the request meta.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        try:
            password = "******"
            username = "******"
            transport.connect(username = username, password = password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            files = sftp.listdir_attr()

            last = get_last_file(self.file_start_with, files)

            sftp.get(last.filename, self.csv_file_path)
        finally:
            # Always release the connection, even if listing or download
            # fails; closing the transport also closes the SFTP channel
            # opened on it.
            transport.close()

        with open(self.csv_file_path) as f:
            reader = UnicodeDictReader(f)
            for row in reader:
                item_code = row['Item Code']
                item_key = item_code.lower()
                # Emit each item code only once (case-insensitive).
                if item_key in self.identifiers:
                    continue

                self.identifiers.append(item_key)
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('identifier', item_code)
                loader.add_value('sku', item_code)
                loader.add_value('name', row['Product Description'])
                loader.add_value('price', row['RRP'])
                loader.add_value('brand', 'USN')
                loader.add_value('category', row['Category'])
                image_url = self.images.get(item_code)
                if image_url:
                    loader.add_value('image_url', image_url)
                loader.add_value('url', row['USN Url:'])
                product = loader.load_item()
                metadata = USNFeedMeta()
                # 'n/a' in the feed means no ASIN; store an empty string instead.
                metadata['ASIN'] =  row['ASIN'] if row['ASIN'].lower() != 'n/a' else ''
                product['metadata'] = metadata
                yield Request(product['url'], callback=self.parse_details, meta={'product': product, 'option_id': row['Option Value']}, dont_filter=True)

Esempio n. 57

0

Mostra file

File: discountedheating.py Progetto: oceancloud82/scraping

 def parse_product(self, response):
     """Yield a product using name and price from ``response.meta``.

     Identifier and SKU come from the "Act Ref" attribute row when it has a
     value, otherwise from the "Manufacturers Part No" row. When neither
     row has a value the item is yielded without identifier or SKU.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('name', response.meta['name'])
     loader.add_value('price', response.meta['price'])
     loader.add_value('url', response.url)
     mpn = hxs.select(
         '//*[@id="tab-attribute"]/table/tbody/tr[td/text()="Manufacturers Part No"]/td/text()'
     ).extract()
     sku = hxs.select(
         '//*[@id="tab-attribute"]/table/tbody/tr[td/text()="Act Ref"]/td/text()'
     ).extract()
     # Each XPath returns the text of every cell in the matching row, so
     # index 0 is the label and index 1 the value. Require at least two
     # entries before reading index 1, so a row with only a label does not
     # raise IndexError.
     if len(sku) > 1:
         loader.add_value('identifier', sku[1])
         loader.add_value('sku', sku[1])
     elif len(mpn) > 1:
         loader.add_value('identifier', mpn[1])
         loader.add_value('sku', mpn[1])
     yield loader.load_item()

Esempio n. 58

0

Mostra file

File: hargrovescycles.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product for the feed row carried in ``response.meta``.

        When the page embeds a ``spConfig`` options object, an item is
        yielded only for options whose SKU matches the row's part number
        (case-insensitive), using that option's final price and label.
        Without an options object, the base item is yielded once, with
        stock set to 0 if an out-of-stock marker is present.
        """
        hxs = HtmlXPathSelector(response)

        row = response.meta['row']

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', row['our part no.'].lower())
        loader.add_value('sku', row['our part no.'])
        loader.add_value('url', response.url)
        brand = hxs.select(
            '//img[@class="manufacturer_image"]/@title').extract()
        brand = brand[0].strip() if brand else ''
        loader.add_value('brand', brand)
        # The brand doubles as the category.
        loader.add_value('category', brand)
        loader.add_xpath('image_url',
                         '//div[@class="product-image"]//img/@src')
        loader.add_xpath('name', '//h1/text()')
        # Prefer the special price; whitespace is removed before parsing.
        price = extract_price(''.join(''.join(
            hxs.select(
                '//form//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()).split()))
        if not price:
            price = extract_price(''.join(''.join(
                hxs.select(
                    '//span[@class="regular-price"]//span[@class="price"]/text()'
                ).extract()).split()))
        loader.add_value('price', price)

        item = loader.load_item()

        metadata = HargrovesCyclesMeta()
        metadata['mpn'] = row['mpn']

        item['metadata'] = metadata

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.groups()[0])

            # Map each option product id to its labels, joined with ' - '.
            # The first join against '' leaves a leading ' - ', which is kept
            # because it ends up in the emitted product name.
            products = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        products[product] = ' - '.join(
                            (products.get(product, ''), option['label']))

            for option_id, option_name in products.iteritems():
                # Check for the correct options according to the google doc spreadsheet
                if product_data['products'][option_id]['sku'].upper(
                ) == item['sku'].upper():
                    item['price'] = product_data['childProducts'][option_id][
                        'finalPrice']
                    item['name'] += ' ' + option_name
                    stock = product_data['products'][option_id].get(
                        'saleable', False)
                    if not stock:
                        item['stock'] = 0
                    yield item
        else:
            out_of_stock = hxs.select(
                '//div[contains(@class, "product-info")]//span[@class="stock"]/span[@class="outstock"]'
            )
            if out_of_stock:
                item['stock'] = 0
            yield item

Esempio n. 59

0

Mostra file

    def parse(self, response):
        """Follow pagination links and yield one product per listing box.

        The SKU is the longest run of digits in the description text (the
        first one wins on ties). Shipping cost is 49 for prices under 1000
        and 0 otherwise.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Pagination: every paging link is parsed by this same callback.
        for page_href in hxs.select('//*[@id="paging"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, page_href), callback=self.parse)

        for box in hxs.select('//div[contains(@class,"produkt_boks")]'):
            product_loader = ProductLoader(item=Product(), selector=hxs)

            cart_link_class = box.select(
                './/div[@class="laegikurv"]/a/@class').extract()[0]
            if cart_link_class != 'laegivogn':
                product_loader.add_value('stock', 0)

            # The identifier is whatever follows '&vn=' in the product link.
            product_href = box.select(
                './/div[@class="desc"]/a/@href').extract()[0]
            product_loader.add_value('identifier',
                                     product_href.partition('&vn=')[2])

            image_src = box.select(
                './/div[@class="produkt_img"]//img/@src').extract()[0]
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_src))

            title = box.select('.//div[@class="desc"]/a/text()').extract()[0]
            product_loader.add_value('name', title)

            description = ''.join(
                box.select('.//div[@class="desc"]/text()').extract())
            digit_runs = re.findall(r"([\d]+)", description)
            longest_run = max(digit_runs, key=len) if digit_runs else ''
            product_loader.add_value('sku', longest_run)

            product_loader.add_value('url', urljoin_rfc(base_url, product_href))

            raw_price = box.select('.//span[@class="pris"]/text()').extract()[0]
            price = extract_price(raw_price.strip().strip('DKK '))
            product_loader.add_value('price', price)

            product_loader.add_value('shipping_cost', 49 if price < 1000 else 0)
            yield product_loader.load_item()

Esempio n. 60

0

Mostra file

File: tts-group_co_uk.py Progetto: oceancloud82/scraping

    def parse_product(self, response):
        """Fan out to variant pages, or yield the product on a variant page.

        On the first visit (no 'url' in ``response.meta``) to a page with
        variations, one request is issued per option, or per combination of
        options when there are several variation groups. Each request
        carries the original page URL in its meta. Otherwise a single
        product is built from the embedded product-data JSON and yielded
        with the original page URL.
        """
        identifier = response.xpath('//*[@id="pid"]/@value').extract_first()
        p_data = json.loads(
            response.xpath('//*[@id="product-data-{}"]/@value'.format(
                identifier)).extract_first())
        name = p_data['variant']
        stock = response.xpath('//*[@id="add-to-cart"]')
        price = p_data['price']
        brand = p_data['brand']
        # The first breadcrumb entry is dropped.
        category = response.xpath(
            '//div[@class="breadcrumb"]//a/span/text()').extract()[1:]
        image_url = response.xpath(
            '//img[@itemprop="image"]/@src').extract_first()

        variations = response.xpath('//div[@class="product-variations"]/ul/li')
        origin_url = response.meta.get('url', '')

        if variations and origin_url == '':
            combined_options = []
            for variant in variations:
                element_options = []
                vtype = variant.xpath('./@class').extract_first()
                if vtype == 'attribute':  # colour
                    vtitle = variant.xpath('./span/text()').extract_first()
                    if 'Select Colour' != vtitle:
                        self.log('Unknown vtitle: {} URL: {}'.format(
                            vtitle, response.url))
                        return
                    for option in variant.xpath(
                            './div/ul/li[@class="available"]'):
                        element_options.append(
                            option.xpath('./a/@href').extract_first())
                elif vtype == 'attribute variant-dropdown':
                    # The first <option> is skipped.
                    for option in variant.xpath(
                            './/select[@class="variation-select"]/option')[1:]:
                        element_options.append(
                            option.xpath('./@value').extract_first())
                else:
                    self.log('Unknown vtype: {} URL: {}'.format(
                        vtype, response.url))
                    return
                combined_options.append(element_options)

            if len(variations) > 1:
                for combined_option in itertools.product(*combined_options):
                    # Start from the first option's URL and merge in the
                    # query parameters of each following option.
                    variant_url = ''
                    for option in combined_option:
                        if variant_url == '':
                            variant_url = option
                        else:
                            params = dict(
                                urlparse.parse_qsl(
                                    urlparse.urlsplit(option).query))
                            # Dedicated loop names, so the product ``name``
                            # read above is not rebound.
                            for param_name, param_value in params.iteritems():
                                variant_url = add_or_replace_parameter(
                                    variant_url, param_name, param_value)
                    yield scrapy.Request(variant_url,
                                         callback=self.parse_product,
                                         meta={'url': response.url})
            else:
                for option in combined_options[0]:
                    yield scrapy.Request(option,
                                         callback=self.parse_product,
                                         meta={'url': response.url})

        else:
            if name == '':
                return
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('category', category)
            if brand != 'Unbranded':
                loader.add_value('brand', brand)
            loader.add_value('url', response.meta.get('url', response.url))
            # Only join when an image was found; joining a missing src
            # would store a non-image URL as the image.
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url))
            loader.add_value('price', price)
            # Shipping tiers: up to 10 -> 1.50, up to 200 -> 5.95, above
            # that no shipping cost is set.
            loaded_price = loader.get_output_value('price')
            if loaded_price <= 10:
                loader.add_value('shipping_cost', '1.50')
            elif loaded_price <= 200:
                loader.add_value('shipping_cost', '5.95')
            if not stock:
                loader.add_value('stock', 0)
            yield loader.load_item()