コード例 #1
0
ファイル: officespot_crawler.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield a loaded ``Product`` for each entry of the products list.

        Nothing is yielded for responses that are not ``HtmlResponse``
        instances.  Entries lacking a link, a name or a regular price are
        skipped instead of raising ``IndexError`` and aborting the remaining
        entries.  The ``sku`` field is only filled in when the extracted code
        is present in ``self.skus``.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        # The base URL does not change between entries, so compute it once.
        base_url = get_base_url(response)

        products = hxs.select(u'//ol[@id="products-list" and @class="products-list"]//li[contains(@class,"item")]')
        for product in products:
            url = product.select(u'.//h2[@class="product-name"]/a/@href').extract()
            name = product.select(u'.//h2[@class="product-name"]/a/text()').extract()
            price = product.select(u'.//div[@class="price-box"]/span[contains(@class,"regular-price")]/span[@class="price"]/text()').re(u'[\d\.,]+')
            if not (url and name and price):
                # Incomplete entry: skip it rather than crash the generator.
                continue

            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

            sku = product.select(u'.//small[child::b[contains(text(),"Product Code:")]]/text()').extract()
            if sku:
                # The first three characters of the stripped text are dropped
                # (presumably a separator following the label -- TODO confirm).
                sku = sku[0].strip()[3:]
                if sku in self.skus:
                    product_loader.add_value('sku', sku)

            name = name[0].strip()
            pack_size = product.select(u'.//small[child::b[contains(text(),"Pack Size:")]]/text()').extract()
            if pack_size:
                name += u' x' + pack_size[0].strip() + u'u.'
            product_loader.add_value('name', name)

            # Remove thousands separators before handing the price over.
            product_loader.add_value('price', price[0].replace(',', ''))
            yield product_loader.load_item()
コード例 #2
0
    def parse_products(self, response):
        """Follow the next-page link, then yield a ``Product`` per listing.

        A ``Request`` for the next page (handled by this same callback) is
        yielded first when a "right-arrow" link exists; afterwards one loaded
        item is yielded for every ``details`` block on the current page.
        """
        selector = HtmlXPathSelector(response)

        next_links = selector.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
        if next_links:
            next_href = next_links[0].extract()
            yield Request(self._get_products_url(response, next_href), callback=self.parse_products)

        for details in selector.select('//div[@id="center-main"]//div[@class="details"]'):
            item_loader = ProductLoader(item=Product(), selector=details)

            item_loader.add_xpath("name", "a/text()")
            item_loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

            # The price row is searched at any depth below the details block;
            # when it is missing the price falls back to "0.0".
            price_nodes = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
            item_loader.add_value("price", price_nodes[0].extract() if price_nodes else "0.0")

            href = details.select("a/@href")[0].extract()
            item_loader.add_value("url", urljoin_rfc(get_base_url(response), href))

            yield item_loader.load_item()
コード例 #3
0
ファイル: justvitamins.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` per ``div.Item`` block of a product page.

        Nothing is yielded for non-``HtmlResponse`` responses or when the
        product heading is missing.  Each item's name is the page heading,
        followed by the block's own direct text when there is any.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        heading = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
        if not heading:
            return

        base_name = heading[0].strip()
        page_url = urljoin_rfc(get_base_url(response), response.url)
        for variant in hxs.select('//div[@class="Item"]'):
            loader = ProductLoader(item=Product(), selector=variant)
            loader.add_value('url', page_url)

            # Direct text nodes of the block are appended to the heading.
            variant_text = ''.join(variant.select('./text()').extract())
            if variant_text:
                full_name = base_name + ' ' + variant_text.strip()
            else:
                full_name = base_name
            loader.add_value('name', full_name)

            # Two candidate price locations; both are fed to the loader.
            for price_xpath in ('.//span[@class="price"]/text()',
                                './div[@class="price"]/span/text()'):
                loader.add_xpath('price', price_xpath)

            yield loader.load_item()
コード例 #4
0
ファイル: testequipmentdepot.py プロジェクト: 0--key/lib
    def parse_products(self, hxs, response):
        """Yield a ``Product`` for each row of an "orderinfo" table.

        The column positions of the "Model", "Description" and "Price"
        headers are computed first; rows are then read by position.  Rows
        without a price or with an empty/missing name are skipped.

        :param hxs: selector for the page being parsed.
        :param response: the response the selector was built from; used for
            the fallback URL and for resolving relative model links.
        """
        model_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                               ' and text()="Model"]/preceding-sibling::*) + 1').extract()
        description_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                     ' and text()="Description"]/preceding-sibling::*) + 1').extract()
        price_pos = hxs.select('count(//td[starts-with(@class, "orderinfo")' +
                                ' and text()="Price"]/preceding-sibling::*) + 1').extract()

        if model_pos and description_pos and price_pos:
            # Keep only the part before the first '.' of each extracted count
            # (presumably the count is rendered like "3.0" -- TODO confirm).
            model_pos = model_pos[0].split('.')[0]
            description_pos = description_pos[0].split('.')[0]
            price_pos = price_pos[0].split('.')[0]

            products = hxs.select('//td[starts-with(@class, "orderinfo") and position()=%s \
                                   and not(text()="Model")]/..' % model_pos)
            for product in products:
                loader = ProductLoader(selector=product, item=Product())
                # Default to the page URL; prefer the model link when present.
                url = response.url
                model_url = product.select('.//td[starts-with(@class, "orderinfo") \
                                            and position()=%s]//a/@href' % model_pos).extract()
                if model_url:
                    url = urljoin_rfc(get_base_url(response), model_url[0])

                loader.add_value('url', url)
                loader.add_xpath('name', './/td[starts-with(@class, "orderinfo") and position()=%s]/text()' % description_pos)
                loader.add_xpath('price', './/td[starts-with(@class, "orderinfo") and position()=%s]//text()' % price_pos)

                # The name output may be missing altogether; guard before
                # calling .strip() so such rows are skipped, not fatal.
                name = loader.get_output_value('name')
                if not loader.get_output_value('price') or not name or not name.strip():
                    continue

                yield loader.load_item()
コード例 #5
0
ファイル: pondsuperstores_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield a single ``Product`` read from the ``ProductDetails`` block.

     The URL is the response URL; name, price and SKU come from XPath
     lookups below ``div#ProductDetails``.
     """
     item_loader = ProductLoader(item=Product(), response=response)
     item_loader.add_xpath("name", '//div[@id="ProductDetails"]//h2/text()')
     item_loader.add_value("url", response.url)
     xpath_fields = (
         ("price", '//div[@id="ProductDetails"]//em[contains(@class,"ProductPrice")]/text()'),
         ("sku", '//div[@id="ProductDetails"]//span[contains(@class,"VariationProductSKU")]/text()'),
     )
     for field, xpath in xpath_fields:
         item_loader.add_xpath(field, xpath)
     yield item_loader.load_item()
コード例 #6
0
ファイル: xpresshomemed.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield a ``Product`` when the page's product code matches the SKU.

        The SKU being searched for is read from ``response.meta['sku']``.
        The name is taken from the first matching "productname" XPath; when
        none yields text, it is derived from the last path segment of the URL
        plus the searched SKU.  Any "PURCHASE OPTIONS" text is appended.
        The item is only yielded when the page's ``product_code`` (dashes
        removed) starts with the searched SKU.
        """
        search_sku = response.meta['sku']
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        main_name = None
        name_xpaths = [u'//font[contains(@class,"productname")]/big/text()',
                       u'//font[contains(@class,"productname")]/text()']
        for name_xpath in name_xpaths:
            found = hxs.select(name_xpath).extract()
            if found:
                main_name = found[0].strip()
                break
        if not main_name:
            # Fall back to the file name part of the URL, tagged with the SKU.
            match = re.search(u'.*/(.*)\.htm', response.url)
            if match:
                main_name = match.groups()[0] + u' (%s)' % search_sku
            else:
                main_name = None

        options = hxs.select(u'//td//text()').re(u'PURCHASE OPTIONS: (.*)')
        if options:
            # main_name may still be None here (no heading and no URL match);
            # start from an empty string instead of raising TypeError.
            main_name = (main_name or u'') + u' %s' % options[0].strip()
        loader.add_value('name', main_name)
        loader.add_xpath('price', u'//td//font[contains(@class,"pricecolor") and not(ancestor::table[contains(@id,"related")])]/text()')
        loader.add_value('sku', search_sku)

        sku = hxs.select(u'//span[@class="product_code"]/text()').extract()
        if sku:
            sku = re.sub('-', '', sku[0])
            if sku.startswith(search_sku):
                yield loader.load_item()
コード例 #7
0
ファイル: plumbtraders_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` identified by the page's "Part number" value.

     The name is ``response.meta['name']`` (stripped) joined with the part
     number; the price comes from the ``no-vat`` paragraph.
     """
     selector = HtmlXPathSelector(response)
     item_loader = ProductLoader(item=Product(), response=response)

     part_texts = selector.select('//*[@id="product-information"]/table/tr[th/text()="Part number"]/td/span/text()').extract()
     part_number = ''.join(part_texts).strip()
     item_loader.add_value('identifier', part_number)

     name_parts = (response.meta['name'].strip(), part_number)
     item_loader.add_value('name', ' '.join(name_parts))
     item_loader.add_value('url', response.url)
     item_loader.add_xpath('price', '//*[@id="product-price"]/p[@class="no-vat"]/text()')
     yield item_loader.load_item()
コード例 #8
0
ファイル: bosch.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` whose name and SKU come from ``response.meta``.

     The price is the first span text under ``div#purchaseProc``; every '.'
     is removed from it and every ',' is then replaced by '.'.
     """
     selector = HtmlXPathSelector(response)
     item_loader = ProductLoader(item=Product(), selector=selector)
     item_loader.add_value('name', response.meta['name'])

     raw_price = selector.select(u'//div[@id="purchaseProc"]//span/text()').extract()[0]
     without_dots = raw_price.replace('.', '')
     item_loader.add_value('price', without_dots.replace(',', '.'))

     item_loader.add_value('sku', response.meta['sku'])
     item_loader.add_value('url', response.url)
     yield item_loader.load_item()
コード例 #9
0
ファイル: instawares.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` built from XPath lookups on the page.

        The SKU is the cell following the "Manufacturer ID" cell inside a
        ``specificationContent`` block.
        """
        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('url', response.url)

        sku_xpath = ('//div[starts-with(@class, "specificationContent")]' +
                     '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')
        for field, xpath in (('name', '//h1[@class="productName fn"]/text()'),
                             ('price', '//li[@class="price"]//text()'),
                             ('sku', sku_xpath)):
            item_loader.add_xpath(field, xpath)

        yield item_loader.load_item()
コード例 #10
0
ファイル: bosch.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` whose name and SKU come from ``response.meta``.

     The price is the text of the first cell that follows a cell containing
     "Cena" in the ``produktdetails`` table, with ',' replaced by '.'.
     """
     selector = HtmlXPathSelector(response)
     item_loader = ProductLoader(item=Product(), selector=selector)
     item_loader.add_value('name', response.meta['name'])

     price_cells = selector.select(u'//table[@class="produktdetails"]//tr/td[preceding-sibling::td[contains(text(),"Cena")]]/text()').extract()
     raw_price = price_cells[0]
     item_loader.add_value('price', raw_price.replace(',', '.'))

     item_loader.add_value('sku', response.meta['sku'])
     item_loader.add_value('url', response.url)
     yield item_loader.load_item()
コード例 #11
0
ファイル: bhl_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` identified by the manufacturer's part number.

     Name and price are taken from ``response.meta``; the part number found
     on the page is used as identifier and appended to the name.
     """
     selector = HtmlXPathSelector(response)
     item_loader = ProductLoader(item=Product(), response=response)

     part_texts = selector.select('//div[@class="span-4 productcolumn productleftcol"]/h4[text()="Manufacturers Part No:"]/span/text()').extract()
     part_number = ''.join(part_texts)

     item_loader.add_value('identifier', part_number)
     item_loader.add_value('url', response.url)
     name_parts = (response.meta['name'], part_number)
     item_loader.add_value('name', ' '.join(name_parts))
     item_loader.add_value('price', response.meta['price'])
     yield item_loader.load_item()
コード例 #12
0
ファイル: screwfix_com.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` for the page; the SKU is ``response.meta['sku']``.

     The price is the first decimal number (digits, a dot, digits) matched
     in the ``product_price`` paragraph.
     """
     selector = HtmlXPathSelector(response)

     loader = ProductLoader(item=Product(), selector=selector)
     loader.add_xpath('name', '//h1[@id="product_description"]/text()')

     price_matches = selector.select('//p[@id="product_price"]/span/text()').re('(\d+(?:\.\d+))')
     loader.add_value('price', price_matches[0])

     loader.add_value('sku', response.meta['sku'])
     loader.add_value('url', response.url)
     yield loader.load_item()
コード例 #13
0
ファイル: wasserstrom.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` built from the product page.

        The SKU is the stripped concatenation of the text nodes that share a
        parent with the bold "Model #:" label.
        """
        selector = HtmlXPathSelector(response)
        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')

        model_texts = selector.select('//b[contains(text(), "Model #:")]/../text()').extract()
        item_loader.add_value('sku', ''.join(model_texts).strip())

        yield item_loader.load_item()
コード例 #14
0
ファイル: wyeomans_spider.py プロジェクト: 0--key/lib
 def parse(self, response):
     """Return a ``Product`` for a response whose URL is in ``self.products``.

     ``self.products`` is looked up by ``response.url`` and the stored value
     is used as the SKU.  ``None`` is returned for any other URL.
     """
     # ``in`` replaces the deprecated ``dict.has_key``.
     if response.url not in self.products:
         return None

     sku = self.products[response.url]
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('sku', sku)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
     loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
     return loader.load_item()
コード例 #15
0
ファイル: axminster_co_uk.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` for the page; the SKU is ``response.meta['sku']``.

     Name and price are read from the ``prodTITLE`` and ``prodDETAILS``
     blocks respectively.
     """
     # NOTE(review): the selector and base URL are computed but not used
     # below; the calls are kept so this method performs the same work.
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     loader = ProductLoader(item=Product(), response=response)
     for field, xpath in (('name', '//div[@id="prodTITLE"]//h1/text()'),
                          ('price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')):
         loader.add_xpath(field, xpath)
     loader.add_value('sku', response.meta['sku'])
     loader.add_value('url', response.url)
     yield loader.load_item()
コード例 #16
0
ファイル: aquacadabra_spider.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` built from XPath lookups on the page.

        Name is the ``h1`` text, price the ``product_price`` span and SKU the
        ``product_code`` cell; the URL is the response URL.
        """
        # NOTE(review): neither of these two values is used below; the calls
        # are kept so this method performs the same work.
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_xpath("name", "//h1/text()")
        item_loader.add_value("url", response.url)
        for field, xpath in (("price", '//span[@id="product_price"]/text()'),
                             ("sku", '//td[@id="product_code"]/text()')):
            item_loader.add_xpath(field, xpath)
        yield item_loader.load_item()
コード例 #17
0
ファイル: wesco_spider.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield one ``Product`` when the description span has any text.

        All text nodes of the description span are passed as the name; the
        SKU is ``response.meta['sku']``.  Nothing is yielded when the span
        yields no text.
        """
        selector = HtmlXPathSelector(response)

        name_parts = selector.select('//span[@id="ProductDetail1_lblDescription"]//text()').extract()
        if not name_parts:
            return

        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_value('name', name_parts)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('price', '//*[@class="yourPriceText"]//text()')
        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
コード例 #18
0
ファイル: bosch_diy.py プロジェクト: 0--key/lib
 def parse(self, response):
     """Yield one ``Product`` per row of ``bosch_uk_diy.csv``.

     The response is not inspected: every field comes from the CSV columns
     ``name`` (decoded as UTF-8), ``price``, ``sku`` and ``bosch`` (URL).
     """
     selector = HtmlXPathSelector()
     csv_path = os.path.join(HERE, 'bosch_uk_diy.csv')
     with open(csv_path) as csv_file:
         for record in csv.DictReader(csv_file):
             item_loader = ProductLoader(item=Product(), selector=selector)
             item_loader.add_value('name', unicode(record['name'],'utf-8'))
             # Remaining item fields map directly onto CSV columns.
             for field, column in (('price', 'price'),
                                   ('sku', 'sku'),
                                   ('url', 'bosch')):
                 item_loader.add_value(field, record[column])
             yield item_loader.load_item()
コード例 #19
0
ファイル: svh24.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` read from the page's ``itemprop`` markup.

        The price text has ',' replaced by '.'; the SKU is
        ``response.meta['sku']``.
        """
        selector = HtmlXPathSelector(response)

        item_loader = ProductLoader(item=Product(), selector=selector)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')

        price_texts = selector.select(u'//span[@itemprop="price"]/text()').extract()
        item_loader.add_value('price', price_texts[0].replace(',', '.'))

        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
コード例 #20
0
ファイル: amazon.py プロジェクト: 0--key/lib
    def parse_product(self, response):
        """Yield one ``Product`` read from the page's ``buying`` block.

        The price is the first ``priceLarge`` text with ',' replaced by '.';
        the SKU is ``response.meta['sku']``.
        """
        selector = HtmlXPathSelector(response)

        item_loader = ProductLoader(item=Product(), selector=selector)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('name', u'//div[@class="buying"]/h1[@class="parseasinTitle"]/span[@id="btAsinTitle"]/text()')

        price_texts = selector.select(u'//div[@class="buying"]/table[@class="product"]//b[@class="priceLarge"]/text()').extract()
        raw_price = price_texts[0]
        item_loader.add_value('price', raw_price.replace(',', '.'))

        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
コード例 #21
0
ファイル: cascadebitz_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield a ``Product`` for each priced row of the ``pricecart`` table.

     Rows in which no "Our Price" amount or no ``pid=`` cart link can be
     found are skipped; previously such a row raised ``IndexError`` and
     aborted all remaining rows.
     """
     hxs = HtmlXPathSelector(response)

     for item in hxs.select('//table[@class="pricecart"]//tr'):
         price = item.select('.//td[@class="cellPrice"]/text()').re(r'Our Price\s+.?(\d+(?:\.\d+))')
         sku = item.select('.//td[@class="cellAddToCart"]/a/@href').re(r'pid=([0-9a-f]+)')
         if not price or not sku:
             # Not a product row (or an incomplete one): nothing to load.
             continue

         loader = ProductLoader(item=Product(), selector=item)
         loader.add_xpath('name', './/span[@class="spanDescription"]/text()')
         loader.add_value('url', response.url)
         loader.add_value('price', price[0])
         loader.add_value('sku', sku[0])
         yield loader.load_item()
コード例 #22
0
ファイル: globaltestsupply.py プロジェクト: 0--key/lib
 def parse_products(self, hxs, response):
     """Yield a ``Product`` for each body row of ``productCategoriesTable``.

     :param hxs: selector for the page being parsed.
     :param response: response used to resolve each row's relative link.

     Rows without a red price get the price '0'.
     """
     for row in hxs.select('//table[@id="productCategoriesTable"]//tbody//tr'):
         item_loader = ProductLoader(item=Product(), selector=row)
         item_loader.add_xpath('name', './/a/strong/text()')

         href = row.select('.//a/strong/../@href').extract()[0]
         item_loader.add_value('url', urljoin_rfc(get_base_url(response), href))

         price_nodes = row.select('.//span[@class="red"]/strong/text()')
         if not price_nodes:
             item_loader.add_value('price', '0')
         else:
             item_loader.add_xpath('price', './/span[@class="red"]/strong/text()')
         yield item_loader.load_item()
コード例 #23
0
ファイル: ncoi_nl.py プロジェクト: 0--key/lib
    def parse_course(self, response):
        """Build a course loader and work out the total course price.

        The name is the path from ``response.meta['path']`` extended with the
        page's ``h1`` texts, joined with ' / '.  Two price components are
        looked up in the ``kostenspecificatie`` block: ``price1`` (course
        fee) and ``price2`` (study material, treated as optional).  When the
        table lookups fail, free-text lines are scanned for known labels.

        NOTE(review): in the visible code ``price`` is computed but never
        added to ``product_loader`` and no item is yielded -- the remainder
        of the method may be missing; confirm against the full file.
        """
        hxs = HtmlXPathSelector(response)

        # Copy the inherited path so the list stored in meta is not mutated.
        path = response.meta['path'][:]
        path.extend(hxs.select(u'//h1/text()').extract())

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('name', u' / '.join((p.strip() for p in path)))
        product_loader.add_value('url', response.url)

        costs = hxs.select(u'//div[@id="kostenspecificatie"]')
        # Preferred source: second cell of the table row carrying the label.
        price1 = costs.select(u'.//tr/td[contains(text(),"Cursusgeld")]/../td[position()=2]/text()').extract()
        price2 = costs.select(u'.//tr/td[contains(text(),"Studiemateriaal")]/../td[position()=2]/text()').extract()

        pricetxt = costs.select(u'./p/text()').extract()
        # Just because one course has price in DIVs not single P
        pricetxt.extend(costs.select(u'./div/text()').extract())

        if not price1 or not price2:
            # Fallback: scan the free-text lines for "<label>: <amount> (..."
            # and keep the text between the first ':' and the first '('.
            for line in pricetxt:
                if 'Cursusgeld' in line:
                    price1 = [line.split(':')[1].split('(')[0]]

                # Just because one course specifies price with multiple P tags
                elif 'Module C1 en C2' in line:
                    # Take the first number following the euro sign.
                    price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]
                elif 'Inschrijfgeld' in line:
                    price1 = [line.split(':')[1].split('(')[0]]
                elif 'Trainingskosten' in line:
                    price1 = [line.split(':')[1].split('(')[0]]
                elif 'Studiemateriaal' in line:
                    price2 = [line.split(':')[1].split('(')[0]]
                elif 'Trainingsmateriaal' in line:
                    price2 = [line.split(':')[1].split('(')[0]]

            if not price1:
                # Last resort: lines that start with a euro sign, with "20"
                # followed by ':' at index 4 (presumably a year prefix such
                # as "2013:" -- TODO confirm), or with "Opleiding".
                for line in pricetxt:
                    line = line.strip()
                    if line.startswith(u'\u20ac'):
                        price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]
                    elif line.startswith('20') and line[4] == ':':
                        price1 = [line.split(':')[1]]
                    elif line.startswith('Opleiding'):
                        price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]

        try:
            # This seems to be optional
            if not price2:
                price2 = ['0']
            # Normalise both amounts before summing: drop the euro sign, '-'
            # and '.', then turn ',' into '.' so float() can parse them.
            price = float(price1[0].replace(u'\u20ac', '').replace('-', '').replace('.', '').replace(',', '.')) \
                    + float(price2[0].replace(u'\u20ac', '').replace('-', '').replace('.', '').replace(',', '.'))
        except Exception, e:
            # Any failure (missing price1, unparsable text) is only logged.
            logging.error('Bad price [%s] (%s)' % (pricetxt, e))
コード例 #24
0
ファイル: pondplanet_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product``, preferring the special price when present.

     The price is the first ``productSpecialPrice`` text, or otherwise the
     first text of the right-aligned ``h1``.  The SKU is the value of the
     ``products_id`` input.
     """
     selector = HtmlXPathSelector(response)

     price_texts = (selector.select('//h1/span[@class="productSpecialPrice"]/text()').extract()
                    or selector.select('//td[@align="right"]/h1/text()').extract())

     item_loader = ProductLoader(item=Product(), response=response)
     item_loader.add_xpath('name', '//td[@valign="top" and not(@align="right")]/h1/text()')
     item_loader.add_value('url', response.url)
     item_loader.add_value('price', price_texts[0])
     item_loader.add_xpath('sku', '//input[@name="products_id"]/@value')
     yield item_loader.load_item()
コード例 #25
0
ファイル: heatandplumb_spider.py プロジェクト: 0--key/lib
 def parse_product(self, response):
     """Yield one ``Product`` identified by the part number found on the page.

     The part number is looked up in the heading's ``<i>`` element first and
     in the ``itemprop="identifier"`` span otherwise.  The name is
     ``response.meta['name']`` followed by that part number.
     """
     selector = HtmlXPathSelector(response)
     item_loader = ProductLoader(item=Product(), response=response)

     part_numbers = (selector.select('//div[@class="prod_info_container"]/h1/i/text()').extract()
                     or selector.select('//li/span[@itemprop="identifier"]/text()').extract())

     full_name = ' '.join((response.meta['name'], part_numbers[0]))
     item_loader.add_value('identifier', part_numbers[0])
     item_loader.add_value('name', full_name)
     item_loader.add_value('url', response.url)
     item_loader.add_xpath('price', '//td[@class="radioPadding" and @width="90" and @bgcolor="#f2f2f2" and @align="center"]/text()')
     yield item_loader.load_item()
コード例 #26
0
ファイル: gitarhuset.py プロジェクト: 0--key/lib
    def parse_node(self, response, node):
        """Load a ``Product`` from one XML node.

        Returns ``None`` for non-``XmlResponse`` responses, the loaded item
        when the loader produced a price, and an empty ``Product`` otherwise.
        The name/price pair is logged as JSON on every call.
        """
        if not isinstance(response, XmlResponse):
            return

        item_loader = ProductLoader(item=Product(), selector=node)
        item_loader.add_xpath('url', u'./product-url/text()')
        item_loader.add_xpath('name', u'./title/text()')

        price_text = node.select(u'./price/text()').extract()[0].replace(',', '.')
        item_loader.add_value('price', price_text)

        log_record = {'name': item_loader.get_output_value('name'), 'price': price_text}
        log.msg(json.dumps(log_record))

        if not item_loader.get_output_value('price'):
            return Product()
        return item_loader.load_item()
コード例 #27
0
ファイル: jagtdirekt_dk.py プロジェクト: 0--key/lib
    def parse(self, response):
        """Yield products, sub-product requests, subcategories and next page.

        For every ``product-item`` row either the loaded item is yielded
        (when a quantity input is present) or a ``Request`` handled by
        ``self.parse_sub``.  Subcategory links are then followed one menu
        level deeper than ``response.meta['level']`` (default 1), and the
        next-page link is followed at the same level.
        """
        selector = HtmlXPathSelector(response)

        for row in selector.select(u'//tr[contains(@class,"product-item")]'):
            item_loader = ProductLoader(item=Product(), selector=row)

            item_loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')

            # Remove every '.' and then turn ',' into '.'.
            raw_price = row.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
            item_loader.add_value('price', raw_price.strip().replace('.', '').replace(',', '.'))

            href = row.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
            product_url = urljoin_rfc(get_base_url(response), href)
            item_loader.add_value('url', product_url)

            # If quantity field is not present on page, there are subproducts
            if row.select(u'.//input[@name="products_qty"]').extract():
                yield item_loader.load_item()
            else:
                yield Request(product_url, callback=self.parse_sub)

        depth = response.meta.get('level', 1)
        # One "ul/li" step per menu level already descended.
        nesting = u'/'.join([u'ul/li'] * depth)
        subcategory_xpath = u'//div[@class="box-content"]/' + nesting + '/a/@href'

        for subcategory_href in selector.select(subcategory_xpath).extract():
            subcategory_url = urljoin_rfc(get_base_url(response), subcategory_href)
            yield Request(subcategory_url, meta={'level': depth+1})

        next_hrefs = selector.select(u'//li[@class="page-next"]/a/@href').extract()
        if next_hrefs:
            next_page = urljoin_rfc(get_base_url(response), next_hrefs[0])
            yield Request(next_page, meta={'level': depth})
コード例 #28
0
ファイル: scentiments_spider.py プロジェクト: 0--key/lib
    def parse_designer(self, response):
        """Yield a ``Product`` for every table row that contains a link.

        The SKU is the ``id=<digits>`` value taken from the row's product
        link, which is also used (made absolute) as the URL.
        """
        base_url = get_base_url(response)
        selector = HtmlXPathSelector(response)

        for row in selector.select('//table//tr[descendant::a]'):
            item_loader = ProductLoader(item=Product(), selector=row)

            product_link = row.select('.//td[@valign="Middle"]/a[contains(@href,"Product")]/@href')[0]
            item_loader.add_value('url', urljoin_rfc(base_url, product_link.extract()))

            for field, xpath in (('name', './/td[@valign="Middle"]/a/span/text()'),
                                 ('price', './/td/p/b/text()')):
                item_loader.add_xpath(field, xpath)

            sku_matches = product_link.re('id=(\d+)')
            item_loader.add_value('sku', sku_matches[0])
            yield item_loader.load_item()
コード例 #29
0
ファイル: cupargardencentre.py プロジェクト: 0--key/lib
 def parse_cat(self, response):
     """Yield a ``Product`` for every ``product-info`` block on the page.

     Each product URL is the block's ``product-title`` link resolved against
     the page's base URL.
     """
     base_url = get_base_url(response)
     selector = HtmlXPathSelector(response)

     for block in selector.select('//div[@class="product-info"]'):
         href = block.select('.//a[@class="product-title"]/@href').extract()[0]

         item_loader = ProductLoader(item=Product(), selector=block)
         item_loader.add_xpath('name', './/a[@class="product-title"]/text()')
         item_loader.add_value('url', urljoin_rfc(base_url, href))
         for field, xpath in (('price', './/span[@class="price"]/span[@id]/text()'),
                              ('sku', './/p[@class="sku"]//span[contains(@id,"product_code")]/text()')):
             item_loader.add_xpath(field, xpath)
         yield item_loader.load_item()
コード例 #30
0
ファイル: pixmania_tec7.py プロジェクト: 0--key/lib
    def parse_products(self, response):
        """Yield a ``Product`` for each result row that has all three fields.

        A row missing its name, URL or price is reported through
        ``logging.error`` and skipped.
        """
        selector = HtmlXPathSelector(response)
        rows = selector.select("//div[@class='box-caracteristic-search']/div[@class='table-wrap']/form/table/tbody/tr")
        for row in rows:
            names = row.select("td[@class='prd-details']/h3/a/text()").extract()
            if not names:
                logging.error("ERROR! No name! %s" % response.url)
                continue
            name = names[0]

            hrefs = row.select("td[@class='prd-details']/h3/a/@href").extract()
            if not hrefs:
                logging.error("ERROR! NOT FOUND URL! URL: %s. NAME: %s" % (response.url, name))
                continue
            product_url = self._urljoin(response, hrefs[0])

            prices = row.select("td[@class='prd-amount-details']/div/p[@class='prd-amount']/strong/text()").extract()
            if not prices:
                logging.error("ERROR! NOT FOUND PRICE! URL: %s. NAME: %s" % (response.url, name))
                continue

            item_loader = ProductLoader(item=Product(), response=response)
            for field, value in (('name', name),
                                 ('url', product_url),
                                 ('price', prices[0])):
                item_loader.add_value(field, value)
            yield item_loader.load_item()
コード例 #31
0
 def parse_product(self, response):
     """Yield one ``Product`` built from the product page.

     The fifth '/'-separated piece of the URL is used both as identifier
     and as SKU.  A shipping cost of 2.5 is added when the extracted price
     is below 50.  Categories are the breadcrumb texts without the first
     and the last entry.
     """
     base_url = get_base_url(response)
     loader = ProductLoader(response=response, item=Product())
     url = response.url
     loader.add_value('url', urljoin_rfc(base_url, url))
     identifier = url.split('/')[4]
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     image_url = response.xpath(
         '//div[@class="prodImg-main"]//img[@class="prodImg"]/@src'
     ).extract_first()
     loader.add_value('image_url', image_url)
     name = response.xpath('//h1[@itemprop="name"]/text()').extract()[0]
     loader.add_value('name', name)
     price = extract_price(
         response.xpath('//meta[@itemprop="price"]/@content').extract()[0])
     loader.add_value('price', price)
     if price < 50:
         loader.add_value('shipping_cost', 2.5)
     # NOTE(review): the original also read response.meta['categories'] and
     # immediately overwrote it; that dead store is removed.  Confirm the
     # breadcrumbs are the intended source.
     categories = response.css('ul.breadcrumbs span::text').extract()[1:-1]
     loader.add_value('category', categories)
     yield loader.load_item()
コード例 #32
0
ファイル: walmart_ca.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Load a product from the embedded variant data and request reviews.

        The product fields come from the first entry of the
        ``walPP.variantDataRawArr`` JSON embedded in a script tag; when that
        cannot be read a warning is appended to ``self.errors`` and nothing
        is yielded.  When the data holds no ``price_store_price`` the page
        is rendered in a PhantomJS browser and the price is scraped from the
        rendered HTML.  The brand is read from ``response.meta['brand']``.

        Yields the product itself when it has no SKU, otherwise a reviews
        ``Request`` (callback ``self.parse_reviews``) carrying the product
        in ``meta['product']``.
        """
        hxs = HtmlXPathSelector(response)

        try:
            product_data = json.loads(
                hxs.select(
                    '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
                ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
        except Exception:
            self.errors.append('WARNING: No product data in %s' % response.url)
            return

        price = product_data.get(u'price_store_price', None)
        if not price:
            browser = PhantomJS.create_browser()
            try:
                self.log('>>> BROWSER: GET => %s' % response.url)
                browser.get(response.url)
                self.log('>>> BROWSER: OK')
                # Fixed pause before the page source is read.
                time.sleep(5)

                hxs = HtmlXPathSelector(text=browser.page_source)
            finally:
                # Always release the browser, even when loading fails.
                browser.quit()

            # Monitor all products even without a price (as requested in #248)
            price = '.'.join(
                hxs.select(
                    '//div[@id="pricing"]/div[@class="price-main"]//text()').
                re(r'(\d+)')).strip()
            if not price:
                price_elem = hxs.select(
                    '//span[@id="store-price"][1]/text()').extract()
                if price_elem:
                    price = price_elem[0]
            if not price:
                store_prices = hxs.select(
                    '//div[contains(@id, "store-")]//div[@class="price"]//text()'
                ).extract()
                try:
                    # Digit groups of the first three text nodes, dot-joined.
                    price = '.'.join(
                        re.findall(r'(\d+)', '.'.join(store_prices[:3])))
                except Exception:
                    price = '0.00'
        else:
            price = price[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('category', product_data[u'Category'])
        product_loader.add_value('name', product_data[u'prod_name_en'])
        product_loader.add_value('sku', product_data[u'P_RollupKey'])
        product_loader.add_value('price', price.replace(',', ''))
        product_loader.add_value('identifier', product_data[u'P_UniqueKey'])

        product_loader.add_value('url', response.url)
        product_loader.add_value('brand',
                                 response.meta['brand'].strip().lower())
        product = product_loader.load_item()

        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand']
        metadata['reviews'] = []
        product['metadata'] = metadata
        response.meta['product'] = product

        # the same as canadiantire.ca
        # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
        # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
        # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
        try:
            part2 = product['sku']
        except Exception:
            part2 = None

        if not part2:
            # Without a SKU no reviews URL can be built: emit the item as is.
            self.errors.append('WARNING: No sku in %s' % response.url)
            yield product
            return

        reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=e6wzzmz844l2kk3v6v7igfl6i&apiversion=5.4&displaycode=2036-en_ca&resource.q2=reviews&filter.q2=isratingsonly%3Aeq%3Afalse&filter.q2=productid%3Aeq%3A' + part2
        yield Request(reviews_url,
                      meta=response.meta,
                      callback=self.parse_reviews)
コード例 #33
0
ファイル: northsea.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Parse a product page and schedule add-to-cart requests for it.

        If the selected option of the ``customerTaxType`` dropdown does not
        match ``Excl``, the page is re-requested through the URL of the
        non-selected option and nothing else is done for this response.

        Otherwise a base product is loaded (url, category, name, image,
        brand, identifier, sku, price) and one ``FormRequest`` per variant is
        yielded to ``self.parse_stock``.  Variants are the cartesian product
        of the usable options of every ``<select>`` in the attributes block;
        when there are no such selects a single request is sent for the base
        product.  Each request uses its own cookiejar number.

        :param response: product page response; ``response.meta['category']``
            must be set.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # NOTE: this guard was tab-indented in an otherwise space-indented
        # body; it is normalised to spaces so the method also parses under
        # interpreters that reject mixed indentation.
        if not hxs.select('//select[@id="customerTaxType"]/option[@selected="selected"]').re('Excl'):
            url = hxs.select('//select[@id="customerTaxType"]/option[not (@selected)]/@value').extract()
            yield Request(urljoin(base_url, url[0]), callback=self.parse_product, dont_filter=True, meta=response.meta)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('category', response.meta['category'])
        name = ''
        tmp = hxs.select('//h1[@itemprop="name"]/text()').extract()
        if tmp:
            name = tmp[0].strip()
            loader.add_value('name', name)
        tmp = hxs.select('//div[@class="gallery"]//a[1]/@href').extract()
        if tmp:
            loader.add_value('image_url', tmp[0])
        # Brand is the first known brand that occurs (case-insensitively)
        # anywhere in the product name.
        lowered_name = name.lower()
        for brand in self.brands:
            if brand.lower() in lowered_name:
                loader.add_value('brand', brand)
                break
        tmp = hxs.select('//input[contains(@id,"add-to-cart-button-")]/@data-productid').extract()
        if tmp:
            loader.add_value('identifier', tmp[0])
        tmp = hxs.select('//p/span[strong="Product Code:"]/text()').extract()
        if tmp:
            loader.add_value('sku', tmp[0].strip())
        tmp = hxs.select('//span[@itemprop="price"]/text()').extract()
        if tmp:
            price = extract_price(tmp[0].strip().split()[0])
            loader.add_value('price', price)

        product = loader.load_item()
        url_post = 'http://www.northseaworkwear.com/addproducttocart/details/%s/1' % product['identifier']
        # The quantity field is always keyed by the *base* product identifier,
        # even for variants whose item identifier gets a suffix.
        qty_key = 'addtocart_%s.EnteredQuantity' % product['identifier']
        qty = '1'
        tmp = hxs.select('//input[contains(@class,"qty-input")]/@value').extract()
        if tmp:
            qty = tmp[0]

        selections = hxs.select('//div[@class="attributes"]//select')
        if not selections:
            self.cookie_num += 1
            yield FormRequest(url_post, formdata={qty_key: qty}, meta={'item':product, 'cookiejar':self.cookie_num}, dont_filter=True, callback=self.parse_stock)
            return

        # One list per <select>, each holding (field name, option value,
        # option label) tuples; options with value '' or '0' are skipped.
        attrs = []
        for sel in selections:
            attr_name = ''
            tmp = sel.select('@name').extract()
            if tmp:
                attr_name = tmp[0]
            attr_values = []
            for option in sel.select('option'):
                value = ''
                tmp = option.select('@value').extract()
                if tmp:
                    value = tmp[0]
                txt = ''
                tmp = option.select('text()').extract()
                if tmp:
                    txt = tmp[0].strip()
                if value != '' and value != '0':
                    attr_values.append((attr_name, value, txt))
            attrs.append(attr_values)

        for option in itertools.product(*attrs):
            item = copy.deepcopy(product)
            item['name'] += ' - ' + '-'.join(attr[2] for attr in option)
            item['identifier'] += '-' + '-'.join(attr[1] for attr in option)
            formdata = {qty_key: qty}
            for attr in option:
                formdata[attr[0]] = attr[1]
            self.cookie_num += 1
            yield FormRequest(url_post, formdata=formdata, meta={'item':item, 'cookiejar':self.cookie_num}, dont_filter=True, callback=self.parse_stock)
コード例 #34
0
ファイル: rhinocamera.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield one Product item scraped from a product detail page.

        The brand is the first brand link text on the page that the loaded
        product name starts with (case-insensitive), or ``''`` if none does.
        The ``ProductID`` query parameter of the URL is used as both sku and
        identifier.

        :param response: product page response; ``response.meta['category']``
            must be set.
        """
        hxs = HtmlXPathSelector(response)

        brand_texts = hxs.select(
            '//div[p[contains(span/text(), "Via m")]]/ul/li/a/text()'
        ).extract()
        known_brands = set(brand_texts)

        loader = ProductLoader(item=Product(), response=response)

        price_text = hxs.select(
            '//p[contains(@class, "final-price")]/span[@class="bold"]/text()'
        ).extract()[0]
        loader.add_value('price', extract_price(price_text))
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/span/text()')
        loader.add_value('category', response.meta['category'])

        # First brand the product name starts with; '' when nothing matches.
        matched_brand = next(
            (candidate for candidate in known_brands
             if loader.get_output_value('name').upper().startswith(
                 candidate.upper())),
            '')
        loader.add_value('brand', matched_brand)

        product_id = url_query_parameter(response.url, "ProductID")
        loader.add_value('sku', product_id)
        loader.add_value('identifier', product_id)

        image_srcs = hxs.select('//a[@id="Zoomer"]//img/@src').extract()
        if image_srcs:
            image = urlparse.urljoin(get_base_url(response), image_srcs[0])
        else:
            image = ''
        loader.add_value('image_url', image)

        yield loader.load_item()
コード例 #35
0
    def parse_product(self, response):
        """Yield one Product item per variant of a JSON product document.

        The response body is parsed as JSON.  Category, brand and title are
        shared by all variants; each entry of ``variants`` supplies its own
        id (used as identifier and sku), image, title suffix, price and stock.

        :param response: JSON response; ``response.meta['url']`` is stored as
            the product URL.
        """
        data = json.loads(response.body)

        page_url = response.meta['url']
        category = data['category']
        brand = data['brand']
        title = data['title']

        # Only the variant payloads are needed; the mapping keys are unused.
        for variant in data['variants'].values():
            fields = [
                ('identifier', variant['id']),
                ('sku', variant['id']),
                ('image_url', variant['imageUrl']),
                ('name', title + ' ' + variant['title']),
                ('url', page_url),
                ('category', category),
                ('brand', brand),
                ('price', variant['salesPrice']),
                ('stock', variant['stock']),
            ]
            variant_loader = ProductLoader(item=Product(), response=response)
            for field_name, field_value in fields:
                variant_loader.add_value(field_name, field_value)
            yield variant_loader.load_item()
コード例 #36
0
    def parse_product(self, response):
        """Yield Product items for a product page, one per option if any.

        The special price is preferred; when the loader has no price after
        that, the text of every ``span.price`` on the page is used instead.
        Stock is always set to 1.  When ``<a data-productid>`` option links
        exist, one copy of the product is yielded per link with the link
        title appended to the name; otherwise a single product is yielded
        using the page's ``data-product-id`` attribute as identifier.

        :param response: product page response; ``meta['category']`` is
            optional.
        """
        hxs = HtmlXPathSelector(response)

        def price_from(xpath):
            # Concatenate all matching text nodes and parse them as an
            # EU-formatted price.
            return extract_price_eu(''.join(hxs.select(xpath).extract()))

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')
        loader.add_xpath(
            'sku',
            'normalize-space(substring-after(//div[@class="sku"]/text(),":"))')
        loader.add_value('category', response.meta.get('category'))
        loader.add_value(
            'price',
            price_from(
                '//p[@class="special-price"]//span[@class="price"]/text()'))
        if not loader.get_output_value('price'):
            loader.add_value('price', price_from('//span[@class="price"]/text()'))
        loader.add_value('stock', 1)

        image_srcs = hxs.select(
            '//div[@class="product-image-gallery"]//img/@src').extract()
        if image_srcs:
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), image_srcs[0]))

        base_item = loader.load_item()
        option_links = hxs.select('//a[@data-productid]')

        if not option_links:
            base_item['identifier'] = hxs.select(
                '//*[@data-product-id]/@data-product-id').extract()[0]
            yield base_item
            return

        for link in option_links:
            variant = Product(base_item)
            variant['name'] += ' ' + link.select('./@title').extract()[0]
            variant['identifier'] = link.select(
                './@data-productid').extract()[0]
            yield variant
コード例 #37
0
 def parse(self, response):
     """Yield one Product item for every row of a CSV feed.

     The response body is read with ``csv.DictReader``.  Category, product
     name and brand are decoded as ISO-8859-1; the other columns are passed
     through unchanged.  Stock is set to 0 only when the
     ``Stock availability`` column is not ``IN STOCK`` (case-insensitive).
     """
     def latin1(raw):
         # Decode a byte-string column as ISO-8859-1.
         return unicode(raw.decode('ISO-8859-1'))

     for record in csv.DictReader(StringIO(response.body)):
         item_loader = ProductLoader(response=response, item=Product())
         item_loader.add_value('identifier', record["Unique Product ID"])
         item_loader.add_value('sku', record["Product code"])
         item_loader.add_value('category', latin1(record["Category"]))
         item_loader.add_value('name', latin1(record["Product name"]))
         item_loader.add_value('price', record["Price"])
         item_loader.add_value('url', record["Product page URL"])
         item_loader.add_value('brand', latin1(record["Brand"]))
         item_loader.add_value('image_url', record['Image URL'])
         if record['Stock availability'].upper() != 'IN STOCK':
             item_loader.add_value('stock', 0)
         yield item_loader.load_item()
コード例 #38
0
    def parse_product(self, response):
        """Yield Product items for a product page, one per colour swatch.

        When the page embeds a ``new Product.OptionsPrice({...})`` call, its
        JSON argument supplies the product id and price; otherwise the hidden
        ``product`` input and the first ``span.price`` are used.  A missing
        price yields price ``'0.0'`` with stock 0.

        Items are de-duplicated through ``self.id_seen``: an identifier that
        was already emitted is logged and skipped.  With colour swatches
        present, each swatch yields a copy of the product with a suffixed
        identifier and, when a tooltip description exists, an extended name.

        :param response: product page response.
        """
        json_data = None
        if 'new Product.OptionsPrice(' in response.body:
            raw = response.body.split('new Product.OptionsPrice(',
                                      1)[1].split(');', 1)[0]
            json_data = json.loads(raw)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        sku = response.xpath('//input[@name="product"]/@value').extract()
        if sku:
            sku = sku[0]
        # The id from the embedded JSON takes precedence over the form input.
        if json_data and json_data.get('productId', None):
            sku = json_data['productId']

        if not sku:
            self.log('WARNING: No product ID => %s' % response.url)
            return

        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

        if json_data:
            price = str(json_data.get('productPrice', ''))
        else:
            # Previously ``extract()[0]``, which raised IndexError on pages
            # without a price and made the out-of-stock branch below
            # unreachable for this code path.
            price_texts = response.xpath(
                '//span[@class="price"]/text()').extract()
            price = price_texts[0] if price_texts else ''

        if price:
            loader.add_value('price', price)
            loader.add_value('stock', 1)
        else:
            loader.add_value('price', '0.0')
            loader.add_value('stock', 0)

        image_url = response.xpath('//img[@id="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))
        brand = response.xpath(
            '//div[@class="product-name"]/h2/a/text()').extract()
        if brand:
            loader.add_value('brand', brand[0])

        categories = response.xpath(
            '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
        if len(categories) > 1:
            # The first breadcrumb entry is dropped.
            loader.add_value('category', categories[1:])

        product = loader.load_item()

        def should_emit(candidate):
            """Record *candidate*'s identifier; return False (and log) if it
            is missing or was already seen."""
            if not candidate.get('identifier', None):
                self.log('WARNING: No product ID => %s' % response.url)
                return False
            if candidate['identifier'] in self.id_seen:
                self.log('WARNING: Duplicate product ID => %s' %
                         response.url)
                return False
            self.id_seen.append(candidate['identifier'])
            return True

        def stripped_texts(selector, xpath):
            """Return the non-empty, whitespace-stripped text nodes at *xpath*."""
            texts = [text.strip() for text in selector.xpath(xpath).extract()]
            return [text for text in texts if text != '']

        options = response.xpath('//ul[@id="color-swatch-attribute-92"]/li')
        if not options:
            if should_emit(product):
                yield product
            return

        for sel in options:
            item = Product(product)
            opt_id = sel.xpath('@class').extract()
            if opt_id:
                # Suffix is the last dash-separated part of the first CSS
                # class of the swatch element.
                item['identifier'] += '-' + opt_id[0].split()[0].split('-')[-1]
            opt_desc = stripped_texts(
                sel, 'div[@class="tool-tip-description"]/text()')
            if not opt_desc:
                opt_desc = stripped_texts(
                    sel, 'div[@class="tool-tip-description"]/strong/text()')
            if opt_desc:
                item['name'] = product['name'] + ' - ' + ''.join(opt_desc)

            if should_emit(item):
                yield item
コード例 #39
0
    def parse(self, response):
        """Crawl a listing page: follow categories, products and pagination.

        Yields requests (handled by the default callback) for every category
        and sub-category link, one request per listed product to
        ``self.parse_product`` carrying the partially loaded item in
        ``meta['item']``, and a request for the next listing page if there
        is one.

        :param response: category / listing page response.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        categories = hxs.select('//div[@id="listaSezioni"]/div/a/@href').extract()
        for category in categories:
            yield Request(urljoin_rfc(base_url, category))

        sub_categories = hxs.select('//div[@class="contentGruppi"]/div/div[@class="nome"]/a/@href').extract()
        for sub_category in sub_categories:
            yield Request(urljoin_rfc(base_url, sub_category))

        products = hxs.select('//div[@class="articolo"]')
        if products:
            # The category heading is page-wide, so it is looked up once
            # instead of once per product.
            page_category = hxs.select('//div[@class="gruppo"]/text()').extract()[0].strip()
            for product in products:
                l = ProductLoader(item=Product(), selector=product)
                url = product.select('.//h2/a/@href').extract()
                url = urljoin_rfc(base_url, url[0])
                l.add_value('url', url)
                # Identifier is the numeric id embedded in the product URL.
                l.add_value('identifier', re.search(r'art/(\d+)_', url).group(1))
                l.add_xpath('sku', 'p[@class="codfor"]/strong/text()')
                l.add_xpath('brand', 'p[@class="marca"]/img/@alt')
                image_url = product.select('div[@class="img"]/a/img/@src').extract()
                image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
                l.add_value('image_url', image_url)
                l.add_value('category', page_category)
                price = product.select('p[@class="prezzo"]/text()').extract()
                price = extract_price_eu(price[-1]) if price else 0
                l.add_value('price', price)
                if price <= 0:
                    l.add_value('stock', 0)
                item = l.load_item()
                yield Request(item['url'], callback=self.parse_product, meta={'item': item})

        # Select the link target: extracting the bare <a> element returned
        # its serialized HTML, which is not a usable URL.
        next_page = hxs.select('//a[@class="next"]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[-1]))
コード例 #40
0
    def parse_product(self, response):
        """Parse a product page, retrying the request once if data is missing.

        A page without a product name, without any price, or whose loaded
        item lacks sku or name is re-requested exactly once (tracked through
        ``meta['retried']``); processing of the current response stops as
        soon as a retry is scheduled.  On the retried response the item is
        yielded with whatever data is available, with stock set to 0 when it
        has no price.

        :param response: product page response.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def retry_request():
            """Return a one-off retry request, or None if already retried."""
            if response.meta.get('retried', False):
                return None
            return Request(response.url, dont_filter=True, meta={'retried': True}, callback=self.parse_product)

        product_name = hxs.select('//div[@class="boxbody"]/h1/text()[normalize-space()]').extract()
        if not product_name:
            retry = retry_request()
            if retry is not None:
                # Stop here: continuing used to schedule further duplicate
                # retries for the same page.
                yield retry
                return

        product_price = hxs.select('//div[@class="price"]/ins/b/text()').extract()
        product_price = product_price[0] if product_price else None

        if not product_price:
            # Fall back to the "Price=..." value found in the raw page body.
            price_match = re.search('Price=(.*)', response.body)
            if price_match:
                product_price = price_match.group(1).replace('.', '')
            else:
                product_price = None
                retry = retry_request()
                if retry is not None:
                    yield retry
                    return

        image_url = hxs.select('//a[@class="img"]/@href').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="boxbody"]/h1/text()[normalize-space()]')
        loader.add_value('url', response.url)
        loader.add_xpath('sku', '//*', re=r'ProductNo=(.*)')
        loader.add_xpath('identifier', '//*', re=r'ProductID=(.*)')
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_xpath('category', '//li[@class="current"]/a/text()', lambda e: e[0] if e else '')
        if product_price is not None:
            # Prices use '.' as thousands separator and ',' as decimal mark.
            # The None guard avoids the AttributeError previously raised when
            # no price could be found on an already-retried page.
            product_price = extract_price(product_price.replace('.', '').replace(',', '.'))
            loader.add_value('price', product_price)
        loader.add_xpath('brand', '//*', lambda e: e[0] if e else '', re=r'Trademark=(.*)')

        item = loader.load_item()

        if not item.get('sku') or not item.get('name'):
            retry = retry_request()
            if retry is not None:
                yield retry
                return

        if not item.get('price'):
            item['stock'] = 0

        yield item
コード例 #41
0
    def parse_product(self, response):
        """Yield one Product item for a product page.

        The name is the ``h2[itemprop=name]`` text with the first
        variant/colour text appended when present.  The sku comes from the
        ``SKU`` entry of ``response.meta['row']``.

        :param response: product page response with ``meta['row']`` set.
        """
        row = response.meta['row']

        name_parts = [
            response.xpath(
                '//h2[@itemprop="name"]/text()').extract()[0].strip()
        ]
        colour_texts = response.xpath(
            '//p[@class="common-option variant-ctrl"]/text()').extract()
        if colour_texts:
            name_parts.append(colour_texts[0].strip())
        full_name = ' '.join(name_parts)

        image_srcs = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            # The src has no scheme, so 'http:' is prepended.
            image = 'http:' + image_srcs[0]
        else:
            image = ''

        price_text = ''.join(
            response.xpath(
                '//div[contains(@class, "product-price")]/span[contains(@class, "current")]//text()'
            ).extract())
        if price_text:
            price_value = extract_price(price_text)
        else:
            price_value = ''

        loader = ProductLoader(response=response, item=Product())
        loader.add_xpath('identifier', '//div[@id="pid"]/@data-product-id')
        loader.add_value('sku', row['SKU'])
        loader.add_value('url', response.url)
        loader.add_value('image_url', image)
        loader.add_xpath('brand', '//h2[@itemprop="brand"]/a/text()')
        breadcrumb = response.xpath(
            '//ul[@id="breadcrumbs"]//span/text()').extract()
        loader.add_value('category', breadcrumb)
        loader.add_value('name', full_name)
        loader.add_value('price', price_value)
        yield loader.load_item()
コード例 #42
0
ファイル: toysrus.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Load a product and request the first page of its reviews.

        Non-HTML responses and pages without any price are skipped.  Missing
        name, image or sku are only logged.  The loaded product (with a
        ``ToyMonitorMeta`` holding an empty review list and, if found, the
        promotion text) is passed in ``meta`` to ``self.parse_review``
        together with a reviews URL template for further pages.

        :param response: product page response; ``meta['category']`` is
            optional.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        titles = hxs.select('//div[@class="product-title"]/h1/text()').extract()
        if titles:
            loader.add_value('name', titles[0].strip())
        else:
            self.log('ERROR: no product NAME found! URL:{}'.format(
                response.url))

        prod_id = hxs.select('//input[@id="productId"]/@value').extract()[0]
        loader.add_value('identifier', prod_id)
        loader.add_value('url', response.url)

        prices = hxs.select(
            '//div[@class="price clearfix"]/div[@class="floatleft block"]/span/text()'
        ).extract()
        if not prices:
            # Fall back to the base price embedded in an inline script.
            prices = hxs.select(
                '//script[contains(text(), "product_base_price")]').re(
                    'product_base_price:\["(.*)"\]')
        if not prices:
            self.log('ERROR: no product PRICE found! URL:{}'.format(
                response.url))
            return
        loader.add_value('price', prices[0])

        image_srcs = hxs.select('//a[@id="mainImage"]/img/@src').extract()
        if image_srcs:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_srcs[0].strip()))
        else:
            self.log('ERROR: no product Image found!')

        loader.add_value('category', response.meta.get('category', ''))

        sku_values = hxs.select('//input[@name="skuId"]/@value').extract()
        if sku_values:
            loader.add_value('sku', sku_values[0].strip())
        else:
            self.log('ERROR: no SKU found! URL:{}'.format(response.url))

        brand_match = re.search('product_brand:\[\"(.*)\"\],', response.body)
        if brand_match:
            loader.add_value('brand', brand_match.group(1).strip())

        # Special-offer text first, "was" price text as the fallback.
        promo_texts = response.xpath(
            '//div[contains(@class,"pdp_add-cart")]/div[@class="truuk-offer-box"]'
            '//span[@class="truuk-special-offer-body"]/text()'
        ).extract() or response.xpath(
            '//div[contains(@class,"pdp_add-cart")]//span[@class="was-2 block"]/text()'
        ).extract()

        product = loader.load_item()

        first_reviews_url = u'http://www.toysrus.co.uk/assets/pwr/content/%s/%s-en_GB-1-reviews.js' % (
            self.calculate_url(prod_id), prod_id)

        review_meta = ToyMonitorMeta()
        review_meta['reviews'] = []
        if promo_texts:
            review_meta['promotions'] = promo_texts[0]
        product['metadata'] = review_meta

        # Template with a remaining %s placeholder for the page number.
        reviews_url_template = (
            u'http://www.toysrus.co.uk/assets/pwr/content/' + u'%s/%s' %
            (self.calculate_url(prod_id), prod_id) + u'-en_GB-%s-reviews.js')

        request_meta = {
            'dont_retry': True,
            'handle_httpstatus_list': [404, 302],
            'cur_page': 1,
            'product': product,
            'dont_redirect': True,
            'reviews_url': reviews_url_template,
        }
        yield Request(first_reviews_url, meta=request_meta,
                      callback=self.parse_review)
コード例 #43
0
    def parse_product(self, response):
        """Yield Product items for a product page, one per variant if any.

        Pages without a sku are skipped.  When a "before" price is shown and
        is higher than the current price, the current price is also stored as
        ``metadata['SalesPrice']``.  If ``div.WrapVar`` variant blocks exist,
        each one yields a deep copy of the base item with a suffixed
        identifier and, when available, its own price, name suffix, image and
        stock level (taken from the ``rubrikartikel`` JSON embedded in the
        page); otherwise the base item is yielded.

        :param response: product page response.
        """
        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), response=response)
        # Guard the lookup: indexing the empty result used to raise
        # IndexError before the "no sku" check below could run.
        sku_texts = response.xpath(
            '//div[@id="productInfo"]//dt[@id="About"]/i/text()').extract()
        sku = sku_texts[-1].strip() if sku_texts else ''
        if not sku:
            return
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)

        brand = response.xpath('//span[@id="varum"]/text()').extract_first()
        if not brand:
            brand = response.xpath('//span[@class="brand"]/text()').extract()
        loader.add_value('brand', brand)

        name = response.xpath('//b[@itemprop="name"]/text()').extract_first()
        loader.add_value('name', name)
        loader.add_value('url', response.url)

        price = response.xpath(
            '//span[@id="PrisFalt"]/meta[@itemprop="price"]/@content'
        ).extract_first()
        price_before = response.css(
            '.price-rek span#rekPris::text').extract_first()
        # ``price`` may be None when the price meta tag is missing; only
        # compare when both values are present.
        if price_before and price and Decimal(price_before) > Decimal(price):
            sales_price = price
        else:
            sales_price = None
        loader.add_value('price', price)

        image_url = response.css('img#produktbild::attr(src)').extract_first()
        if not image_url:
            image_url = response.xpath(
                '//div[@class="product-image"]/img/@src').extract_first()
        image_url = response.urljoin(image_url) if image_url else ''
        loader.add_value('image_url', image_url)

        # Keep at most the last three breadcrumb entries.
        categories = response.css('span.breadcrumb a::text').extract()[-3:]
        loader.add_value('category', categories)

        out_stock = response.xpath(
            u'//div[@class="artikel_i_lager"]//span[contains(text(), "Slutt på lager")]'
        )
        if out_stock:
            loader.add_value('stock', 0)
        item = loader.load_item()
        if sales_price:
            item['metadata'] = {'SalesPrice': extract_price(sales_price)}

        options = response.css('div.WrapVar')
        if not options:
            yield item
            return

        if sales_price:
            self.logger.warning('Sales price and options on the %s' %
                                response.url)

        # The embedded stock JSON is the same for every variant, so it is
        # parsed once instead of once per option.
        stock_data = re.findall('var rubrikartikel = (.*);', response.body)
        variants = json.loads(stock_data[0])['varianter'] if stock_data else []

        for option in options:
            option_item = deepcopy(item)
            identifier = option.xpath('.//@id').re('VarList(.*)')[0]
            option_item['identifier'] += '-' + identifier
            option_price = option.css('div.PT_Pris::text').extract()
            if option_price:
                option_item['price'] = extract_price(option_price[0])
            option_name = option.xpath('@variant-name').extract_first()
            if option_name:
                option_item['name'] += ' ' + option_name
            option_image = response.xpath('//img[contains(@src, "' +
                                          identifier + '")]/@src').extract()
            if option_image:
                # Only the part after 'img=' is joined onto the base URL.
                option_item['image_url'] = urljoin_rfc(
                    base_url, option_image[0].split('img=')[-1])

            for stock in variants:
                if stock['artnr'] == identifier:
                    option_item['stock'] = stock['saldo']
                    break
            yield option_item
コード例 #44
0
ファイル: bedworld_net.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Parse a product page into Product items or follow-up requests.

        Every size link on the page is re-requested with this same callback.
        Then, depending on the page:

        * If a ``Product.Config`` JSON blob (``spConfig``) is embedded, one
          product is built per entry of its ``childProducts``.  Entries with
          a non-empty key are sent on to ``self.parse_options`` through an
          AJAX URL; entries with an empty key are yielded directly.
        * Otherwise a single base product is built and, if custom option
          dropdowns exist, one item per combination of dropdown options is
          yielded with the option prices added to the base price.

        Directly yielded items carry a ``Meta`` object whose ``net_price`` is
        the price divided by 1.2 (presumably removing 20% VAT -- TODO
        confirm).

        :param response: product page response.
        """
        hxs = HtmlXPathSelector(response)

        # Each size box links to a separate product page; crawl those too.
        for url in hxs.select(
                '//a[contains(@class,"size-boxes")]/@href').extract():
            yield Request(urljoin_rfc(get_base_url(response), url),
                          callback=self.parse_product)

        # Category is the text of the last breadcrumb link.
        product_category = hxs.select(
            '//div[contains(@class,"breadcrumbs")]/ul/li/a/text()').extract(
            )[-1].strip()
        product_name = hxs.select('//h1/text()').extract()[0]

        # NOTE: product_image stays an empty list when no zoom link exists.
        product_image = hxs.select('//a[@id="zoom-btn"]/@href').extract()
        if product_image:
            product_image = urljoin_rfc(get_base_url(response),
                                        product_image[0])

        product_brand = hxs.select('//img[@class="man-logo"]/@alt').extract()
        product_brand = product_brand[0] if product_brand else ''

        product_sku = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()
        product_sku = product_sku[0] if product_sku else ''

        # Configurable products embed their option data as a JSON literal.
        product_config_reg = re.search(
            'var spConfig = new Product.Config\((\{.*\})\);', response.body)
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0]

        if product_config_reg:
            products = json.loads(product_config_reg.group(1))
            for identifier, product in products['childProducts'].items():
                product_loader = ProductLoader(item=Product(),
                                               response=response)
                if identifier:
                    product_loader.add_value(
                        'identifier', product_identifier + '-' + identifier)
                else:
                    product_loader.add_value('identifier', product_sku)
                product_loader.add_value('price', product[u'finalPrice'])
                # Append the label of every attribute option that lists this
                # child product.
                option_name = product_name
                for attr_id, attribute in products[u'attributes'].items():
                    for option in attribute['options']:
                        if identifier in option['products']:
                            option_name += ' ' + option['label']
                # Parenthesised fragments are stripped from the name.
                product_loader.add_value(
                    'name', re.sub(r' \((.+?)\)', r'', option_name))
                product_loader.add_value('sku', product_sku)
                product_loader.add_value('url', response.url)
                product_loader.add_value('brand', product_brand)
                product_loader.add_value('category', product_category)
                product_loader.add_value('image_url', product_image)

                if identifier:
                    # Hand the loaded item to parse_options via the AJAX
                    # endpoint for this child product.
                    yield Request('http://www.bedworld.net/oi/ajax/co/?id=' +
                                  identifier + '&pid=' + product_identifier,
                                  meta={'item': product_loader.load_item()},
                                  callback=self.parse_options)
                else:
                    price = product_loader.get_output_value('price')
                    net_price = price / Decimal('1.2')

                    p = product_loader.load_item()
                    meta_ = Meta()
                    meta_['net_price'] = str(net_price)
                    p['metadata'] = meta_

                    yield p
        else:
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name',
                                     re.sub(r' \((.+?)\)', r'', product_name))
            product_loader.add_value('sku', product_sku)
            product_loader.add_value('brand', product_brand)
            product_loader.add_value('identifier', product_identifier)
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', product_category)
            product_loader.add_value('image_url', product_image)
            price = hxs.select('//span[@id="product-price-' +
                               product_identifier +
                               '"]//text()').re(r'([\d.,]+)')
            price = price[0] if price else 0
            product_loader.add_value('price', price)

            # One list of option dicts per custom-option dropdown.
            option_elements = []
            dropdown_elements = hxs.select(
                '//select[contains(@class, "product-custom-options")]')
            for dropdown_options in dropdown_elements:
                options = []
                for dropdown_option in dropdown_options.select(
                        'option[@value!=""]'):
                    option = {}
                    option['identifier'] = dropdown_option.select(
                        '@value').extract()[0]
                    # Keep only the text before the first '+' of the label.
                    option['desc'] = dropdown_option.select(
                        './/text()').extract()[0].split('+')[0]
                    option['price'] = dropdown_option.select(
                        '@price').extract()[0]
                    options.append(option)
                option_elements.append(options)

            # Merge every combination of dropdown options into one record:
            # descriptions concatenated, prices summed, identifiers joined
            # with a leading '-'.
            final_options = []
            if option_elements:
                combined_options = list(itertools.product(*option_elements))
                for combined_option in combined_options:
                    final_option = {}
                    for option in combined_option:
                        final_option['desc'] = final_option.get(
                            'desc', '') + option['desc']
                        final_option['price'] = final_option.get(
                            'price', Decimal(0)) + extract_price(
                                option['price'])
                        final_option['identifier'] = final_option.get(
                            'identifier', '') + '-' + option['identifier']
                    final_options.append(final_option)

            if final_options:
                for opt in final_options:
                    # NOTE(review): load_item() is called on the same loader
                    # for every option; whether it returns a fresh item each
                    # time depends on ProductLoader -- confirm.
                    opt_product = product_loader.load_item()
                    opt_product['name'] += ' ' + normalize_space(opt['desc'])
                    opt_product['price'] += opt['price']
                    opt_product['identifier'] += opt['identifier']
                    price = Decimal(opt_product['price'])
                    net_price = price / Decimal('1.2')

                    meta_ = Meta()
                    meta_['net_price'] = str(net_price)
                    opt_product['metadata'] = meta_

                    yield opt_product
            else:
                price = product_loader.get_output_value('price')
                net_price = price / Decimal('1.2')

                p = product_loader.load_item()
                meta_ = Meta()
                meta_['net_price'] = str(net_price)
                p['metadata'] = meta_

                yield p
コード例 #45
0
    def parse_product(response):
        """Yield one Product item per size/colour option of a product page.

        Name, base identifier, sku, EAN, brand, image, category and price are
        read once from the page; each ``talla_color`` option then yields an
        item whose identifier is ``<base>_<option value>`` and whose name has
        the option text appended (unless that text is ``'- '``).  The EAN is
        attached through a ``ZyroMeta`` metadata object.

        :param response: product page response.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def first_or(values, fallback):
            # First extracted value, or *fallback* when nothing matched.
            if values:
                return values[0]
            return fallback

        base_name = hxs.select('//h1[@class="name"]/text()').extract()[0]
        base_id = hxs.select('//meta[@itemprop="sku"]/@content').extract()[0]
        sku = first_or(
            hxs.select('//div[@class="detalleMarcaProducto2"]/strong[contains(text(), "Item model number:")]/following-sibling::text()[1]').extract(),
            '')
        ean_texts = hxs.select('//div[@class="detalleMarcaProducto2"]/strong[contains(text(), "EAN retail barcodes:")]/following-sibling::text()[1]').extract()
        if ean_texts:
            ean = ean_texts[0].strip()
        else:
            ean = None
        brand = first_or(
            hxs.select('//*[@id="brandProduct"]/p/a/img/@alt').extract(), '')
        image_srcs = hxs.select('//*[@id="zoom_01"]/@src').extract()
        # Keep at most the last three breadcrumb entries.
        category = hxs.select('//*[@id="wayProd"]//a/span/text()').extract()[-3:]
        price = extract_price(
            hxs.select('//*[@id="total_dinamic"]/span/text()').extract()[0])

        for option in hxs.select('//*[@id="datesBuy"]//select[@name="talla_color"]/option'):
            option_loader = ProductLoader(item=Product(), selector=option)
            option_text = option.select('./text()').extract()[0]
            if option_text == '- ':
                option_name = base_name
            else:
                option_name = base_name + ' ' + option_text
            option_value = option.select('./@value').extract()[0]
            option_loader.add_value('identifier', base_id + '_' + option_value)
            option_loader.add_value('name', option_name)
            option_loader.add_value('sku', sku)
            if image_srcs:
                option_loader.add_value('image_url', urljoin_rfc(base_url, image_srcs[0]))
            option_loader.add_value('price', price)
            option_loader.add_value('category', category)
            option_loader.add_value('brand', brand)
            option_loader.add_value('url', response.url)

            item = option_loader.load_item()
            ean_meta = ZyroMeta()
            ean_meta['ean'] = ean
            item['metadata'] = ean_meta
            yield item
コード例 #46
0
    def parse_product(self, response):
        """Build a product item from the page and yield it or a review request.

        Name, SKU and price come from the ``SpiderSchema`` product data; the
        identifier comes from the ``productId`` form input.  When that input
        is missing the page is logged and nothing is yielded.  If the schema
        data carries an ``aggregateRating`` the item travels in the meta of a
        request handled by ``self.parse_reviews``; otherwise the item is
        yielded directly.
        """
        schema_product = SpiderSchema(response).get_product()
        selector = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', schema_product['name'])
        loader.add_value('sku', schema_product['sku'])
        loader.add_value('category', SpiderSchema(response).get_category())

        image_src = response.css('li.active a img::attr(src)').extract_first()
        if image_src:
            loader.add_value('image_url', response.urljoin(image_src))

        page_brand = response.css('.pdp-view-brand-main ::text').extract_first()
        loader.add_value('url', response.url)
        loader.add_value('price', schema_product['offers']['properties']['price'])
        # A brand passed in through the request meta wins over the page brand.
        loader.add_value('brand', response.meta.get('brand', page_brand))

        product_id = response.xpath(
            '//form/input[@name="productId"]/@value').extract_first()
        if not product_id:
            self.log('No identifier found on %s' % response.url)
            return
        loader.add_value('identifier', product_id)
        item = loader.load_item()

        promo_texts = response.xpath(
            '//li[@class="pricesale"]/text()').extract()
        promo_texts = promo_texts + response.xpath(
            '//div[@class="special-offers"]/p/text()').extract()
        promotions = u' * '.join([text.strip() for text in promo_texts])

        metadata = ToyMonitorMeta()
        ean_matches = selector.select(
            '//li[contains(text(), "EAN")]/text()').re("EAN: ([0-9]+)")
        if ean_matches:
            metadata['ean'] = ean_matches[0]
        metadata['reviews'] = []
        item['metadata'] = metadata
        item['metadata']['promotions'] = promotions

        part_number = response.xpath(
            '//form/input[@name="partNumber"]/@value').extract_first()

        if not schema_product.get('aggregateRating'):
            yield item
            return

        # The reviews API is filtered by the part number, not the identifier.
        review_url = (
            "http://api.bazaarvoice.com/data/reviews.json?Callback=jQuery111206106209812916942_1465931826753"
            "&apiversion=5.4&passkey=q3mz09yipfffc2yhguids3abz&locale=en_GB&Filter=ProductId:%s"
            "&Filter=IsRatingsOnly:false&Include=Products&Stats=Reviews&Limit=100&Offset=0&Sort=SubmissionTime:Desc"
            "&_=1465931826756") % (part_number)
        yield Request(review_url,
                      meta={'item': item, 'offset': 0},
                      callback=self.parse_reviews)
コード例 #47
0
    def parse_products(self, response):
        """Yield one item per result row, then follow the "Next" page link.

        Brand and category are taken from ``response.meta`` and are forwarded
        to the next-page request, because this same callback reads them again
        there.  Items priced below 30 get a shipping cost of 4.95.  Any "Was"
        price text is stored as the ``promotion`` of a ``JohnLewisMeta``.

        Raises IndexError when a row has no reference number or no link.
        """
        hxs = HtmlXPathSelector(response)
        meta = response.meta
        base_url = get_base_url(response)

        for product in hxs.select('//tr[@class="under_best_match"]'):
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', 'td/dl[@class="hproduct"]/dt/a/text()')
            sku = product.select('td/dl/dd[@class="reference_number"]/text()'
                                 ).extract()[0].strip()
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            url = urljoin_rfc(
                base_url,
                product.select(
                    'td/dl[@class="hproduct"]/dt/a/@href').extract()[0])
            loader.add_value('url', url)
            loader.add_value('brand', meta['brand'])
            loader.add_value('category', meta['category'])

            # The image address is embedded in the inline style attribute.
            # NOTE(review): the captured group keeps the surrounding quote
            # characters, exactly as before - confirm whether that is wanted.
            image_url = ''
            style = product.select(
                'td/dl/dd[@class="product_image"]/a/@style').extract()
            if style:
                quoted = re.search(r"('.*')", style[0])
                if quoted:
                    image_url = quoted.group(1)
            loader.add_value('image_url', image_url)
            loader.add_xpath(
                'price',
                'td[@class="price_bucket"]/ul/li[@class="total_price"]/text()')

            price_was = product.select(
                'td//li[@class="old_price"]/strong[contains(text(), "Was")]/text()'
            ).extract()
            # Collapse runs of whitespace in the "Was ..." text.
            price_was = ' '.join(price_was[0].split()) if price_was else ''
            metadata = JohnLewisMeta()
            metadata['promotion'] = price_was

            item = loader.load_item()
            item['metadata'] = metadata

            if item['price'] < 30:
                item['shipping_cost'] = 4.95

            yield item

        next_page = hxs.select('//a[@title="Next"]/@href').extract()
        if next_page:
            # Carry brand/category along; without them the rows of the next
            # page would fail on the meta lookups above.
            carried = dict((key, meta[key])
                           for key in ('brand', 'category') if key in meta)
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          callback=self.parse_products,
                          meta=carried)
コード例 #48
0
ファイル: amazonuk.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield the product when its part number matches the searched one.

        The page's "Manufacturer Part Number" is compared (case-insensitively)
        with ``response.meta['partn']`` and ``response.meta['sku']``.  On a
        match, and when the seller is not ``Towequipe``, the loaded item is
        yielded.  Otherwise the next URL from ``meta['next_results']`` is
        requested with this same callback, or, when that list is empty,
        ``meta['next_page']`` is requested.

        A page without any part number is treated as a non-match so that the
        remaining candidates are still tried.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        meta = response.meta

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
        loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
        if not loader.get_output_value(u'image_url'):
            # Fall back to BeautifulSoup when the XPath finds no image.
            soup = BeautifulSoup(response.body)
            image_url = soup.find(lambda tag: tag.name == u'img' and tag.findParent(u'tr', id=u'prodImageContainer'))
            if image_url:
                loader.add_value('image_url', image_url.get(u'src'))

        loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
        # Try the alternative price markups until one yields a value.
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//span[@class="price"]/text()')

        partn = hxs.select(u'//span[@class="tsLabel" and contains(text(),"Manufacturer Part Number")]/following-sibling::span/text()').extract()
        if not partn:
            partn = hxs.select(u'//tr/td[contains(text(),"Manufacturer Part Number")]/following-sibling::td/text()').extract()
        # Previously a missing part number raised IndexError here and the
        # fallback to the next search result never ran.
        has_partn = bool(partn)
        partn = partn[0].strip() if has_partn else u''
        log.msg('PARTN: [%s == %s]' % (partn.lower(), meta['partn'].lower()))
        log.msg('SKU: [%s == %s]' % (partn.lower(), meta['sku'].lower()))

        sold_by = hxs.select(u'//div[contains(text(),"Sold by")]/b/text()').extract()
        sold_by = sold_by[0].strip() if sold_by else u''
        log.msg(u'Sold by: %s' % sold_by)

        wanted = (meta['partn'].lower(), meta['sku'].lower())
        is_match = (has_partn and partn.lower() in wanted
                    and sold_by != u'Towequipe')
        if is_match:
            loader.add_value('sku', meta['partn'])
            loader.add_value('identifier', meta['partn'].lower())
            yield loader.load_item()
            return

        next_results = meta['next_results']
        if next_results:
            # Consume the first pending result and keep the rest in meta.
            meta['next_results'] = next_results[1:]
            yield Request(next_results[0], callback=self.parse_product, meta=response.meta)
        elif meta.get('next_page'):
            yield Request(meta['next_page'], meta=response.meta)
コード例 #49
0
ファイル: allenford.py プロジェクト: oceancloud82/scraping
    def parse_car_details(self, response):
        """Yield a single item for the car shown on the page.

        The heading text is used both as identifier and as name; the price is
        taken from the numeric parts of the "Cash Price" cell and the image
        from the last ``responsive-image`` source placeholder.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        title = selector.select('//h1/following-sibling::h2/text()').extract()
        cash_price = selector.select(
            './/td[contains(text(), "Cash Price")]//text()').re(r'[\d,.]+')
        image = selector.select(
            '//source[@class="responsive-image"]/@data-placeholder').extract()
        # Keep the (empty) list untouched when no placeholder was found.
        image = urljoin_rfc(page_base, image[-1]) if image else image

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (('identifier', title),
                             ('name', title),
                             ('price', cash_price),
                             ('url', response.url),
                             ('image_url', image)):
            loader.add_value(field, value)

        yield loader.load_item()
コード例 #50
0
    def parse(self, response):
        """Crawl category links, listed products and pagination links.

        Category and pagination requests use the default callback.  For each
        listed product an item is pre-filled from the listing (name, brand,
        url, image, sku/identifier, price, promotion) and sent in the meta of
        a request handled by ``self.parse_product``.  Products without a
        price get an empty price and a stock of 0.

        Raises IndexError when a listed product has no link or no ``psp_`` id.
        """
        base_url = get_base_url(response)

        top_links = response.xpath(
            '//li[div[contains(text(), "Audio, vision & technology")]]//a/@href'
        ).extract()
        for link in top_links:
            yield Request(response.urljoin(link))

        category_links = response.xpath(
            '//div[@id="subCategorycategories"]/ul/li/a/@href').extract()
        category_links += response.xpath(
            '//li[@id="categories"]/ul/li/a/@href').extract()
        category_links += response.xpath(
            '//div[@class="cat_detail"]/div/a/@href').extract()
        for link in category_links:
            yield Request(urljoin_rfc(base_url, link))

        for product in response.xpath('//div[contains(@id, "PSPProductList")]'):
            loader = ProductLoader(item=Product(), selector=product)

            name = "".join(
                product.xpath(
                    ".//div[contains(@class, 'product_name')]//text()"
                ).extract()).strip()

            # A missing brand node must not abort the rest of the listing.
            brand = product.xpath(
                'div/a/div[@class="brand_name"]/text()').extract()
            brand = brand[0].strip() if brand else ''

            url = urljoin_rfc(base_url,
                              product.xpath(".//a/@href").extract()[0])

            sku = product.xpath(".//div[contains(@id, 'psp')]/@id").re(
                "psp_(.+)")[0]

            # Prefer the discounted "Now" price, else the regular price.
            price = product.xpath(".//span[@class='price_now']/text()").re(
                u'Now\xa0\xa3(.*)')
            if not price:
                price = product.xpath(
                    ".//span[@class='price-actual' and @itemprop='price']/text()"
                ).extract()

            if price:
                price = price[0]
            else:
                price = ''
                loader.add_value('stock', 0)

            loader.add_value('name', name)
            loader.add_value('brand', brand)
            loader.add_value('url', url)
            loader.add_xpath('image_url', 'div//img[@class="proImg"]/@src')
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            loader.add_value('price', price)

            metadata = DemoRMeta()
            metadata['reviews'] = []
            metadata['promotion'] = ''.join(
                product.xpath(
                    './/span[@class="discount_savings"]/text()').extract())
            item = loader.load_item()
            item['metadata'] = metadata

            yield Request(item['url'],
                          meta={'item': item},
                          callback=self.parse_product)

        for page in response.xpath(
                '//div[@id="pagination"]/a/@href').extract():
            yield Request(urljoin_rfc(base_url, page))
コード例 #51
0
    def parse_product(self, response):
        """Yield the product, or one item per select option when options exist.

        Nothing is yielded when the add-to-cart form has no product id.  An
        identifier is emitted at most once (tracked in ``self.id_seen``).  When
        the spider has a truthy ``simple_run`` attribute and a
        ``matched_identifiers`` attribute, identifiers outside
        ``matched_identifiers`` are dropped.
        """
        identifier = response.xpath(
            '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
        ).extract()
        if not identifier:
            return

        def filtered_out(ident):
            # True only for a "simple run" restricted to matched identifiers.
            return (getattr(self, 'simple_run', False)
                    and hasattr(self, 'matched_identifiers')
                    and ident not in self.matched_identifiers)

        datasheet = ('//table[@class="table-bordered table-striped '
                     'table-product-datasheet"]')

        loader = ProductLoader(item=Product(), response=response)
        # Normalize URL
        loader.add_value('url', url_query_cleaner(
            response.url, parameterlist=('content', 'product'), sep=';'))
        loader.add_value('identifier', identifier)

        item_code = response.xpath(
            datasheet +
            '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
        ).extract()
        if item_code:
            loader.add_value('sku', item_code[0])
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

        price_text = response.xpath(
            '//div[@class="box-price js-price"]/span[@itemprop="price"]/text()'
        ).extract()
        if not price_text:
            loader.add_value('price', '0.0')
        else:
            # Drop '.' and turn ',' into '.' before parsing the amount.
            normalized = price_text[0].strip().replace('.', '').replace(
                ',', '.')
            loader.add_value('price', extract_price(normalized))

        image = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image:
            loader.add_value('image_url', image[0])

        manufacturer = response.xpath(
            datasheet +
            '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
        ).extract()
        if manufacturer:
            loader.add_value('brand', manufacturer[0])

        active_nav = response.xpath(
            '//ul[@class="nav"]//li[contains(@class,"item-active")]/a/text()'
        ).extract()
        if active_nav:
            loader.add_value('category', active_nav)

        availability = response.xpath(
            '//*[@id="js-availability-label"]/text()').extract()
        if availability and 'unknown' in availability[0].lower():
            loader.add_value('stock', 0)

        product = loader.load_item()
        selects = response.xpath(
            '//div[@class="input-group input-group-select"]/select')

        if not selects:
            # Plain product without options.
            if filtered_out(product['identifier']):
                return
            if product['identifier'] not in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
            return

        for select in selects:
            select_name = select.xpath('@name').extract()
            opt = select_name[0].replace('opt_', '') if select_name else ''

            for option in select.xpath('option[@value!="-2"]'):
                item = Product(product)
                option_value = option.xpath('@value').extract()
                if not option_value:
                    continue

                item['identifier'] += '-' + opt + '-' + option_value[0]

                # data-av == '100' marks the option as out of stock.
                availability_code = option.xpath('@data-av').extract()
                out_of_stock = (availability_code
                                and availability_code[0] == '100')
                item['stock'] = 0 if out_of_stock else 1

                option_label = option.xpath('text()').extract()
                if option_label:
                    item['name'] += ' - ' + option_label[0]

                if filtered_out(item['identifier']):
                    continue

                if item['identifier'] not in self.id_seen:
                    self.id_seen.append(item['identifier'])
                    yield item
コード例 #52
0
 def parse_node(self, response, selector):
     """Turn one feed node into a product item.

     Each field is read from the child element of the same purpose
     (``id``, ``name``, ``price``, ...).  The product URL and image URL are
     rewritten from ``http://`` to ``https://``; both must be present,
     otherwise IndexError is raised.
     """
     def child_text(tag):
         # All text nodes of the direct child element named `tag`.
         return selector.select('./%s/text()' % tag).extract()

     def as_https(address):
         return address.replace('http://', 'https://')

     loader = ProductLoader(response=response, item=Product())
     loader.add_value('identifier', child_text('id'))
     loader.add_value('name', child_text('name'))
     loader.add_value('price', child_text('price'))
     loader.add_value('category', child_text('category'))
     loader.add_value('sku', child_text('sku'))
     loader.add_value('url', as_https(child_text('url')[0]))
     loader.add_value('image_url', as_https(child_text('imageurl')[0]))
     loader.add_value('brand', child_text('brand'))
     return loader.load_item()
コード例 #53
0
    def parse(self, response):
        """Download the price and product feeds over SFTP and yield products.

        The newest ``CRC_PRICEFEED_Germany`` and ``PriceMonitorHandler`` files
        are fetched, unzipped next to this module and parsed.  Prices come
        from the ``GermanyRP`` price list; a product is yielded for every
        priced SKU that is also present in the product feed.  A warning is
        appended to ``self.errors`` when the price feed is 72 hours old or
        more.

        The SFTP client and the transport are closed as soon as both files
        are downloaded, including when a download fails.
        """
        prices_zip = HERE+'/CRC_PRICEFEED_Germany.zip'
        prices_xml = HERE+'/CRC_PRICEFEED_Germany.xml'
        products_zip = HERE+'/PriceMonitorHandler.zip'
        products_xml = HERE+'/PriceMonitorHandler.xml'

        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        password = "******"
        username = "******"
        try:
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            try:
                files = sftp.listdir_attr()

                last = get_last_file("CRC_PRICEFEED_Germany", files)

                # Check file updates
                date_file = datetime.fromtimestamp(last.st_mtime)
                hours_diff = (datetime.now() - date_file).total_seconds() / 3600
                if hours_diff >= 72:
                    self.errors.append('WARNING: No Update for 3 days')

                sftp.get(last.filename, prices_zip)

                last = get_last_file("PriceMonitorHandler", files)
                sftp.get(last.filename, products_zip)
            finally:
                sftp.close()
        finally:
            # Release the connection even if connecting or a download failed.
            transport.close()

        unzip(prices_zip, prices_xml)
        with open(prices_xml) as f:
            xmlfeed_sku = f.read()

        sku_prices = {}
        tree = et.fromstring(xmlfeed_sku)
        for item in tree.find('priceList[@id="GermanyRP"]').find('prices').findall('price'):
            sku_prices[item.find('skuId').text] = item.find('listPrice').text

        unzip(products_zip, products_xml)
        with open(products_xml) as f:
            xmlfeed_products = f.read()

        sku_products = {}
        tree = et.fromstring(xmlfeed_products)
        for item in tree.find('skus').findall('sku'):
            sku_id = item.find('skuID').text
            # The feed's SKU id doubles as identifier and as sku.
            sku_products[sku_id] = {
                'identifier': sku_id,
                'category': item.find('CategoryDescription').text,
                'brand': item.find('BrandDescription').text,
                'image_url': item.find('ImageURL').text,
                'url': item.find('ProductURL').text,
                'name': item.find('SkuDescription').text,
                'sku': sku_id,
                'stock': item.find('SkuQuantity').text,
            }

        for sku, price in sku_prices.iteritems():
            try:
                product = sku_products[sku]
            except KeyError:
                log.msg('SKU not found:' + sku)
                continue

            product['price'] = price
            product = Product(product)

            loader = ProductLoader(response=response, item=product)
            yield loader.load_item()
コード例 #54
0
ファイル: rituals_fr.py プロジェクト: oceancloud82/scraping
    def parse(self, response):
        """Yield the single product described by the page.

        Identifier and SKU share the same source node, the brand is always
        ``Rituals``, shipping is always zero, and stock is 1 only when a
        non-zero price could be extracted.
        """
        selector = HtmlXPathSelector(response)
        sku_xpath = "//div[@class='sku']//span[@class='value']//text()"

        title = ''.join(selector.select("//h1[@itemprop='name']/text()").extract())
        price = extract_price(
            ''.join(selector.select('//span[@itemprop="price"]/text()').extract()))
        categories = selector.select(
            '//*[@id="add-to-cart"]/@data-category').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        # The same value serves as identifier and as sku.
        loader.add_xpath('identifier', sku_xpath)
        loader.add_xpath('sku', sku_xpath)
        loader.add_value('name', title.strip())
        loader.add_value('price', price)
        # No price means the product is treated as out of stock.
        loader.add_value('stock', 1 if price else 0)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_value('brand', 'Rituals')
        loader.add_value('category', categories[0] if categories else '')
        loader.add_value('shipping_cost', Decimal(0))

        yield loader.load_item()
コード例 #55
0
ファイル: patrollersupply_com.py プロジェクト: ontiyonke/lib
    def parse_product(self, response):
        """Yield the displayed product, then request every select option.

        The name is the last ``<h1>`` text, extended with the text of the
        currently selected option: the part after its first ``~`` or, when
        there is no ``~``, the part after its first space.  Each option value
        is requested through ``cart_item_review.asp`` with this same callback.

        Raises IndexError when the name, price, category heading or product
        image cannot be found.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//h1/text()').extract()[-1].strip()
        price = hxs.select(
            u'//tr/td//font[starts-with(text(),"$")]/text()').extract()
        if price:
            price = price[0].split()[0]
        else:
            price = hxs.select(u'//tr/td[starts-with(text(),"Price:")]/text()'
                               ).extract()[0].split('$')[-1]

        category = hxs.select(u'//a[@class="linkHeading"]/text()').extract(
        )[1].split(' - ')[0].strip()

        # For some products name does not change by selecting different options
        name_selected = hxs.select(
            u'//tr/td/select/option[@selected]/text()').extract()
        if name_selected:
            option_text = name_selected[0]
            # Options such as
            # http://www.patrollersupply.com/equipment/item_703.asp carry no
            # '~', so the first space is the fallback separator.  Options
            # with neither separator leave the name unchanged.
            for separator in ('~', ' '):
                position = option_text.find(separator)
                if position != -1:
                    name += option_text[position + 1:].strip()
                    break

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        product_loader.add_xpath(
            'sku',
            u'//tr/td[contains(text(),"SKU") or contains(text(),"Part #")]/../td[last()]/text()'
        )
        product_loader.add_value('category', category)
        img = hxs.select(
            '//tr/td/img[contains(@src, "products")]/@src').extract()[0]
        img = urljoin_rfc(get_base_url(response), img)
        product_loader.add_value('image_url', img)
        product_loader.add_xpath(
            'brand',
            u'//tr/td[contains(text(),"Manufacturer")]/../td[last()]/a/text()')
        product_loader.add_value('shipping_cost', '')
        yield product_loader.load_item()

        options = hxs.select(u'//tr/td/select/option/@value').extract()
        for opt in options:
            yield Request(
                'http://www.patrollersupply.com/store/cart_item_review.asp?ID='
                + opt,
                callback=self.parse_product)
コード例 #56
0
    def parse_previous_crawl(self, response):
        """Re-emit the rows of a previous crawl's CSV as product items.

        Rows whose identifier is already in ``self.id_seen`` are skipped; new
        identifiers are recorded there.  Text columns are decoded as UTF-8,
        the price is passed through unchanged and stock is added only when
        the column is non-empty.
        """
        for row in csv.DictReader(StringIO(response.body)):
            row_id = row['identifier']
            if row_id in self.id_seen:
                continue
            self.id_seen.append(row_id)

            loader = ProductLoader(response=response, item=Product())
            for column in ('identifier', 'sku', 'name'):
                loader.add_value(column, row[column].decode('utf-8'))
            # The price is the only column loaded without decoding.
            loader.add_value('price', row['price'])
            for column in ('url', 'category', 'brand', 'image_url'):
                loader.add_value(column, row[column].decode('utf-8'))

            stock = row['stock']
            if stock:
                loader.add_value('stock', int(stock))
            yield loader.load_item()
コード例 #57
0
 def parse(self, response):
     """Yield one product item per row of the CSV in the response body.

     The reference column provides both the sku (as is) and the identifier
     (lower-cased); the image column is prefixed with ``http://``.
     """
     rows = csv.DictReader(StringIO(response.body))
     for row in rows:
         # The column header really is spelled this way in the feed lookup.
         reference = row['Luminox Rerence']

         loader = ProductLoader(response=response, item=Product())
         loader.add_value('identifier', reference.lower())
         loader.add_value('sku', reference)
         loader.add_value('brand', row['Brand'])
         loader.add_value('image_url', 'http://' + row['Image'])
         loader.add_value('name', row['Series name'].decode('utf8'))
         loader.add_value('price', row['SRP in USD'])
         yield loader.load_item()
コード例 #58
0
ファイル: google.py プロジェクト: oceancloud82/scraping
    def load_item_(self, item, browser=None, use_adurl=True):
        """Convert a scraped result dict into a loaded product item.

        :param item: mapping with ``name``, ``url`` and ``price``; optionally
            ``shipping_cost`` and ``dealer``; ``identifier`` and ``sku`` are
            read from it only when no browser is given.
        :param browser: optional mapping with a ``webdriver`` entry (current
            page) and a ``meta`` entry providing ``identifier`` and ``sku``.
        :param use_adurl: when true, an ``adurl`` query parameter found in
            the item URL replaces the URL itself.
        :return: the item produced by the loader.
        """
        if browser:
            driver = browser['webdriver']
            page_url, page_body = driver.current_url, driver.page_source
        else:
            # Placeholder page when there is no live browser session.
            page_url, page_body = 'http://www.google.co.uk/shopping', '<html></html>'
        response = HtmlResponse(url=page_url, body=page_body, encoding='utf-8')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', self._try_encoding(item['name']))

        # Item URL
        raw_url = self._try_encoding(item['url'])
        ad_target = url_query_parameter(raw_url, 'adurl')
        loader.add_value('url',
                         ad_target if (ad_target and use_adurl) else raw_url)

        loader.add_value('price', item['price'])
        loader.add_value('shipping_cost', item.get('shipping_cost', 0))
        loader.add_value('dealer', item.get('dealer', ''))

        # Identifier and sku come from the browser meta when one is given.
        id_source = browser['meta'] if browser else item
        loader.add_value('identifier', id_source['identifier'])
        loader.add_value('sku', id_source['sku'])

        return loader.load_item()
コード例 #59
0
    def parse_product_list(self, response):
        """Follow category links and, on product pages, yield every variant.

        Each link under ``list_by_category`` is requested with this same
        callback and the current meta.  When the page contains a
        ``product_page`` block, one item is yielded per form inside
        ``product_container``; every form supplies the variant's sku,
        identifier, name suffix and price.

        Raises IndexError when a form lacks one of those four values.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        links = hxs.select(u'//div[@id="list_by_category"]//a/@href').extract()
        for link in links:
            yield Request(urljoin_rfc(base_url, link),
                          callback=self.parse_product_list,
                          meta=response.meta)

        if not hxs.select(u'//div[@id="product_page"]'):
            return

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//h2[@id="longname"]/text()')
        if not loader.get_output_value('name'):
            # Fall back to the main heading when there is no long name.
            loader.add_xpath('name', u'//h1/text()')
        loader.add_value('category', response.meta.get('category', 'spices'))

        # Prefer the inline image; otherwise use the link around it.
        image = (
            hxs.select(u'//div[contains(@class,"image") and contains(@class,"db_content")]/img/@src').extract()
            or hxs.select(u'//div[contains(@class,"image") and contains(@class,"db_content")]/a/@href').extract()
        )
        if image:
            loader.add_value('image_url', urljoin_rfc(base_url, image[0]))

        base_product = loader.load_item()
        for form in hxs.select(u'//div[@id="product_container"]//form'):
            variant = Product(base_product)

            variant['sku'] = form.select(u'.//input[starts-with(@name,"m")]/@name').extract()[0]
            variant['identifier'] = form.select(u'.//input[starts-with(@name,"m") and @type="text"]/@name').extract()[0]
            variant_label = form.select(u'.//li[@class="product"]/text()').extract()[0].strip()
            variant['name'] = variant['name'] + ' ' + variant_label
            variant['price'] = extract_price(form.select(u'.//li[@class="price"]/text()').extract()[0])
            yield variant
コード例 #60
0
    def parse_product(self, response):
        """Parse a product page and yield one item per purchasable variant.

        The common fields (sku, category, name, brand, image_url, url,
        identifier) are loaded once. Pages for which no identifier could
        be extracted are logged and skipped.

        If the response body embeds a ``Product.Config(...)`` JSON blob,
        one item is yielded per child product referenced by its
        attributes: the option labels are appended to the name, the child
        product id is appended to the identifier, and the price is taken
        from the child's ``finalPrice`` (falling back to ``price``).

        Otherwise a single item is yielded, priced from the
        ``special-price`` element inside a form or, when that yields a
        falsy price, from any price inside the ``product-right`` container.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        brands = hxs.select(
            '//a[contains(@href, "brands/")]/span/text()').extract()

        loader = ProductLoader(response=response, item=Product())
        loader.add_xpath('sku', '//input[@name="product"]/@value')
        loader.add_value('category', '')
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

        # Computed once for the whole loop. The name may be missing, so
        # fall back to an empty string instead of calling .upper() on None.
        name_upper = (loader.get_output_value('name') or '').upper()

        brand = ''
        for b in brands:
            candidate = b.strip().upper()
            # A blank text node is a substring of every name; skipping it
            # keeps it from winning the match ahead of a real brand.
            if candidate and candidate in name_upper:
                brand = b
                break

        loader.add_value('brand', brand)
        img = hxs.select('//ul[@id="product-page-slider"]//img/@src').extract()
        img = urljoin_rfc(base_url, img[0]) if img else ''
        loader.add_value('image_url', img)
        loader.add_value('url', response.url)
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        item = loader.load_item()

        if not item.get('identifier', None):
            log.msg('Product without identifier, URL: ' + response.url)
            return

        # Raw string so the backslash escapes reach the regex engine
        # untouched; the dot is escaped to match the literal call name.
        config_match = re.search(r'Product\.Config\((.*)\);', response.body)
        if config_match:
            data = json.loads(config_match.group(1))

            # Map each child product id to the labels of all options that
            # reference it. Each label is appended after ' - ', so the
            # accumulated string starts with that separator; this is kept
            # as-is so the emitted names stay unchanged.
            product_options = {}
            for attr in data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        product_options[product] = ' - '.join(
                            (product_options.get(product, ''),
                             option['label']))

            for option_id, option_name in product_options.iteritems():
                option_item = deepcopy(item)
                try:
                    option_item['price'] = extract_price(
                        data['childProducts'][option_id]['finalPrice'])
                except Exception:
                    # Deliberately broad: any failure to read or parse
                    # 'finalPrice' falls back to 'price'. Unlike a bare
                    # except, this lets KeyboardInterrupt and SystemExit
                    # propagate.
                    option_item['price'] = extract_price(
                        data['childProducts'][option_id]['price'])

                option_item['name'] = option_item['name'] + ' ' + option_name
                option_item['identifier'] = (
                    option_item['identifier'] + '-' + option_id)
                yield option_item

        else:
            item['price'] = extract_price(''.join(
                hxs.select(
                    '//form//p[@class="special-price"]//span[@class="price"]/text()'
                ).extract()))
            if not item['price']:
                # No usable special price: use the price shown in the
                # product-right container instead.
                item['price'] = extract_price(''.join(
                    hxs.select(
                        '//div[@class="product-right"]//span[@class="price"]/text()'
                    ).extract()))
            yield item