Beispiel #1
0
    def parse_product(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The product name doubles as the item identifier.  When either the
        name or the price cannot be found, an error is logged and nothing is
        yielded.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        names = selector.select("//div[@class='product-overview']/h1[1]/text()").extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = names[0]

        strong_parts = selector.select("//div[@class='product-overview']/div/p/span[@class='price']/strong/text()").extract()
        if not strong_parts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return
        amount = strong_parts[0]

        # Append the first text node found directly inside the price span
        # (when there is one) to the text taken from its <strong> child.
        span_parts = selector.select("//div[@class='product-overview']/div/p/span[@class='price']/text()").extract()
        if span_parts:
            amount = amount + span_parts[0]

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (('identifier', title), ('name', title),
                             ('url', page_url), ('price', amount)):
            loader.add_value(field, value)
        yield loader.load_item()
Beispiel #2
0
    def parse_search(self, response):
        """Yield a ``Product`` for every usable entry of a search-result list.

        Entries without a title link are skipped silently; entries that have
        a title but lack a URL or a price are logged and skipped.
        """
        selector = HtmlXPathSelector(response)
        rows = selector.select("//div[@id='content']/div[@id='searchcontent']/div[@class='hasResults']/form/div[@id='switchview']/ol/li")

        for row in rows:
            titles = row.select("ul/li[@class='producttitle']/h4/a[1]/text()").extract()
            if not titles:
                continue
            title = titles[0]

            hrefs = row.select("ul/li[@class='producttitle']/h4/a[1]/@href").extract()
            if not hrefs:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
                continue

            amounts = row.select("ul/li[contains(@class, 'pricing')]/ul/li[contains(@class, 'price')]/text()").extract()
            if not amounts:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
                continue

            # The title is used both as display name and as identifier.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', title)
            loader.add_value('name', title)
            loader.add_value('url', hrefs[0])
            loader.add_value('price', amounts[0])
            yield loader.load_item()
Beispiel #3
0
    def parse_product(self, response):
        """Yield one ``Product`` per price found on a product page.

        A page either shows a single price (replaced by the "special" price
        when one is found) or a ``<select class="smalltextblk">`` whose
        options each hold a variant label followed by a pound price.

        Non-HTML responses are ignored.  A page without an ``<h1>`` and
        options that do not match the "label £price" pattern are logged and
        skipped; previously either case raised and aborted the callback,
        losing every item on the page.
        """
        import logging  # local import: not known to be imported at file level

        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        names = hxs.select('//h1/text()').extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % response.url)
            return
        name = names[0]

        multiple_prices = hxs.select('//select[@class="smalltextblk"]/option/text()').extract()
        single_special_price = hxs.select('//span/text()').re('\xa3(.*[0-9]+)')
        single_price = hxs.select('//td[@class="ProductPrice"]/text()').re('\xa3(.*[0-9])')

        products_data = []

        if single_price and not multiple_prices:
            # A special price, when present, takes precedence over the regular one.
            price = single_special_price[0] if single_special_price else single_price[0]
            products_data.append((name, price))
        else:
            # The first <option> is skipped (presumably a placeholder such as
            # "please select" -- TODO confirm against the site markup).
            for option_text in multiple_prices[1:]:
                match = re.match('(.*)\xa3(.*\.[0-9]+)', option_text)
                if match is None:
                    logging.error("ERROR! UNPARSABLE OPTION! %s %s" % (response.url, option_text))
                    continue
                variant, price = match.groups()
                products_data.append((name + ' ' + variant, price))

        for product_name, product_price in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', product_name)
            loader.add_value('price', product_price)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #4
0
    def parse_table_options_type2_single_product_page(self, response):
        """Yield one ``Product`` per variant row of a ``product_body`` table.

        The base product name is read once from the page header; every table
        row after the first supplies a variant label (first cell) and a
        price (third cell).  A row whose price matches ``prices_range_regex``
        is not turned into an item; instead the same URL is requested again
        with ``parse_product_list`` as callback.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url

        headings = selector.select("//div[@id='mainContent']/center/table/tr[1]/td[1]/p[2][not(@class)][*[local-name()='strong']]/strong[1]//text()").extract()
        if not headings:
            logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
            return
        base_name = headings[0]

        rows = selector.select("//div[@id='mainContent']/center/table//table[@class='product_body']/tr[position()>1]")
        for row in rows:
            labels = row.select("td[1]//text()").extract()
            if not labels:
                logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
                continue
            variant = labels[0]

            amounts = row.select('td[3]//text()').extract()
            if not amounts:
                logging.error("ERROR!! NO PRICE!! %s %s" % (base_name, response.url))
                continue
            amount = amounts[0]

            if re.search(prices_range_regex, amount):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', page_url)
            loader.add_value('name', "%s %s" % (base_name, variant))
            loader.add_value('price', amount)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #5
0
    def parse(self, response):
        """Follow pagination and yield a ``Product`` for each listed product.

        Every "next page" link is requested with this method as callback.
        Each anchor inside the ``mainProducts`` area that contains a product
        name becomes one item; the price is read from the "orange" span and
        falls back to the "gray" span when the former yields nothing.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        next_pages = hxs.select("//div[@class='navArea']/div[@class='navAreaPagging fr']/span[@class='paggingBtnNext']/a/@href").extract()
        for href in next_pages:
            yield Request(urljoin_rfc(base_url, href), callback=self.parse)

        content = hxs.select("//div[@class='mainProducts']")
        for product_ in content.select(".//a"):
            name = product_.select(".//ul/li/span[@class='productName']/text()").extract()
            if not name:
                # Anchors without a product name are not product tiles.
                continue

            url = product_.select(".//@href").extract()
            price = product_.select(".//ul//li/ul/li[1]/span[@class='orange']/text()").re(r'\xa3(.*)')
            if not price:
                price = product_.select(".//ul/li/ul/li[1]/span[@class='gray']/text()").re(r'\xa3(.*)')

            # The extracted lists are handed to the loader as-is.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', url)
            loader.add_value('price', price)
            # load_item() is called exactly once; the original built the item
            # twice and threw the first result away.
            yield loader.load_item()

            
            
        """content = hxs.select("//div[@class='mainProducts']")
Beispiel #6
0
    def parse_product(self, response):
        """Yield one ``Product`` per price found on a product page.

        When a single price is present (``//span/b`` text after a pound
        sign) the page yields one item named after the ``<h1>``.  Otherwise
        every ``<option>`` text is parsed as "label ... £price" and yields one
        item named after the label alone.

        Non-HTML responses are ignored.  A page without an ``<h1>`` and
        options that do not match the expected pattern are logged and
        skipped; previously either case raised and aborted the callback.
        """
        import logging  # local import: not known to be imported at file level

        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        names = hxs.select('//h1/text()').extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % response.url)
            return
        name = names[0]

        multiple_prices = hxs.select('//option/text()').extract()
        single_price = hxs.select('//span/b/text()').re('\xa3(.*)')

        products_data = []
        if single_price:
            products_data.append((name, single_price[0]))
        else:
            for option_text in multiple_prices:
                # Drop tabs/newlines embedded in the option text before matching.
                option_text = re.sub('[\t\r\n]', '', option_text).strip()
                match = re.match('(.*[0-9,a-z,A-Z\)]).*\xa3(.*[0-9])', option_text)
                if match is None:
                    logging.error("ERROR! UNPARSABLE OPTION! %s %s" % (response.url, option_text))
                    continue
                products_data.append(match.groups())

        for product_name, product_price in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', product_name)
            loader.add_value('price', product_price)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #7
0
    def parse_product(self, response):
        """Scrape a product page marked up with ``itemprop`` attributes.

        Yields one ``Product`` carrying URL, name, price and a SKU taken from
        the URL path.  When the name or price is missing, a message is
        printed and nothing is yielded.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        names = selector.select('//h1[@itemprop="name"]/text()').extract()
        if not names:
            print("ERROR!! NO NAME!! %s" % page_url)
            return

        prices = selector.select('//span[@itemprop="price"]/text()').extract()
        if not prices:
            print("ERROR!! NO PRICE!! %s" % page_url)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', names[0])
        # When several price nodes match, the last one is used.
        loader.add_value('price', prices[-1])
        # The SKU is the second-to-last "/"-separated component of the URL.
        loader.add_value('sku', response.url.split('/')[-2])
        yield loader.load_item()
Beispiel #8
0
    def parse_product(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The name is assembled from all text fragments of the page title with
        runs of whitespace collapsed to a single space; it is also used as
        the identifier.  A missing name or price is logged and ends the
        callback without yielding.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        title_parts = selector.select("//h1[@class='pageTitle']/span/text()").extract()
        if not title_parts:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = re.sub("[\s]+", " ", " ".join(title_parts))

        amounts = selector.select("//div[contains(@class, 'productDetail')]//span[contains(@class, 'currentPrice')]/text()").extract()
        if not amounts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', title)
        loader.add_value('name', title)
        loader.add_value('url', page_url)
        loader.add_value('price', amounts[0])
        yield loader.load_item()
Beispiel #9
0
    def parse_search(self, response):
        """Follow pagination links and yield a ``Product`` per listed article.

        Pagination hrefs equal to ``'#'`` are ignored; all others are
        requested as-is with this method as callback.  Articles without a
        title are skipped silently, those lacking a URL or price are logged
        and skipped.
        """
        selector = HtmlXPathSelector(response)

        # Pagination.
        for href in selector.select("//ul[@class='pagination']//a/@href").extract():
            if href == '#':
                continue
            yield Request(href, callback=self.parse_search)

        # Product tiles.
        tiles = selector.select("//article[contains(@class, 'product')]/div[contains(@class, 'desc')]")
        for tile in tiles:
            titles = tile.select(".//div/header[@class='productTitle']/a/text()").extract()
            if not titles:
                continue
            # Trim, then collapse any internal whitespace runs.
            title = re.sub("[\s]+", " ", titles[0].strip())

            hrefs = tile.select(".//div/header[@class='productTitle']/a/@href").extract()
            if not hrefs:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
                continue

            amounts = tile.select(".//div//span[@class='currentPrice']/ins/text()").extract()
            if not amounts:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', title)
            loader.add_value('name', title)
            loader.add_value('url', hrefs[0])
            loader.add_value('price', amounts[0].strip())
            yield loader.load_item()
Beispiel #10
0
    def parse_options(self, response):
        """Scrape a product page and yield one ``Product`` with a mapped SKU.

        Name and price are each looked up in a primary location with a
        fallback.  The manufacturer part number is read from the spec
        paragraph labelled ``'Mfg Part #:'`` and translated to a SKU via
        ``skus.csv`` (column 3 is matched, column 0 is the SKU).  The SKU is
        left empty when no part number is found or no CSV row matches.
        Nothing is yielded unless both a name and a price were found.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//div[@id="skuinfo"]/h1[@itemprop="name"]/text()').extract()
        if not name:
            name = hxs.select('//div[@class="details"]/h1/text()').extract()

        price = "".join(hxs.select('//div[@class="club"]/span[@itemprop="Price"]/text()').re(r'([0-9\,\. ]+)')).strip()
        if not price:
            price = "".join(hxs.select('//div[@class="details"]/div[@class="special"]/text()').re(r'([0-9\,\. ]+)')).strip()

        model_no = None
        for spec in hxs.select('//div[@id="specs"]/div/p[@class="specs"]'):
            # Spec paragraphs without a <span> label are simply ignored; the
            # original relied on a bare ``except:`` to get the same effect.
            labels = spec.select('./span/text()').extract()
            if labels and labels[0] == 'Mfg Part #:':
                # No break: if several paragraphs carry the label, the last wins.
                model_no = "".join(spec.select("./text()").extract()).strip()

        if not (name and price):
            return

        sku_ = ''
        if model_no:
            # ``with`` guarantees the CSV handle is closed, also on early break.
            with open(os.path.join(HERE, 'skus.csv')) as csv_handle:
                for row in UnicodeReader(csv_handle):
                    if row[3] == model_no:
                        sku_ = row[0]
                        break

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name[0])
        product_loader.add_value('sku', sku_)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        yield product_loader.load_item()
Beispiel #11
0
    def parse_item(self, response):
        """Yield a ``Product`` from a detail page, or follow search results.

        When the product-name row is present the page is treated as a
        product detail page: the custom price is preferred and the regular
        price row is the fallback.  Otherwise the page is treated as a
        search-result page and each product link found is requested with
        this method as callback.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        name = hxs.select("//tr[@id='ProductDetail11_trProductName']/td/text()").extract()
        if name:
            name = name[0].strip()
            url = response.url
            price = hxs.select("//tr[@id='ProductDetail11_trCustomPrice']/td/font/b/text()").extract()
            if not price:
                price = hxs.select("//tr[@id='ProductDetail11_trPrice']/td/text()").extract()

            l = ProductLoader(item=Product(), response=response)
            # Pass the unicode name directly: str(name) raised
            # UnicodeEncodeError for names with non-ASCII characters, while
            # for ASCII names the value is unchanged.
            l.add_value('identifier', name)
            l.add_value('name', name)
            l.add_value('url', url)
            l.add_value('price', price)
            yield l.load_item()
        else:
            # may be several products
            products = hxs.select("//table[@id='SearchTemplate13_DataGrid1']// \
                                     table[@id='SearchTemplate13_DataGrid1__ctl3_ProductInfoTable']")
            for product in products:
                # NOTE(review): this XPath starts with "//", so it searches the
                # whole document rather than just ``product``; with several
                # matched tables each iteration would yield the same link.
                # Left unchanged because the ids are specific -- confirm intent.
                url = product.select("//tr[@id='SearchTemplate13_DataGrid1__ctl3_ProductNameRow']/td/a/@href").extract()
                if url:
                    yield Request(urljoin_rfc(base_url, url[0]), callback=self.parse_item)
Beispiel #12
0
    def parse_product(self, response):
        """Yield products (or follow-up requests) for each listed item.

        For every item box on the page: if it shows no price block, its
        "more info" link is requested (default callback); otherwise a
        ``Product`` is yielded.  Items without a "more info" link or without
        a name are skipped.  Previously a missing link raised ``IndexError``
        outside the ``try`` and aborted the whole page.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        products = hxs.select('//div[@class="content-box"]/div[contains(@class,"item")]')
        for item in products:
            links = item.select('.//div[@class="moreinfo"]/a/@href').extract()
            if not links:
                continue
            url = urljoin_rfc(self.URLBASE, links[0])

            # NOTE(review): this extracts the whole <div> markup, not its
            # text; presumably the loader's price processor pulls the number
            # out of it -- confirm.
            price = item.select('.//div[@class="item-price"]').extract()
            if not price:
                yield Request(url)
                continue

            names = item.select('.//div[@class="item-name"]/a/text()').extract()
            if not names:
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', names[0])
            loader.add_value('price', price)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #13
0
    def parse_search(self, response):
        """Follow pagination links and yield a ``Product`` per search result.

        Pagination hrefs are resolved against the page's base URL.  Results
        without a name are skipped silently; results lacking a URL or price
        are logged and skipped.  The product URL is passed to the loader as
        extracted.
        """
        selector = HtmlXPathSelector(response)
        root_url = get_base_url(response)

        # Pagination.
        for href in selector.select("//div[@class='pagination top']//a/@href").extract():
            yield Request(urljoin_rfc(root_url, href), callback=self.parse_search)

        # Result entries.
        for entry in selector.select("//div[@class='search-result']/form/ul/li"):
            titles = entry.select("div[@class='prd-infos']/a/p[@class='prd-name']/strong/text()").extract()
            if not titles:
                continue
            title = titles[0]

            hrefs = entry.select("div[@class='prd-infos']/a/@href").extract()
            if not hrefs:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
                continue

            amounts = entry.select("div[@class='prd-actions']/p[@class='prd-amount']/strong/text()").extract()
            if not amounts:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', title)
            loader.add_value('name', title)
            loader.add_value('url', hrefs[0])
            loader.add_value('price', amounts[0])
            yield loader.load_item()
Beispiel #14
0
    def parse_products(self, response):
        """Yield a ``Product`` for every cell of the product data list.

        Each qualifying table cell supplies a name (all text of the model
        link, concatenated), a URL resolved against the base URL, and the
        price text after the pound sign.  A cell missing any of these is
        logged and skipped; the remaining cells are still processed (the
        original ``return`` dropped every later product on the page).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        items = hxs.select("//table[@id='ProductDataList']/tr/td[div[contains(@id, 'ModelLinkCell')]]")
        for item in items:
            name = item.select(".//a[contains(@id, 'ModelLink')]//text()").extract()
            if not name:
                logging.error("ERROR! NO NAME! %s" % response.url)
                continue
            name = "".join(name)

            url = item.select(".//a[contains(@id, 'ModelLink')]/@href").extract()
            if not url:
                logging.error("ERROR! NO URL! %s %s" % (name, response.url))
                continue
            url = urljoin_rfc(base_url, url[0])

            price = item.select("div[contains(@id, 'ModelPrice')]//td[@class='Label11']/text()").re(u'\xa3(.*)')
            if not price:
                logging.error("ERROR! NO PRICE! %s %s" % (url, name))
                continue
            price = price[0]

            l = ProductLoader(item=Product(), response=response)
            l.add_value('identifier', name)
            l.add_value('name', name)
            l.add_value('url', url)
            l.add_value('price', price)
            yield l.load_item()
Beispiel #15
0
    def parse_item(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The price text is converted from "1.234,56" notation to "1234.56"
        by dropping dots and turning the comma into a dot.  Delivery cost is
        not added to the price.  A missing name or price is logged and ends
        the callback without yielding.
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@class='product-shop']/div[@class='product-name']/h2/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//div[@class='product-shop']/div[@class='price-box']//span[@class='price']/text()").extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = price[0].replace(".", "").replace(",", ".")

        l = ProductLoader(item=Product(), response=response)
        # Pass the unicode name directly: str(name) raised UnicodeEncodeError
        # for names with non-ASCII characters, while for ASCII names the
        # value is unchanged.
        l.add_value('identifier', name)
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', price)
        yield l.load_item()
Beispiel #16
0
    def parse(self, response):
        """Scrape one product page and yield a ``Product`` tagged with a SKU.

        The SKU is taken from ``response.meta['sku']``.  Name and price are
        stripped of surrounding whitespace.  When either is missing an error
        including the SKU is logged and nothing is yielded.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url
        sku = response.meta['sku']

        titles = selector.select("//h1[contains(@class, 'parseasinTitle')]/span/text()").extract()
        if not titles:
            logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, page_url))
            return
        title = titles[0].strip()

        amounts = selector.select("//table[@class='product']//span[@id='actualPriceValue']/b/text()").extract()
        if not amounts:
            logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, title, page_url))
            return
        amount = amounts[0].strip()

        loader = ProductLoader(item=Product(), response=response, selector=selector)
        for field, value in (('url', page_url), ('name', title),
                             ('price', amount), ('sku', sku)):
            loader.add_value(field, value)
        yield loader.load_item()
Beispiel #17
0
    def parse_item(self, response):
        """Scrape one product page and yield a ``Product`` priced without eco tax.

        The displayed price is converted with ``extract_price2uk`` to a
        ``Decimal``; when an eco-tax amount is shown on the page it is
        subtracted from that price.  A missing name or price is logged and
        ends the callback without yielding.
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//p[@class='price']/span[@class='our_price_display']/span/text()").extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = Decimal(extract_price2uk(price[0]))

        eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
        if eco_tax:
            # Drop non-ASCII characters before printing and parsing.
            eco_tax_text = eco_tax[0].encode("ascii", "ignore")
            print("Found eco tax %s" % eco_tax_text)
            price -= Decimal(extract_price2uk(eco_tax_text))

        l = ProductLoader(item=Product(), response=response)
        # Pass the unicode name directly: str(name) raised UnicodeEncodeError
        # for names with non-ASCII characters, while for ASCII names the
        # value is unchanged.
        l.add_value("identifier", name)
        l.add_value("name", name)
        l.add_value("url", url)
        l.add_value("price", unicode(price))
        yield l.load_item()
Beispiel #18
0
    def parse_product(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The name is every text fragment under the product header ``<h1>``
        joined with spaces; it also serves as identifier.  A missing name or
        price is logged and ends the callback without yielding.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        title_parts = selector.select("//div[@id='product-content']//div[@id='product-header']/h1//text()").extract()
        if not title_parts:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = " ".join(title_parts)

        amounts = selector.select("//div[@id='product-content']//div[@id='productPrice']//p[@id='product-price']/text()").extract()
        if not amounts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value("identifier", title)
        loader.add_value("name", title)
        loader.add_value("url", page_url)
        loader.add_value("price", amounts[0])
        yield loader.load_item()
Beispiel #19
0
    def parse_item(self, response):
        """Scrape one product page and yield a ``Product`` priced incl. delivery.

        The name is the text of the last breadcrumb link.  Item price and
        delivery price are read from the first and second entries of the
        details list, have their commas removed, and are summed as
        ``Decimal``.  If the name, price or delivery price is missing, an
        error is logged and nothing is yielded.
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@id='pageContentSub']/div[@class='moduleBox']/\
                             div[@id='top_breadcrumb_link']/a[last()]/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        # adding product
        price = hxs.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
                             div[@class='content']/div[@class='details']/ul/li[1]/span[2]/text()").re(u'€ (.*)')
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = price[0].replace(",", "")
        price_delivery = hxs.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
                             div[@class='content']/div[@class='details']/ul/li[2]/span[2]/text()").re(u'€ (.*)')
        if not price_delivery:
            logging.error("NO PRICE DELIVERY! %s" % url)
            return
        price_delivery = price_delivery[0].replace(",", "")
        price = Decimal(price) + Decimal(price_delivery)

        l = ProductLoader(item=Product(), response=response)
        # Pass the unicode name directly: str(name) raised UnicodeEncodeError
        # for names with non-ASCII characters, while for ASCII names the
        # value is unchanged.
        l.add_value('identifier', name)
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', price)
        yield l.load_item()
Beispiel #20
0
    def parse_search(self, response):
        """Follow pagination links and yield a ``Product`` per grid entry.

        Pagination hrefs are requested as extracted.  Entries without a name
        are skipped silently; entries lacking a URL or price are logged and
        skipped.
        """
        selector = HtmlXPathSelector(response)

        # Pagination.
        for href in selector.select("//ul[@id='pagination']/li/a/@href").extract():
            yield Request(href, callback=self.parse_search)

        # Product grid.
        for entry in selector.select("//div[@class='column_one grid_list']/div"):
            titles = entry.select("div/div[@class='info']/div/h2/a/text()").extract()
            if not titles:
                continue
            title = titles[0]

            hrefs = entry.select("div/div[@class='info']/div/h2/a/@href").extract()
            if not hrefs:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
                continue

            amounts = entry.select("div/div[@class='pricebox']/p[@id='product-price']/text()").extract()
            if not amounts:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("identifier", title)
            loader.add_value("name", title)
            loader.add_value("url", hrefs[0])
            loader.add_value("price", amounts[0])
            yield loader.load_item()
Beispiel #21
0
    def parse_product_list_columns(self, response):
        """Yield one ``Product`` per column of a ``product_body`` table.

        The number of columns is the cell count of the table's third row.
        For each column the name comes from row 3 and the price from row 4.
        A price matching ``prices_range_regex`` is not turned into an item;
        instead the same URL is requested again with ``parse_product_list``.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url

        # XPath count() comes back as a float rendered as text.
        raw_count = selector.select("count(//table[@class='product_body']/tr[3]/td)").extract()[0]
        column_total = int(float(raw_count))

        for column in range(1, column_total + 1):
            titles = selector.select("//table[@class='product_body']/tr[3]/td[%d]/p//text()" % column).extract()
            if not titles:
                logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
                continue
            title = titles[0]

            amounts = selector.select("//table[@class='product_body']/tr[4]/td[%d]/p[1]/strong[last()]//text()" % column).extract()
            if not amounts:
                logging.error("ERROR!! NO PRICE!! %s %s" % (title, response.url))
                continue
            amount = amounts[0]

            if re.search(prices_range_regex, amount):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', page_url)
            loader.add_value('name', title)
            loader.add_value('price', amount)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #22
0
    def parse_product(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The product title is used both as name and as identifier.  A missing
        name or price is logged and ends the callback without yielding.
        """
        hxs = HtmlXPathSelector(response)

        url = response.url

        name = hxs.select(
            "//form[@id='handleBuy']/div[@class='buying']/h1[@class='parseasinTitle']/span/text()").extract()
        if not name:
            logging.error("ERROR! NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//div[@id='priceBlock']//tr[@id='actualPriceRow']//b[@class='priceLarge']/text()").extract()
        if not price:
            logging.error("ERROR! NO PRICE! %s %s" % (url, name))
            return
        price = price[0]

        # The unused ``description`` local of the original was dropped.
        l = ProductLoader(item=Product(), response=response)
        l.add_value('identifier', name)
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', price)
        yield l.load_item()
Beispiel #23
0
    def parse_several_products_single_product_page(self, response):
        """Yield one ``Product`` per qualifying paragraph of a product table.

        Each class-less paragraph that contains a ``<strong>`` in the second
        column of the ``product_body`` table is one product: the first
        ``<strong>`` holds the name, the second ``<strong>`` or the last
        ``<b>`` holds the price.  A price matching ``prices_range_regex`` is
        not turned into an item; instead the same URL is requested again
        with ``parse_product_list``.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url

        paragraphs = selector.select("//table[@class='product_body']/tr/td[2]/p[not(@class)][*[local-name()='strong']]")
        for paragraph in paragraphs:
            titles = paragraph.select("strong[1]//text()").extract()
            if not titles:
                logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
                continue
            title = titles[0]

            amounts = paragraph.select('strong[2]/text() | b[last()]/text()').extract()
            if not amounts:
                logging.error("ERROR!! NO PRICE!! %s %s" % (title, response.url))
                continue
            amount = amounts[0]

            if re.search(prices_range_regex, amount):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', page_url)
            loader.add_value('name', title)
            loader.add_value('price', amount)
            loader.add_value('sku', '')
            yield loader.load_item()
Beispiel #24
0
    def parse_product(self, response):
        """Yield a ``Product`` for each priced cell in the right-hand column.

        Candidate cells are ``<td>`` elements that either contain a pound
        sign in their own text or have an ``<h1>`` child.  The name is read
        from the cell's link and/or heading, the URL from the link (falling
        back to the page URL), and the price from the cell text or, failing
        that, from the ``_EKM_PRODUCTPRICE`` span.  Cells without any price
        are skipped.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)
        cells = selector.select(u'//div[@class="rightcol"]//td[contains(child::text(),"\xa3")] | //div[@class="rightcol"]//td[child::h1]')

        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            for name_xpath in ('./a/text()', './h1/text()'):
                loader.add_xpath('name', name_xpath)

            hrefs = cell.select('./a/@href').extract()
            if hrefs:
                target = urljoin_rfc(get_base_url(response), hrefs[0])
            else:
                target = response.url
            loader.add_value('url', target)

            # Try the cell's own text first, then the dedicated price span.
            amount = (cell.select('./text()').re('\xa3(.*)')
                      or cell.select('.//span[@id="_EKM_PRODUCTPRICE"]/text()').extract())
            if not amount:
                continue

            loader.add_value('price', amount)
            yield loader.load_item()
Beispiel #25
0
    def parse_item(self, response):
        """Scrape one product page and yield a ``Product`` priced incl. delivery.

        Both the item price and the delivery price are converted from
        "1.234,56" notation to "1234.56" and summed as ``Decimal``.  The
        identifier is the name with non-ASCII characters dropped.  If the
        name, price or delivery price is missing, an error is logged and
        nothing is yielded.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        titles = selector.select("//h1[@class='product-name']/text()").extract()
        if not titles:
            logging.error("NO NAME! %s" % page_url)
            return
        title = titles[0]

        item_amounts = selector.select("//div[@class='price-box']//span[@class='price']/text()").re(u'€ (.*)')
        if not item_amounts:
            logging.error("NO PRICE! %s" % page_url)
            return
        item_amount = item_amounts[0].replace(".", "").replace(",", ".")

        # The delivery cost is loose text between the price box and the
        # add-to-cart holder.
        delivery_amounts = selector.select("//div[@class='product-shop']/\
            text()[(preceding::div[@class='price-box']) and (following::div[@class='add-to-holder'])]"
        ).re(u'€\xa0([\d,.]*)')
        if not delivery_amounts:
            logging.error("NO PRICE DELIVERY! %s" % page_url)
            return
        delivery_amount = delivery_amounts[0].replace(".", "").replace(",", ".")

        total = Decimal(item_amount) + Decimal(delivery_amount)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', title.encode("ascii", "ignore"))
        loader.add_value('name', title)
        loader.add_value('url', page_url)
        loader.add_value('price', total)
        yield loader.load_item()
Beispiel #26
0
    def parse_product(self, response):
        """Scrape one product page and yield a single loaded ``Product``.

        The price is read from the "pounds" span and, when that yields
        nothing, from the "newPrice" span; all matched text fragments are
        concatenated.  The name also serves as identifier.  A missing name or
        price is logged and ends the callback without yielding.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        titles = selector.select("//div[@id='productDetail']/form/fieldset/h2/text()").extract()
        if not titles:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = titles[0]

        price_parts = selector.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
                              span[@class='pounds']/text()").extract()
        if not price_parts:
            # Fall back to the alternative price markup.
            price_parts = selector.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
                              span[@class='newPrice']/text()").extract()
        if not price_parts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return
        amount = "".join(price_parts)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', title)
        loader.add_value('name', title)
        loader.add_value('url', page_url)
        loader.add_value('price', amount)
        yield loader.load_item()
Beispiel #27
0
    def parse_product(self, response):
        """Yield products whose on-page manufacturer id matches the request's.

        The page is parsed with BeautifulSoup.  Every distinct grandparent
        of a "ProductDetail" link is treated as one product container.  A
        ``Product`` is yielded only when the text of the container's first
        ``<nobr>`` equals ``response.meta['mfrgid']`` (both stripped and
        lower-cased).  The SKU comes from ``response.meta['sku']``.

        Containers lacking a second ``<font>`` or any ``<nobr>`` are skipped
        instead of raising.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body)

        links = soup.findAll('a', href=re.compile('ProductDetail'))
        # A set removes duplicates when several links share one container.
        products = {link.parent.parent for link in links}

        for product in products:
            fonts = product.findAll('font')
            first_nobr = product.find('nobr')
            if len(fonts) < 2 or first_nobr is None:
                # Not laid out like a product entry; nothing to compare against.
                continue

            name = fonts[1].text
            price = product.find('nobr', text=re.compile('\$'))
            url = product.find('a', href=re.compile('ProductDetail'))
            if url:
                url = urljoin_rfc(get_base_url(response), url['href'])
            else:
                url = response.url

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name)
            product_loader.add_value('price', price)
            # The URL is added once; the original added it twice.
            product_loader.add_value('url', url)
            product_loader.add_value('sku', response.meta['sku'])

            site_mfrgid = first_nobr.text
            if site_mfrgid:
                site_mfrgid = site_mfrgid.strip().lower()
                mfrgid = response.meta['mfrgid'].strip().lower()
                if site_mfrgid == mfrgid:
                    yield product_loader.load_item()
Beispiel #28
0
    def parse(self, response):
        """Follow category links and scrape the product rows of this page.

        Every link inside the ``dvWrapControl732`` container is requested
        again with this same callback.  Each regular or alternating row of
        the ``ProductGroup`` table then becomes one ``Product`` whose name
        (tags removed) also serves as an ASCII-only identifier.  Rows that
        have no name or no price are reported on stdout and skipped.
        """
        selector = HtmlXPathSelector(response)
        root_url = get_base_url(response)

        for href in selector.select("//div[@id='dvWrapControl732']//a/@href").extract():
            yield Request(urljoin_rfc(root_url, href), callback=self.parse)

        rows = selector.select("//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] |\
                            //table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']")
        for row in rows:
            descriptions = row.select("td[@id='tdProductGroupDisplayDescription']/div/font | \
                                td[@id='tdProductGroupDisplayAltDescription']/div/font").extract()
            if not descriptions:
                print("%s - ERROR! NO NAME!" % response.url)
                continue
            title = replace_tags(descriptions[0])

            price_texts = row.select("td[@id='tdProductGroupDisplayPricing']//text() | \
                                 td[@id='tdProductGroupDisplayAltPricing']//text()").extract()
            if not price_texts:
                print("%s - ERROR! NO PRICE!" % response.url)
                continue
            # Only the part of the first text node before the first comma
            # is kept.
            price_text = price_texts[0].split(',')[0]

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier',
                             unicode(title).encode('ascii', 'ignore'))
            loader.add_value('name', title)
            # Products have no page of their own; the listing URL is used.
            loader.add_value('url', response.url)
            loader.add_value('price', price_text)
            yield loader.load_item()
Beispiel #29
0
    def parse(self, response):
        """Crawl category and pager links and scrape the gallery products.

        Three kinds of output are produced:

        * When the first ``dt`` of the secondary navigation reads
          "by category", a request for every ``dd`` entry, with
          ``self.pager_url_arguments`` appended to its URL.  Iteration stops
          after the entry whose class is ``last``.
        * A request for every link inside the ``pagingTools`` span.
        * One ``Product`` for every ``galleryProduct`` list item that has a
          name, a URL and a price; incomplete entries are reported on
          stdout and skipped.

        All requests use the spider's default callback.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Categories
        categories_title = hxs.select('//div[@id="secondNav"]/div[@class="catList"]/dl/dt[1]/text()').extract()
        if categories_title and categories_title[0].strip().lower() == "by category":
            for link in hxs.select('//div[@id="secondNav"]/div[@class="catList"]/dl/dd'):
                hrefs = link.select(".//a/@href").extract()
                if hrefs:
                    # An entry without an anchor is skipped instead of
                    # aborting the callback with an IndexError.
                    url = urljoin_rfc(base_url, hrefs[0])
                    url += self.pager_url_arguments
                    yield Request(url)
                link_class = link.select("@class").extract()
                if link_class and link_class[0] == "last":
                    break

        # Pagination.  The attribute test must be ``@class``; a misspelt
        # attribute name never matches, so no pager link would be followed.
        for href in hxs.select('//span[@class="pagingTools"]/a/@href').extract():
            yield Request(urljoin_rfc(base_url, href))

        # Products
        for product_el in hxs.select('//li[@class="galleryProduct"]'):
            name = product_el.select('div[@class="galleryContainer"]/a/span/text()').extract()
            if not name:
                print("ERROR!! NO NAME!! %s" % response.url)
                continue
            # Keep only the text before the " - Home Delivered" suffix, as
            # a string rather than the list that ``split`` returns.
            name = name[0].split(" - Home Delivered")[0]

            url = product_el.select('div[@class="galleryContainer"]/a/@href').extract()
            if not url:
                print("ERROR!! NO URL!! %s" % response.url)
                continue
            url = urljoin_rfc(base_url, url[0])

            price = product_el.select(
                'div[@class="galleryContainer"]/div[@class="productInfo"]/\
                 div[@class="productPriceBlock"]/p/span[@class="nowPrice"]/strong/text() |\
                 div[@class="galleryContainer"]/div[@class="productInfo"]/\
                 div[@class="productPriceBlock"]/p/span[@class="onlyPrice"]/text()'
            ).extract()
            if not price:
                print("ERROR!! NO PRICE!! %s" % response.url)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price[0])
            loader.add_value('sku', '')

            yield loader.load_item()
    def parse_mattel_product(self, response):
        """Yield one ``Product`` per SKU option embedded in the page script.

        Page-level data (name, fallback price, SKU, image, category from
        ``response.meta``) is read from the HTML.  The per-option data is
        cut out of the ``skus: {...}, availableSizes`` fragment of the
        response body and turned into JSON by swapping single quotes for
        double quotes and quoting the keys listed in ``mattel_fields``.

        Each option becomes a product whose identifier is
        ``<productId from the URL>_<option id>``.  The name gets the colour
        and size appended unless they are the "one color" / "one size" /
        "one style" placeholders.  Shipping cost is set to 0 when the price
        is above 35.  Nothing is yielded when the product title is missing.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        titles = hxs.select(
            '//div[@class="product-details"]/h2/text()').extract()
        if not titles:
            return
        product_name = titles[0]

        # Page-level price, used for options that carry no price themselves.
        product_price = hxs.select(
            '//*[@id="product-information"]//span[@class="promotion-now"]/text()'
        ).extract()[0]
        product_identifier = response.url.partition('productId=')[2]
        brand = 'Mattel'
        image_url = hxs.select('//*[@id="mainProductImage"]/@src').extract()
        category = response.meta.get('category')
        sku = hxs.select('//span[@class="item-number"]/text()').extract()[0]
        sku = sku.replace('Item #: ', '')

        match = re.search(r'skus: {\s+(.*)},\s+availableSizes', response.body,
                          re.DOTALL | re.IGNORECASE)
        script_object = '{' + match.groups()[0].strip() + '}'
        script_object = script_object.replace("'", '"')

        pieces = []
        for line in script_object.split('\n'):
            if ': "' not in line:
                # Structural line (braces, option ids, ...): keep verbatim.
                pieces.append(line)
                continue
            # Key/value line: kept only when its key is a known field, and
            # then with the key wrapped in double quotes.
            for field in mattel_fields:
                if field + ':' in line:
                    pieces.append(line.replace(field, '"' + field + '"'))
                    break
        options = json.loads(''.join(pieces))

        for option_id, option in options.iteritems():
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier',
                             product_identifier + '_' + option_id)

            price_text = option.get('price').strip()
            if price_text == '':
                price_text = product_price
            price = extract_price(price_text)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('url', response.url)

            name_parts = [product_name]
            if option.get('color').lower().strip() != 'one color':
                name_parts.append(option.get('color'))
            if option.get('size').lower().strip() not in ('one size',
                                                          'one style'):
                name_parts.append(option.get('size'))
            loader.add_value('name', ', '.join(name_parts))

            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('category', category)
            if price > 35:
                loader.add_value('shipping_cost', 0)
            yield loader.load_item()
Beispiel #31
0
    def parse(self, response):
        """Scrape one product page and return the populated item.

        The value of the ``product_minidetail_`` input is used as both
        identifier and SKU; when it is missing a message is logged and
        ``None`` is returned.  A missing name is only logged.  Stock is 1
        when an "Add to basket" span exists, otherwise 0.  A shipping cost
        of 3.6 is added when the price (0 if none was found) is at most 26.
        The ``alt`` texts of promotion images, with ``picto-`` removed and
        joined by commas, are attached as ``metadata['promotions']``.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        product_ids = hxs.select(
            '//div[@id="col2"]//input[contains(@id,"product_minidetail_")]/@value'
        ).extract()
        if not product_ids:
            log.msg('### No product ID at ' + response.url, level=log.INFO)
            return
        loader.add_value('identifier', product_ids[0])
        loader.add_value('sku', product_ids[0])

        titles = hxs.select(
            '//div[@id="col2"]//h1[@class="titre"]/text()').extract()
        if titles:
            loader.add_value('name', titles[0].strip())
        else:
            log.msg('### No name at ' + response.url, level=log.INFO)

        # Price stays 0 when the page shows none; the shipping rule below
        # relies on that default.
        price = 0
        price_texts = hxs.select(
            '//div[@id="col2"]//span[@class="prix"]/text()').extract()
        if price_texts:
            price = extract_price(price_texts[0].strip())
            loader.add_value('price', price)

        in_basket = hxs.select('//div[@id="col2"]//span[text()="Add to basket"]')
        loader.add_value('stock', 1 if in_basket else 0)

        images = hxs.select(
            '//div[@id="col1"]//div[contains(@class,"product")]/img/@src'
        ).extract()
        if images:
            loader.add_value('image_url',
                             urljoin(response.url, images[0].strip()))

        brands = hxs.select(
            '//div[@id="col2"]//td[@class="catName"]/a/text()').extract()
        if brands:
            loader.add_value('brand', brands[0].upper())

        for crumb in hxs.select('//div[@id="breadcrumb"]/h2/a/text()').extract():
            loader.add_value('category', crumb)

        if price <= 26:
            loader.add_value('shipping_cost', 3.6)

        product = loader.load_item()

        metadata = YMeta()
        promo_alts = hxs.select(
            '//div[@id="col2"]//div[@class="promo"]/img/@alt').extract()
        if promo_alts:
            metadata['promotions'] = ','.join(
                alt.replace('picto-', '') for alt in promo_alts)
        if metadata:
            product['metadata'] = metadata

        return product
Beispiel #32
0
    def parse_product(self, response):
        """Scrape a product page, then request its first page of reviews.

        Most fields are located through a ladder of alternative XPaths; the
        first expression that matches anything wins.  The identifier is the
        ``product_id`` form value, or the last URL path segment, with any
        query string removed.  Brand comes from ``response.meta['brand']``
        and the category falls back to ``response.meta['category']``.  The
        price is 0 when no price markup is found.  Stock is 0 when any of
        the known out-of-stock markers is present, otherwise 1.

        The item is not yielded directly.  It is attached, together with a
        ``HamiltonMeta`` holding the lower-cased brand and an empty review
        list, to a request for the product's reviews, which is handled by
        ``self.parse_reviews``.
        """

        def first_match(*queries):
            """Return the extracted values of the first XPath that matches."""
            for query in queries:
                found = response.xpath(query).extract()
                if found:
                    return found
            return []

        def stripped_texts(query):
            """Return the non-blank text nodes for ``query``, stripped."""
            texts = (text.strip() for text in response.xpath(query).extract())
            return [text for text in texts if text]

        loader = ProductLoader(item=Product(), response=response)

        product_id = response.xpath(
            '//form[@name="SelectProductForm"]/input[@name="product_id"]/@value'
        ).extract()
        identifier = product_id[0] if product_id else response.url.split('/')[-1]
        loader.add_value('identifier', identifier.split('?')[0])

        name = (stripped_texts('//h1[@itemprop="name"]//text()')
                or stripped_texts('//h1[contains(@class,"product-name")]//text()'))
        if name:
            loader.add_value('name', name[0])

        loader.add_value('brand', response.meta['brand'])

        categories = first_match(
            '//div[@itemprop="breadcrumb"]//span[@itemprop="title"]/text()',
            '//div[@itemprop="breadcrumb"]//span[@itemprop="name"]/text()',
        )
        if categories:
            loader.add_value('category', categories[0])
        elif 'category' in response.meta:
            loader.add_value('category', response.meta['category'])

        sku = first_match(
            '//table[@class="SpecTable"]//td[text()="Model No.:"]/following-sibling::td/text()',
            '//div[@class="specs-table"]//td[text()="Model No.:"]/following-sibling::td/text()',
            '//div[contains(@class, "specs-table")]//td[text()="Model No.:"]/following-sibling::td/text()',
        )
        if sku:
            loader.add_value('sku', sku[0].strip())
        loader.add_value('url', response.url)

        price = first_match(
            '//div[@id="WM_PRICE"]//*[contains(@class,"camelPrice")]/span/text()',
            '//div[@class="onlinePriceMP"]//*[contains(@class,"camelPrice")]/span/text()',
            '//div[@itemprop="offers"]/div[contains(@class, "product-price")]//*[@itemprop="price"][1]//text()',
            '//div[@class="col5"]//div[contains(@class,"product-buying-table-row")][1]//div[contains(@class,"price-display")][1]//text()',
            '//*[@itemprop="price"]//text()',
        )
        if not price:
            # Last resort: only the first data attribute is used, whereas
            # the text-based matches above are concatenated.
            attr_price = response.xpath('//@data-product-price').extract_first()
            price = [attr_price] if attr_price else []
        price = ''.join(price).strip() if price else '0'
        loader.add_value('price', extract_price(price))

        out_of_stock_markers = (
            '//div[@id="OnlineStat" and @class="OutOfStock"]',
            '//p[@class="price-oos" and text()="Out of stock"]',
            '//div[@id="OnlineStat" and @class="OnlineNotSold"]',
        )
        out_stock = any(response.xpath(query) for query in out_of_stock_markers)
        loader.add_value('stock', 0 if out_stock else 1)

        image = first_match(
            '//div[@class="LargeItemPhoto215"]//img/@src',
            '//div[contains(@class,"product-images")][1]//img/@src',
        )
        if image:
            loader.add_value('image_url', image[0])

        product = loader.load_item()
        metadata = HamiltonMeta()
        # The brand field may be absent from the item (e.g. an empty brand
        # in the request meta); fall back to '' instead of raising KeyError.
        metadata['brand'] = (product.get('brand') or '').strip().lower()
        metadata['reviews'] = []
        product['metadata'] = metadata

        # The reviews URL is keyed by the last path segment of the product
        # URL, up to its first dot.
        productid = response.url.split('/')[-1].split('.')[0]
        url = 'https://www.walmart.com/reviews/product/%s?page=1' % productid
        yield Request(url,
                      meta={
                          'product': product,
                          'page': 1,
                          'productid': productid
                      },
                      callback=self.parse_reviews)
Beispiel #33
0
    def parse_product(self, response):
        """Expand colour/size options, then scrape the selected variant.

        The page is handled in up to three passes, each one re-posting the
        page's main form to this same callback:

        1. If a colour dropdown exists and nothing is selected, one POST is
           issued per colour.
        2. If a size dropdown exists and nothing is selected, one POST is
           issued per size, carrying over the colour from
           ``response.meta`` when there is one.
        3. Otherwise a single ``Product`` is yielded.  Its identifier is the
           product code followed by the selected colour and size ids, and
           its name is extended by the selected colour and size labels.
           A price embedded in the colour label (after a pound sign) takes
           precedence over the page price.

        Shipping cost is 4.95 for prices below 50 and 0 otherwise.
        """
        hxs = HtmlXPathSelector(response)

        category = hxs.select('//div[@id="bCrumb"]/span/a/text()').extract()
        category = category[-1] if category else response.meta.get(
            'category', '')

        def form_fields():
            """Return the name/value pairs of every input in the main form."""
            fields = {}
            for field in hxs.select('//form[@id="frmMain"]//input'):
                field_name = ''.join(field.select('@name').extract())
                field_value = ''.join(field.select('@value').extract())
                fields[field_name] = field_value
            return fields

        def form_request(formdata, meta):
            """Build the POST that re-submits the main form to this callback."""
            form_url = hxs.select(
                '//form[@id="frmMain"]/@action').extract()[0]
            return FormRequest(form_url,
                               dont_filter=True,
                               method='POST',
                               formdata=formdata,
                               callback=self.parse_product,
                               meta=meta)

        colours = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@value!="0"]/@value'
        ).extract()
        no_option_selected = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@value="0" and @selected]/@value'
        )
        if colours and no_option_selected:
            for colour in colours:
                formdata = form_fields()
                formdata['ctl00$cphMain$ddlColour'] = colour
                yield form_request(formdata, {
                    'category': category,
                    'colour': colour
                })
            return

        sizes = hxs.select(
            '//select[@id="cphMain_ddlSize"]/option[@value!="0"]/@value'
        ).extract()
        no_option_selected = hxs.select(
            '//select[@id="cphMain_ddlSize"]/option[@value="0" and @selected]')
        if sizes and no_option_selected:
            chosen_colour = response.meta.get('colour', None)
            for size in sizes:
                formdata = form_fields()
                formdata['ctl00$cphMain$ddlSize'] = size
                if chosen_colour:
                    formdata['ctl00$cphMain$ddlColour'] = chosen_colour
                yield form_request(formdata, {
                    'category': category,
                    'formdata': formdata
                })
            return

        loader = ProductLoader(item=Product(), selector=hxs)

        identifier = hxs.select('//div[@class="code"]/text()').extract()[0]
        loader.add_xpath('sku', '//div[@class="code"]/text()')
        loader.add_value('url', response.url)
        product_name = hxs.select(
            '//div[@class="title"]//h1/text()').extract()[0]

        selected_label = hxs.select(
            '//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
        if selected_label:
            product_name = product_name + ' - ' + selected_label[0].strip()

        loader.add_value('category', category)
        img = hxs.select('//img[@id="cphMain_imgThumb"]/@src').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

        loader.add_xpath('brand', '//span[@class="brand"]/text()')
        loader.add_value('stock', '1')

        price = hxs.select('//span[@class="price"]/text()').extract()

        option_price = None
        if colours or sizes:
            selected_colour = hxs.select(
                '//select[@id="cphMain_ddlColour"]/option[@selected and @value!="0"]'
            )
            if selected_colour:
                colour_id = selected_colour.select('@value').extract()[0]
                colour_desc = selected_colour.select('text()').extract()[0]
                identifier = identifier + '-' + colour_id
                # The option label may end in " - £<price>"; keep only the
                # colour part for the name and remember the price.
                product_name = product_name + ' - ' + colour_desc.split(
                    u' - \xa3')[0].strip()
                option_price = re.search(r"\xa3(\d+.\d+)", colour_desc)

            selected_size = hxs.select(
                '//select[@id="cphMain_ddlSize"]/option[@selected and @value!="0"]'
            )
            if selected_size:
                size_id = selected_size.select('@value').extract()[0]
                size_desc = selected_size.select('text()').extract()[0].strip()
                identifier = identifier + '-' + size_id
                product_name = product_name + ' - ' + size_desc

        loader.add_value('identifier', identifier)
        loader.add_value('name', product_name.replace(' - Collect Only', ''))
        if option_price:
            loader.add_value('price', option_price.group(1))
        else:
            loader.add_value('price', price)

        # Shipping depends on the price, so it can only be decided once the
        # price has been added to the loader.  An unknown price keeps the
        # standard 4.95 charge.
        final_price = loader.get_output_value('price')
        if final_price is None or final_price < 50:
            loader.add_value('shipping_cost', '4.95')
        else:
            loader.add_value('shipping_cost', '0')

        yield loader.load_item()
Beispiel #34
0
    def parse_product(self, response):
        """Yield a ``Product`` built from the page's ``itemprop`` markup.

        Name and price come from the ``itemprop="name"`` heading and the
        last ``itemprop="price"`` span.  The second-to-last segment of the
        URL is used as both SKU and identifier.  A missing name or price is
        reported on stdout and nothing is yielded.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url

        names = selector.select('//h1[@itemprop="name"]/text()').extract()
        if not names:
            print("ERROR!! NO NAME!! %s" % page_url)
            return

        prices = selector.select('//span[@itemprop="price"]/text()').extract()
        if not prices:
            print("ERROR!! NO PRICE!! %s" % page_url)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', names[0])
        # When several price nodes exist, the last one is the one used.
        loader.add_value('price', prices[-1])

        code = response.url.split('/')[-2]
        loader.add_value('sku', code)
        loader.add_value('identifier', code)

        yield loader.load_item()
Beispiel #35
0
    def parse_product(response):
        """Yield a ``Product`` scraped from a product detail page.

        The ``sku`` input value is used as both identifier and SKU.  The
        category is the list of breadcrumb link texts without the first
        entry; the brand is empty when the page declares none.  A missing
        identifier, name or price raises ``IndexError``.

        NOTE(review): the signature takes no ``self`` although the function
        is indented like a method - presumably it is registered as a static
        callback; confirm against the enclosing class.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def first_text(query):
            """Return the first extracted value for ``query``."""
            return hxs.select(query).extract()[0]

        images = hxs.select(
            '//img[@itemprop="image"]/@data-frz-src').extract()
        sku_value = first_text('//input[@name="sku"]/@value')
        title = first_text('//h1[@itemprop="name"]/text()').strip()
        price = extract_price(first_text(
            '//div[@id="auto_show_prime_price"]/strong/span[contains(@class, "actualPrice")]/text()'
        ))
        crumbs = hxs.select(
            '//ul[@class="b-breadcrumb"]//a/text()').extract()[1:]
        brands = hxs.select('//span[@itemprop="brand"]/text()').extract()
        if brands:
            brand = brands[0]
        else:
            brand = ''

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', sku_value)
        product_loader.add_value('name', title)
        if images:
            product_loader.add_value('image_url', images[0])
        product_loader.add_value('sku', sku_value)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', crumbs)
        product_loader.add_value('brand', brand)
        yield product_loader.load_item()
Beispiel #36
0
    def parse_product(self, response):
        """Yield a LEGO product with metadata, or record a missing price.

        Products whose name starts with "LEGOLAND" are ignored, as are pages
        without a price node.  The SKU is the product reference (with a
        trailing "-lego" cut off) or, when there is none, the product id.
        When ``self.parse_price`` returns a falsy price, a message is
        appended to ``self.errors`` instead of yielding an item.  Stock is
        only set (to 0) when the "available" marker is absent.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        title_parts = hxs.select(
            '//div[@id="primary_block"]/h1/descendant-or-self::text()'
        ).extract()
        name = "".join(part.strip() for part in title_parts)
        if name.startswith("LEGOLAND"):
            return

        crumbs = hxs.select('//div[@class="breadcrumb "]/a/text()').extract()
        # The last breadcrumb link is the category.
        category = crumbs[-1] if crumbs else ""

        pid = hxs.select('//input[@name="id_product"]/@value').extract()

        sku = hxs.select(
            '//label[@for="product_reference"]/following-sibling::span[1]/text()'
        ).extract()
        if not sku:
            sku = pid
        elif sku[0].endswith("-lego"):
            sku = sku[-1][0:-5]

        try:
            price_nodes = hxs.select(
                '//p[@class="our_price_display"]/strong/span/text()')
            price = self.parse_price(price_nodes[-1].extract())
        except IndexError:
            return

        stock = hxs.select('//p[@id="pQuantityAvailable"]/span[@class="yes"]')

        if not price:
            self.errors.append("No price set for url: '%s'" %
                               urljoin(base_url, response.url))
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         '//div[@id="image-block"]/span/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', 'LEGO')
        if not stock:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())
Beispiel #37
0
    def parse_product(self, response):
        """Yield a LEGO product, with metadata, scraped from a detail page.

        The price text is taken from the fourth paragraph of the detail box,
        or from the third when the fourth has no text.  The currency suffix
        and spaces are removed and the decimal comma is replaced before
        parsing.  A leading "22" is cut from the order number to form the
        SKU.  The identifier is whatever follows ``volba=`` in the buy
        form's action.  Stock is set to 0 unless the availability image's
        ``alt`` is "Skladem".  A shipping cost of 100 is added for prices up
        to 3000.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        name = hxs.select('//*[@id="content"]//h3/text()').extract()[0].strip()

        price_texts = hxs.select(
            '//*[@id="content"]/div[1]/div[2]/p[4]/text()').extract()
        if not price_texts:
            price_texts = hxs.select(
                '//*[@id="content"]/div[1]/div[2]/p[3]/text()').extract()
        price_text = price_texts[0].strip()
        price_text = price_text.replace(u' K\u010d', '')
        price_text = price_text.replace(',', '.').replace(' ', '')
        price = extract_price(price_text)

        sku = hxs.select(
            "//p[contains(text(),'Objednac') and contains(text(),'slo:')]/following::p[1]/text()"
        ).extract()[0]
        if sku.startswith('22'):
            sku = sku[2:]

        form_action = hxs.select(
            '//div[@class="detail-koupit"]/form/@action').extract()[0]
        identifier = form_action.partition('volba=')[2]

        availability = hxs.select(
            '//*[@id="content"]/div[1]/div[2]/div[1]/img/@alt'
        ).extract()[0].strip()
        category = hxs.select('//*[@id="content"]/h2/text()').extract()
        image_path = hxs.select(
            '//div[@id="content"]//a[@class="highslide"]/img/@src'
        ).extract()[0]
        image_url = 'http://www.mikifun.cz' + image_path

        for field, value in (('url', response.url),
                             ('name', name),
                             ('image_url', image_url),
                             ('price', price),
                             ('sku', sku),
                             ('identifier', identifier),
                             ('brand', 'LEGO')):
            loader.add_value(field, value)

        if category:
            loader.add_value('category', category[0])

        if availability != 'Skladem':
            loader.add_value('stock', 0)

        if int(price) <= 3000:
            loader.add_value('shipping_cost', 100)

        yield self.load_item_with_metadata(loader.load_item())
 def parse_product(self, response):
     """Return the product only if its manufacturer part number matches.

     Name and price are read from the page; URL, SKU and identifier (the
     lower-cased SKU) come from the response.  ``None`` is returned for
     non-HTML responses and when the price or the name is missing.
     Otherwise the second text node of the "Mfg Part" specs paragraph is
     compared with ``response.meta['mfrgid']`` and looked up among the
     space-separated words of ``response.meta['name']``; the item is
     returned when either check succeeds.
     """
     if not isinstance(response, HtmlResponse):
         return
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath(
         'price',
         '//div[@class="club"]/span[@itemprop="Price"]/text()',
         re=r'.*\$(.*[0-9])')
     loader.add_value('url', response.url)
     sku = response.meta['sku']
     loader.add_value('sku', sku)
     loader.add_value('identifier', sku.lower())

     if not loader.get_output_value('price'):
         return
     mfrgid = response.meta['mfrgid']
     if not loader.get_output_value('name'):
         return

     spec_texts = hxs.select(
         u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()'
     ).extract()
     site_mfrgid = None
     if len(spec_texts) >= 2:
         site_mfrgid = spec_texts[1]
     name_words = response.meta['name'].split(' ')
     if not site_mfrgid:
         return
     if mfrgid == site_mfrgid.strip() or site_mfrgid in name_words:
         return loader.load_item()
Beispiel #39
0
    def parse_product(self, response):
        """Yield a ``Product`` keyed by the ``rid`` found in the page body.

        The value following ``"rid":`` in the raw response body is used as
        both identifier and SKU; pages without it yield nothing.  The price
        comes from the ``productPrice`` span or, failing that, from the
        ``og:price:amount`` meta tag.  Categories are taken from
        ``response.meta['categories']``.  Stock is set to 0 only when the
        "sold-out" marker is present.
        """
        base_url = get_base_url(response)

        images = response.xpath('//div[@id="productImages"]//img[@itemprop="image"]/@src').extract()

        loader = ProductLoader(item=Product(), response=response)
        rids = re.findall('"rid":(.*)};', response.body)
        if not rids:
            return
        rid = rids[0]

        loader.add_value('identifier', rid)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        if images:
            # The image source is prefixed with an explicit scheme.
            loader.add_value('image_url', 'http:' + images[0])
        loader.add_value('sku', rid)

        price = (response.xpath('//span[@id="productPrice"]/span/text()').extract()
                 or response.xpath('//meta[@property="og:price:amount"]/@content').extract())
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_value('category', response.meta['categories'])
        loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')

        if response.xpath('//div[@id="productOptions"]/span[@class="sold-out"]'):
            loader.add_value('stock', 0)

        yield loader.load_item()
Beispiel #40
0
    def parse_product_list(self, response):
        """Follow sub-category links, or scrape a product and its options.

        When the page lists sub-categories, every ``.htm``/``.html`` link is
        requested with this same callback (other links, e.g. PDFs, are
        ignored).  Otherwise the page is treated as a product page: each
        option dropdown is turned into a list of ``("<name>:<label>",
        "<surcharge>")`` pairs and one ``Product`` is yielded per entry
        produced by ``multiply(opt_groups)``, with the option text appended
        to the name and the surcharge added to the base price.

        NOTE(review): ``multiply`` is defined elsewhere; presumably it
        yields one combined ``(name, price)`` pair per option combination -
        confirm, including what it yields when there are no dropdowns.
        """
        hxs = HtmlXPathSelector(response)

        cats = hxs.select(
            u'//div[@id="RightColumn"]/table/tr/td/center/div[@class="contentsName"]/a/@href'
        ).extract()
        if cats:
            base_url = get_base_url(response)
            for url in cats:
                if url.split('.')[-1].lower() not in ('htm', 'html'):
                    # Contains links to PDFs as well
                    continue
                yield Request(urljoin_rfc(base_url, url),
                              callback=self.parse_product_list)
            return

        def fix_options(what, o):
            """Return ``(label, surcharge)`` for one split option text.

            ``o`` is the option text split at "("; the second part, when
            present, is the surcharge.  Options without one cost '0'.
            """
            label = what + ':' + o[0]
            try:
                return (label, o[1].replace(',', ''))
            except IndexError:
                return (label, '0')

        opt_groups = []
        for option in hxs.select(u'//div[@class="eyOptions"]//select'):
            what = option.select(u'./@name').extract()[0]
            opt_list = option.select(
                u'./option[@value!="PleaseSelect" and @value!="Please Select"]/text()'
            ).extract()
            opt_list = [o.replace(')', '').split('(') for o in opt_list]
            opt_groups.append([fix_options(what, o) for o in opt_list])

        # Which price markup the page uses does not depend on the option
        # combination, so it is determined once.
        if hxs.select(u'//div[@class="bigSalePrice"]'):
            price_xpath = u'//div[@class="bigSalePrice"]/span/font/text()'
        elif hxs.select(u'//span[@class="bigSalePrice"]'):
            price_xpath = u'//span[@class="bigSalePrice"]/font/text()'
        else:
            price_xpath = u'//div[@class="itemRegPrice"]/span/font/text()'

        for opt_name, opt_price in multiply(opt_groups):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('url', response.url)
            product_loader.add_xpath('name', u'//h1/text()')
            product_loader.add_xpath('price', price_xpath)
            product_loader.add_xpath(
                'sku',
                u'normalize-space(substring-after(//div[@class="code"]/text(),":"))'
            )
            product_loader.add_xpath(
                'category', u'//div[@class="eyBreadcrumbs"]/a[2]/text()')
            product_loader.add_xpath('image_url',
                                     u'//img[@id="SwitchThisImage"]/@src')
            product_loader.add_value('shipping_cost', '')

            product = product_loader.load_item()
            product['name'] = (product['name'] + ' ' + opt_name).strip()
            product['price'] = product['price'] + Decimal(opt_price)
            yield product
    def parse_product(self, response):
        """Yield the product on the page, expanded into one item per variant.

        Three outcomes are possible:

        * the page has a custom-option ``<select>``: one item per non-empty
          option, priced as base price plus the option's ``price`` attribute;
        * otherwise, if a ``new Product.Config(...)`` script is present: one
          item per configurable product id, with the labels and prices of all
          its attribute options accumulated;
        * otherwise a single item for the page itself.

        Items whose loaded price is empty/zero are marked with ``stock`` 0.
        Raises IndexError when the product id or product name is missing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        product_identifier = hxs.select(
            '//input[@name="product"]/@value')[0].extract()
        product_name = hxs.select(
            '//div[@class="product-name"]/span/text()')[0].extract().strip()

        # Prefer the special (discounted) price, then the regular one, then 0.
        base_price = response.xpath(
            '//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not base_price:
            base_price = response.xpath(
                '//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        base_price = extract_price(base_price[0]) if base_price else 0

        image_url = hxs.select('//img[@id="image-main"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        category = hxs.select(
            '//span[@typeof="v:Breadcrumb"]/a/text()').extract()
        category = category[-1] if category else ''
        brand = hxs.select(
            '//ul[@id="productDetailsList"]/li[contains(text(),"Manufactured")]/text()'
        ).re('Manufactured by: (.*)')

        options = hxs.select(
            '//select[@class=" required-entry product-custom-option"]/option')
        # Raw string: the pattern contains regex escapes (\( and \)).
        data_config = response.xpath('//script/text()').re(
            r'new Product.Config\((.+)\);')

        if options:
            for option in options:
                identifier = option.select('./@value').extract()
                if not identifier or identifier[0] == '':
                    # Placeholder entry such as "-- Please Select --".
                    continue
                identifier = identifier[0]
                option_name = option.select('./text()').extract()[0]
                # Drop the "+£x.xx" surcharge suffix from the visible label.
                option_name = option_name.split(u'+\xa3')[0].strip()
                name = product_name + " " + option_name
                price = extract_price(option.select('@price').extract()[0])

                identifier = product_identifier + "-" + identifier
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('identifier', identifier)
                loader.add_value('sku', product_identifier)
                loader.add_value('price', base_price + price)
                # NOTE(review): brand is left blank here although it is
                # extracted above and used for the other item kinds - confirm.
                loader.add_value('brand', '')
                loader.add_value('url', response.url)
                loader.add_value('name', name)
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                if not loader.get_output_value('price'):
                    loader.add_value('stock', 0)
                yield loader.load_item()
            return

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', product_identifier)
        loader.add_value('url', response.url)
        loader.add_value('name', product_name)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('price', base_price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        item = loader.load_item()

        if data_config:
            data = json.loads(data_config[0])['attributes']
            # product id -> accumulated label and surcharge over all attributes
            products = dict()
            for attribute in sorted(data):
                for option in data[attribute]['options']:
                    for product in option['products']:
                        if product not in products:
                            products[product] = {
                                'label': option['label'],
                                'price': extract_price(option['price']),
                            }
                        else:
                            products[product]['label'] += ' ' + option['label']
                            products[product]['price'] += extract_price(
                                option['price'])
            for product in products:
                loader = ProductLoader(item=Product(), response=response)
                # Start from the base item, then specialise it per variant.
                loader.add_value(None, item)
                loader.add_value('name', products[product]['label'])
                loader.replace_value('identifier',
                                     product_identifier + '-' + product)
                loader.replace_value('sku', product)
                loader.replace_value('price',
                                     base_price + products[product]['price'])
                yield loader.load_item()
            return

        yield item
Beispiel #42
0
    def parse_product(self, response):
        """Parse an eBay item page into one product, or one per variation.

        If the response is a result listing (``div#ResultSetItems``) it is
        delegated to ``self.parse``. Pages without an item title yield
        nothing. The price is taken from the first source that has one:
        ``#prcIsum``, ``#mm-saleDscPrc``, then the ``binPrice`` / ``bidPrice``
        values embedded in the page source; a page with no price at all is
        logged and skipped. When the embedded variation map is present, one
        product per variation with trait values is yielded (own identifier,
        name suffix and price); otherwise the single base product is yielded.
        """
        hxs = HtmlXPathSelector(response)
        if hxs.select('//div[@id="ResultSetItems"]'):
            for x in self.parse(response):
                yield x
            return

        first_name = ' '.join(
            hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
        if not first_name:
            return

        identifier = response.url.split('?')[0].split('/')[-1]

        breadcrumbs = hxs.select(
            '//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract()
        category = breadcrumbs[-1] if breadcrumbs else ''
        seller_id = ''.join(
            hxs.select('.//*[@class="si-content"]'
                       '//a/*[@class="mbg-nw"]/text()').extract())
        brand_values = hxs.select(
            '//*[@class="attrLabels" and contains(text(), "Brand")]'
            '/following-sibling::*/text()').extract()
        brand = brand_values[0].strip() if brand_values else ''

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('name', first_name)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('category', category)
        product_loader.add_value('dealer', 'eBay - ' + seller_id)
        product_loader.add_value('brand', brand)
        product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
        product_loader.add_value('url', response.url)

        price = None
        for price_xpath in ('//*[@id="prcIsum"]/text()',
                            '//*[@id="mm-saleDscPrc"]/text()'):
            found = hxs.select(price_xpath).extract()
            if found:
                price = found[0].strip()
                break
        if price is None:
            for key in ('binPrice', 'bidPrice'):
                # The prefix must be lazy and stay inside the quoted value;
                # a greedy ".*" leaves only one character for the group.
                match = re.search(r'"%s":"[^"]*?([\d\.,]+)",' % key,
                                  response.body)
                if match:
                    price = match.group(1)
                    break
        if price is None:
            self.log('No price found for %s' % response.url)
            return
        product_loader.add_value('price', extract_price(price))

        # Shipping cost is optional: it is only set when the section exists.
        shipping = hxs.select(
            '//*[@id="shippingSection"]//td/div/text()').extract()
        if shipping and shipping[0]:
            if 'free' in shipping[0].lower():
                product_loader.add_value('shipping_cost', 0)
            else:
                try:
                    product_loader.add_value('shipping_cost',
                                             extract_price(shipping[0]))
                except Exception:
                    # An unparseable shipping text must not lose the product.
                    self.log('Could not parse shipping cost on %s'
                             % response.url)

        product_ = product_loader.load_item()

        options_variations = []

        # Strip HTML-escaped quotes before searching for the variation JSON.
        sel = HtmlXPathSelector(text=response.body.replace('&quot;', ''))
        var_map = sel.select('//*/text()').re(r'("menuItemMap":{.*}.*),'
                                              '"unavailableVariationIds"')
        if var_map:
            json_var_map = unicode(var_map[0])
            variations = json.loads(
                '{' +
                re.sub(r',"unavailableVariationIds".*', '', json_var_map) +
                '}')

            menu_map = variations['menuItemMap']

            for key, variation in variations['itemVariationsMap'].items():
                if variation['traitValuesMap']:
                    # Resolve each trait's menu id to its display name.
                    new_variation = {}
                    for option, value in variation['traitValuesMap'].items():
                        new_variation[option] = menu_map[str(
                            value)]['displayName']
                    options_variations.append({
                        'price': variation['price'],
                        'values': new_variation,
                        'identifier': '%s:%s' % (identifier, key)
                    })

        if not options_variations:
            yield product_
            return

        for model in options_variations:
            model_name = first_name + ' ' + ' '.join(
                opt_name.strip().lower()
                for opt_name in model['values'].values())
            new_product = Product(product_)
            new_product['name'] = model_name
            new_product['identifier'] = model['identifier']
            new_product['price'] = extract_price(model['price'])
            yield new_product
    def parse_product(self, response):
        """Queue variant pages and yield the product shown on this page.

        Every size/colour option URL is requested with this same callback.
        If the brand/part-number paragraph is missing, the page is
        re-requested (up to 10 times, tracked in ``meta['retry']``) and
        nothing is yielded. A product is yielded only when the page has a
        SKU or has no variant options, and never for discontinued items.
        Orders under 50 are given a shipping cost of 4.50, otherwise 0.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        options = hxs.select('//select[@id="variant-select-size"]/option[text()!="-- Please select --"]/@value').extract()
        options += hxs.select('//select[@id="variant-select-colour"]/option[text()!="-- Please select --"]/@value').extract()

        for option in options:
            yield Request(urljoin_rfc(base_url, option),
                          callback=self.parse_product)

        sku_parts = hxs.select('//p[@id="brandAndPartNos"]/text()').extract()
        if not sku_parts:
            # The paragraph is missing: retry the same URL a bounded number
            # of times, bypassing the duplicate filter.
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                new_meta = response.meta.copy()
                new_meta['retry'] = retry + 1
                yield Request(response.url, meta=new_meta,
                              callback=self.parse_product, dont_filter=True)
            return
        sku = sku_parts[-1].strip()

        if not sku and options:
            # A page with variants but no SKU yields nothing itself.
            return

        if hxs.select('//p[contains(@class, "stock")]/span[@class="discontinued"]'):
            # Does not include discontinued items
            return

        product_id = hxs.select('//input[@name="productId"]/@value').extract()[0]
        name = hxs.select('//h1[@class="skuHeading"]/strong/text()').extract()[0]
        ext_name = ' '.join(hxs.select('//h1[@class="skuHeading"]/text()').extract()).strip()
        category = hxs.select('//div[@class="breadcrumb"]/nav/p/a/text()').extract()[-1]
        image_url = hxs.select('//img[@class="productImageLarge"]/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])
        brand = hxs.select('//img[@class="brandImageMedium"]/@alt').extract()
        brand = brand[0].replace(' logo', '') if brand else ''

        product_name = name + ext_name

        # Prefix the brand unless the name already contains it, either in
        # full or as any single alphabetic word of the name.
        brand_upper = brand.upper()
        brand_in_name = any(
            w.upper() in brand_upper
            for w in re.findall('([a-zA-Z]+)', product_name))
        if brand_upper not in product_name.upper() and not brand_in_name:
            product_name = brand + ' ' + product_name

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('category', category)
        product_loader.add_value('name', product_name)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', product_id)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)

        stock = hxs.select('//span[@class="inStock"]/strong/text()').extract()
        add_button = hxs.select('//input[contains(@class, "ajaxBuyButton")]')
        if stock:
            product_loader.add_value('stock', extract_price(stock[0]))
        elif add_button:
            # No stock figure shown, but the item can be added to the basket.
            product_loader.add_value('stock', 1)
        else:
            product_loader.add_value('stock', 0)

        price = hxs.select('//strong[@id="price_"]/text()').extract()[0]
        price = extract_price(price)
        if price < 50:
            product_loader.add_value('shipping_cost', 4.50)
        else:
            product_loader.add_value('shipping_cost', 0)

        product_loader.add_value('price', price)
        product_loader.add_value('image_url', image_url)
        yield product_loader.load_item()
Beispiel #44
0
    def parse_product(self, response):
        """Yield one product built from the page's heading and current price.

        The name comes from the product-summary heading, falling back to any
        ``<h1>``; the price is the concatenated text of the current-price
        spans. A missing name or price is logged as an error and nothing is
        yielded. The name doubles as the identifier.
        """
        selector = HtmlXPathSelector(response)
        page_url = response.url

        titles = (
            selector.select("//div[@class='primary-content']//div[@id='product-summary']/h1/text()").extract()
            or selector.select('//h1/text()').extract()
        )
        if not titles:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = titles[0]

        price_parts = selector.select("//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()").extract()
        if not price_parts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return

        loader = ProductLoader(item=Product(), response=response)
        fields = (
            ('identifier', title),
            ('name', title),
            ('url', page_url),
            ('price', "".join(price_parts)),
        )
        for field, value in fields:
            loader.add_value(field, value)
        yield loader.load_item()
    def parse_products2(self, response):
        """Queue unseen navigation links and yield the products on the page.

        Links from the left pane and from the sub-level menu are requested
        with this same callback unless already in ``self.visited_urls``.
        Each product box is then read in one of three layouts: a "Tabular"
        table (one product per row), a set of ``prelement`` divs, or a single
        box with a ``prbasket`` div. Prices use the last space-separated
        token, with "." removed and "," turned into ".".
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def build_product(node, name_xpath, href_xpath, price_xpath):
            """Load name, absolute URL and normalised price from ``node``."""
            loader = ProductLoader(item=Product(), selector=node)
            loader.add_xpath('name', name_xpath)
            href = node.select(href_xpath).extract()[0]
            loader.add_value('url', urlparse.urljoin(base_url, href))
            price_text = node.select(price_xpath).extract()[0]
            amount = price_text.split(" ")[-1]
            loader.add_value('price',
                             amount.replace(".", "").replace(",", "."))
            return loader.load_item()

        navigation_xpaths = (
            '//table[@id="tblContent"]//td[@class="leftPane"]//a/@href',
            '//ul[@id="pMenuSublevelsl1"]//a/@href',
        )
        for navigation_xpath in navigation_xpaths:
            for href in hxs.select(navigation_xpath).extract():
                url = urlparse.urljoin(base_url, href)
                if url in self.visited_urls:
                    continue
                yield Request(url, callback=self.parse_products2)
                self.visited_urls.add(url)

        boxes = hxs.select(
            '//div[@id="ShopContent"]//div[@class="plistAreaHeader"]/div')
        for product_box in boxes:
            tabular = product_box.select('.//table[@class="Tabular"]')
            if tabular:
                # Table layout: one product per body row.
                for row in tabular.select("./tbody/tr"):
                    product = build_product(row,
                                            './td[2]/a/text()',
                                            './td[2]/a/@href',
                                            './td[4]/a/text()')
                    if product['url']:
                        yield product
                continue

            elements = product_box.select('.//div[@class="prelement"]')
            if elements:
                # Grid layout: one product per "prelement" div.
                for element in elements:
                    product = build_product(
                        element,
                        './/div[@class="prmain"]/a[1]/text()',
                        './/div[@class="prmain"]/a[1]/@href',
                        './/div[@class="prbasket"]/p[@class="prpri"]/text()')
                    if product['url']:
                        yield product
            elif product_box.select('.//div[@class="prbasket"]'):
                # The box itself is a single product.
                product = build_product(product_box,
                                        './a[1]/text()',
                                        './a[1]/@href',
                                        './/div[@class="prbasket"]/p/text()')
                if product['url']:
                    yield product
Beispiel #46
0
    def parse_search(self, response):
        """Queue pagination links and yield every product in the result list.

        List items without a name are skipped silently; items missing a URL
        or a pounds price are logged as errors and skipped. The price is the
        pounds text, followed by "." and the pence text when present. The
        name doubles as the identifier.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # parse pages
        for page_href in hxs.select("//div[@class='pagination-link']//a/@href").extract():
            yield Request(urljoin_rfc(base_url, page_href),
                          callback=self.parse_search)

        # Shared XPath prefixes, relative to each product <li>.
        link_xpath = "div[@class='product-details']/div[contains(@class, 'product-name')]/h3/a"
        price_xpath = "div[@class='product-details']/div[contains(@class, 'price-spacing')]/p[@class='current-price']"

        # parse products
        for node in hxs.select("//li[contains(@class, 'product')]"):
            names = node.select(link_xpath + "/text()").extract()
            if not names:
                continue
            product_name = names[0]

            hrefs = node.select(link_xpath + "/@href").extract()
            if not hrefs:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, product_name))
                continue
            product_url = urljoin_rfc(base_url, hrefs[0])

            pounds = node.select(price_xpath + "/span[@class='pounds']/text()").extract()
            if not pounds:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, product_name))
                continue
            amount = pounds[0]

            pence = node.select(price_xpath + "/span[@class='pence']/text()").extract()
            if pence:
                amount += "." + pence[0]

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', product_name)
            loader.add_value('name', product_name)
            loader.add_value('url', product_url)
            loader.add_value('price', amount)
            yield loader.load_item()
Beispiel #47
0
    def _parse_compound_product(self, response):
        """Yield one product per priced option of the ``item_number`` select.

        Each option text containing "$" is split on its LAST hyphen into a
        name suffix and a price, so option names that themselves contain
        hyphens no longer break the unpacking. An option without any hyphen
        is treated as a bare price and keeps the main product name. SKUs
        come from the comma-separated "Item Numbers" text and are matched to
        options by position; options beyond the SKU list get no SKU.

        Raises IndexError when the page has no ``h1[itemprop="Name"]`` text.
        """
        hxs = HtmlXPathSelector(response)
        main_name = hxs.select(
            '//h1[@itemprop="Name"]//text()').extract()[0].strip()
        skus = hxs.select('//div[@id="details"]//b[contains(text(),' +
                          '"Item Numbers")]/following-sibling::text()').extract()
        if skus:
            skus = [sku.strip() for sku in skus[0].split(',')]

        options = hxs.select(
            '//select[@id="item_number"]/option[contains(text(), "$")]/text()'
        ).extract()
        for i, option in enumerate(options):
            # rpartition never raises: with no hyphen it returns
            # ('', '', option), i.e. the whole text is the price.
            name, separator, price = option.rpartition('-')
            if separator:
                full_name = main_name + ' ' + name.strip()
            else:
                full_name = main_name

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('name', full_name)
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            if len(skus) > i:
                loader.add_value('sku', skus[i])

            yield loader.load_item()
Beispiel #48
0
    def parse_product(self, response):
        """Yield the product on the page, or one item per product type.

        When ``div#prod-multi-product-types`` is present, each
        ``div.product-type`` inside it becomes its own item (name suffixed
        with the type's heading, identifier and price read from within the
        type's own markup). Otherwise a single item is built from page-level
        fields and yielded only if an identifier was found. The brand is
        always 'Lego'; stock is the first number found in the availability
        text, or 0.

        Raises IndexError when the ``itemprop="name"`` query returns nothing
        or a product type has no ``<h3>`` heading.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select('normalize-space(//*[@itemprop="name"]/text())').extract()[0]
        brand = 'Lego'

        images = hxs.select('//div[@id="prod-media-player"]'
                            '//img/@src').extract()
        image_url = urljoin_rfc(base_url, images[0].strip()) if images else ''

        # The SKU is page-level, so look it up once for every item.
        sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
        if not sku:
            sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()
        sku = sku[0].strip() if sku else None

        stock_xpath = ('//div[contains(@class, "mod-stock-availability")]'
                       '//p/strong/text()')
        page_stock = hxs.select(stock_xpath).re(r'\d+')

        options = hxs.select('//div[@id="prod-multi-product-types"]')

        if options:
            for product in options.select('.//div[@class="product-type"]'):
                opt_name = product.select('.//h3/text()').extract()[0].strip()
                # An absolute "//" path searches the whole document even on a
                # sub-selector, giving every type the first stock figure on
                # the page. Look inside this type first and only fall back
                # to the page-level figure when it has none of its own.
                own_stock = product.select('.' + stock_xpath).re(r'\d+')
                stock_numbers = own_stock or page_stock
                stock = stock_numbers[0] if stock_numbers else 0

                loader = ProductLoader(item=Product(), selector=product)
                if sku is not None:
                    loader.add_value('sku', sku)
                loader.add_xpath('identifier', './/div[contains(@class, "mod-product-code")]/p/text()')
                loader.add_value('name', '%s %s' % (name, opt_name))
                loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
                loader.add_value('image_url', image_url)
                loader.add_value('brand', brand)
                loader.add_value('url', response.url)
                loader.add_xpath('price', './/p[@class="price"]/strong/text()')
                loader.add_value('stock', stock)
                yield loader.load_item()
            return

        # Single product: try the known price locations in order.
        price = ''
        for price_xpath in ('//ul/li/strong[@class="price"]/text()',
                            '//span[@class="now-price"]/text()',
                            '//div[@id="prod-price"]//strong/text()'):
            price = ''.join(hxs.select(price_xpath).extract()).strip()
            if price:
                break

        stock = page_stock[0] if page_stock else 0

        loader = ProductLoader(item=Product(), response=response)
        if sku is not None:
            loader.add_value('sku', sku)
        loader.add_xpath('identifier', '//div[@id="prod-product-code"]/p/text()')
        loader.add_value('name', name)
        loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('stock', stock)

        item = loader.load_item()

        # Items without an identifier are dropped.
        if item.get('identifier'):
            yield item
Beispiel #49
0
    def parse(self, response):
        """Queue the other result pages and yield the tyres listed on this one.

        Each ``schema.org/Product`` article is split into tyre size and model
        name around its brand. Articles are skipped when the title cannot be
        split into exactly two parts, when they are flagged as winter tyres,
        when ``parse_pattern`` cannot read the size, or when
        ``is_product_correct`` rejects the item. Unless
        ``self.price_discount`` is set, 10 is added back to the price to
        undo the site's default two-tyre discount.

        Raises IndexError when an article lacks its brand, title or
        ``data-price`` attribute.
        """
        pages = set(
            response.xpath(
                '//*[contains(@class, "pagination__item")]/a[not(contains(@class, "pagination__current"))]/@href'
            ).extract())
        for page_url in pages:
            yield Request(response.urljoin(page_url), meta=response.meta)

        products = response.xpath(
            '//article[@itemtype="http://schema.org/Product"]')

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            brand = product_el.xpath(
                './/*[@itemprop="brand"]//*[@itemprop="name"]/text()').extract(
                )[0].strip()
            if brand.upper() in brands_substitute:
                brand = brands_substitute[brand.upper()]
            full_name = product_el.xpath(
                './/*[contains(@class, "product__title") and @itemprop="name"]/text()'
            ).extract()[0]
            try:
                # The brand is literal text, not a pattern: escape it so
                # characters such as "." or "+" cannot alter the split.
                tyre_size, name = re.split(re.escape(brand), full_name,
                                           flags=re.I)
            except ValueError:
                self.log(
                    "[[TESTING]] Can not split tyre '%s' with brand '%s'" %
                    (full_name, brand))
                continue
            loader.add_value('name', name)

            winter_tyre = product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="S" and contains(text(), "Winter")]'
            )
            if winter_tyre:
                # Winter tyres are not collected.
                continue

            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = self.get_identifier(product_el)

            out_of_stock = product_el.xpath(
                './/*[@itemprop="availability" and contains(@content, "Out")]'
            )
            if out_of_stock:
                loader.add_value('stock', 0)

            loader.add_value('url', response.url)

            image_url = product_el.xpath(
                './/img[@itemprop="image"]/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))

            loader.add_value('identifier', identifier)
            price = product_el.xpath('@data-price').extract()[0]
            loader.add_value('price', price)

            metadata = MicheldeverMeta()
            res = parse_pattern(tyre_size)
            if not res:
                continue
            width, ratio, rim, load_rating, speed_rating = res
            metadata['aspect_ratio'] = ratio
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = width

            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            xl = bool(
                product_el.xpath(
                    './/*[@class="product__info"]//*[@data-icon="XL"]'))
            metadata['xl'] = 'Yes' if xl else 'No'

            # Run-flat is detected from the title, the RF icon, or an
            # " RFT" marker in the model name.
            run_flat_found = is_run_flat(full_name)
            run_flat = bool(
                product_el.xpath(
                    './/*[@class="product__info"]//*[@data-icon="RF"]'))
            if not run_flat:
                run_flat = ' RFT' in name
            metadata['run_flat'] = (
                'Yes' if run_flat or run_flat_found else 'No')

            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            try:
                fuel, grip, noise = product_el.xpath(
                    './/li[contains(@class, "product__meta-item--")]/text()'
                ).extract()
            except ValueError:
                # Label values are used only when exactly three are listed.
                fuel, grip, noise = ('', '', '')

            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise

            product = loader.load_item()
            # The website is defaulting to 2 tyres with a discount of £10
            if product.get('price') and (not self.price_discount):
                product['price'] += Decimal('10')
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata'][
                'mts_stock_code'] = self.find_mts_stock_code(product)

            yield product
Beispiel #50
0
    def parse_product(response):
        """Yield one product item per size/colour variation of a product page.

        Name, SKU, price, brand and main image are scraped from the page and
        the category is taken from ``response.meta['category']``.  When the
        page has size and/or colour inputs, one item is produced for every
        combination: the variation ids are appended to the SKU (joined with
        ``_``) to form the identifier and the variation labels are appended
        to the name.  Without variations a single item identified by the bare
        SKU is produced.

        Raises IndexError when the SKU or the price cannot be found.
        """
        # NOTE(review): defined without ``self`` -- presumably a staticmethod
        # or a nested callback; confirm against the enclosing scope.
        hxs = HtmlXPathSelector(response)

        product_name = ''.join(
            hxs.select('//h1[@itemprop="name"]/text()').extract()).strip()
        sku = hxs.select('//span[@itemprop="productID"]/text()').extract()[0]
        img = hxs.select('//img[@class="default-image"]/@src').extract()
        category = response.meta.get('category')
        raw_price = hxs.select(
            '//*[@id="product-price"]//span[@itemprop="price"]/text()'
        ).extract()[0]
        price = extract_price(raw_price)
        brand_texts = hxs.select(
            '//*[@id="product-right-col"]//span[@itemprop="brand"]/text()'
        ).extract()
        brand = brand_texts[0] if brand_texts else ''

        size_inputs = hxs.select('//*[@id="100000000045"]//input')
        colour_inputs = hxs.select('//*[@id="100000000046"]//input')

        # Every variation is an [id, label] pair.
        size_variations = [
            [node.select('./@value').extract()[0],
             node.select('./following-sibling::label/span/text()').extract()[0]]
            for node in size_inputs
        ]
        colour_variations = []
        for node in colour_inputs:
            colour_id = node.select('./@value').extract()[0]
            onclick = node.select('./@onclick').extract()[0]
            # The label is the last single-quoted chunk of the onclick handler.
            quoted = re.findall("(?sim)'(.*?)'", onclick)
            colour_variations.append([colour_id, quoted[-1]])

        # Normalise every case to an iterable of tuples of variations, so that
        # a single loop can build the items.  The empty tuple stands for
        # "no variations": identifier and name are left untouched.
        if size_inputs and colour_inputs:
            combos = itertools.product(size_variations, colour_variations)
        elif colour_inputs:
            combos = [(variation,) for variation in colour_variations]
        elif size_inputs:
            combos = [(variation,) for variation in size_variations]
        else:
            combos = [()]

        image_url = None
        if img:
            image_url = urljoin_rfc(get_base_url(response), img[0])

        for combo in combos:
            id_suffix = ''.join('_' + variation[0] for variation in combo)
            label_suffix = ''.join(' ' + variation[1] for variation in combo)

            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', sku + id_suffix)
            loader.add_value('sku', sku)
            loader.add_value('url', response.url)
            loader.add_value('name', product_name + label_suffix)
            loader.add_value('price', price)
            # The loader's processed price is fed to the shipping lookup and
            # reused as the input price of the next variation.
            price = loader.get_output_value('price')
            loader.add_value('shipping_cost', get_shipping_cost(price))
            loader.add_value('brand', brand)
            if image_url is not None:
                loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            yield loader.load_item()
Beispiel #51
0
    def parse_page(self, response):
        """Parse a product page, follow its colour/size variants and yield the item.

        Price, brand and SKU come from the JSON stored in the
        ``data-ec-product`` attribute of the ``prod-detail`` div; the
        remaining fields are scraped from the markup.  A request back into
        this callback is issued for every colour and size link on the page.
        The item itself is yielded when the page has no variant links at all,
        or when it has variant links and one of the options is marked active.

        Raises IndexError when the product JSON, the title or both stock
        elements are missing from the page.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        identifier = hxs.select('//input[@name="productCodePost"]/@value').extract()
        if not identifier:
            identifier = hxs.select('//input[@name="productCode"]/@value').extract()

        # Labels of the currently selected size option(s) and colour swatch(es).
        active_sizes = hxs.select('//li[contains(@class, "classification-option") and contains(@class, "active")]/a/text()').extract()
        active_swatches = hxs.select('//li[contains(@class, "swat-list-item-active")]/a/@title').extract()
        active_options = active_sizes + active_swatches

        item_dict = json.loads(hxs.select('//div[@class="prod-detail"]/@data-ec-product').extract()[0])

        price = item_dict['price']
        brand = item_dict['brand']
        sku = item_dict['id']
        image_url = hxs.select('/html/head/meta[@property="og:image"]/@content').extract()
        if image_url:
            image_url = image_url[0]
        try:
            stock = hxs.select('//div[@itemprop="availability"]/text()').extract()[0]
        except IndexError:
            # No availability microdata: fall back to the in-stock banner.
            stock = hxs.select('//div[@class="in-stock"]/text()').extract()[0]
        # Skip the first and last breadcrumb entries.
        categories_list = hxs.select('//div[@class="breadcrumb hfma"]/ul/li//span[@itemprop="title"]/text()')[1:-1].extract()

        loader = ProductLoader(item=Product(), response=response)
        title = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
        if active_options:
            # Append the selected size to the title unless it already ends with it.
            size = ' '.join(active_sizes)
            if not title.upper().endswith(size.strip().upper()):
                title += ' ' + size

        loader.add_value('name', title)
        loader.add_value('price', price)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('category', categories_list)
        loader.add_value('image_url', image_url)
        loader.add_value('url', response.url)
        if 'In Stock Online' not in stock:
            loader.add_value('stock', 0)
        loader.add_value('identifier', identifier)
        item = loader.load_item()

        colors = hxs.select('//div[@class="prod-var-group"]/ul[@class="swat-list swat-list-colour var-change-list"]/li/a/@href').extract()
        for color in colors:
            yield Request(urljoin_rfc(base_url, color), callback=self.parse_page)

        sizes = hxs.select('//li[contains(@class, "classification-option")]/a/@href').extract()
        for size in sizes:
            yield Request(urljoin_rfc(base_url, size), callback=self.parse_page)

        # With variant links the page is only a concrete product when an
        # option is active; without any variant links it always is.
        if active_options or not (colors or sizes):
            yield item
Beispiel #52
0
    def parse_product(self, response):
        """Parse a product page and yield either a sellers request or the item.

        Nothing is yielded when the page carries the ``anchorUnavailable``
        link.  When seller links are present, a request for the sellers
        listing (handled by ``self.parse_sellers``) is yielded with the
        scraped fields in ``meta``; otherwise a single item is built from the
        page itself.

        Raises IndexError when the title, manufacturer, breadcrumb or SKU
        cell is missing from the page.
        """
        hxs = HtmlXPathSelector(response)

        if hxs.select('//a[@id="anchorUnavailable"]'):
            return

        sellers_url = 'http://www.rakuten.com/PR/SellerListingsAjax.aspx?sku=%s'
        name = hxs.select('//div[@id="product-title"]/h1/text()').extract()[0]

        # The largest number in the title is used as the SKU; anything below
        # 100 is rejected in favour of the "Mfg Part#" cell.
        numbers = [int(digits) for digits in re.findall(r"\d+", name)]
        sku = max(numbers) if numbers else 0
        if sku < 100:
            sku = ''.join(
                hxs.select('//th[contains(text(), "Mfg Part#")]/../td/text()').
                extract()).strip()

        brand = hxs.select(
            '//th[contains(text(), "Manufacturer")]/../td/a/text()').extract(
            )[0]
        category = hxs.select(
            '//div[@class="product-breadcrumbs"]//a/text()').extract()[-1]
        image_url = hxs.select('//img[@id="productmain"]/@src').extract()
        identifier = hxs.select(
            '//th[contains(text(), "SKU")]/../td/text()').extract()[0]
        sellers = hxs.select(
            '//div[@id="seller-contact"]//a[@itemprop="seller"]')
        if sellers:
            yield Request(sellers_url % identifier,
                          callback=self.parse_sellers,
                          meta={
                              'name': name,
                              'brand': brand,
                              'category': category,
                              'identifier': identifier,
                              'sku': sku,
                              'image_url': image_url,
                              'url': response.url
                          })
            return

        # Price and shipping are only needed when the item is built here.
        price = hxs.select(
            '//div[@class="main-price"]/span[@itemprop="price"]/text()'
        ).extract()
        price = price[0] if price else 0
        shipping = hxs.select(
            '//div[@class="main-price"]/span[not(@itemprop="price")]/text()'
        ).extract()
        shipping = shipping[0] if shipping else 0

        seller_name = hxs.select(
            '//a[@id="anchorMarketplaceShipsFrom"]/text()').extract()
        seller_name = seller_name[0] if seller_name else ''

        l = ProductLoader(item=Product(), response=response)
        if seller_name:
            l.add_value('identifier', identifier + '-' + seller_name)
        else:
            l.add_value('identifier', identifier)
        l.add_value('name', name)
        l.add_value('category', category)
        l.add_value('brand', brand)
        l.add_value('sku', sku)
        l.add_value('url', response.url)
        l.add_value('price', price)
        l.add_value('shipping_cost', shipping)
        l.add_value('image_url', image_url)
        l.add_value('dealer',
                    ('Rak - ' + seller_name) if seller_name else '')

        yield l.load_item()
Beispiel #53
0
    def parse_product(self, response):
        """Parse a product page and yield one item per option (or the base item).

        A base item is built from the markup with a fixed shipping cost of
        7.50.  When the page embeds a ``var jProductData = ...;`` JSON blob,
        one copy of the base item is yielded for every entry of
        ``product.items`` with name, URL, price, identifier and SKU replaced
        by the option's data; otherwise the base item itself is yielded.
        Items without an identifier are dropped.
        """
        base_url = get_base_url(response)

        name_texts = response.xpath('//div[@id="name"]/text()').extract()
        name = name_texts[0].strip() if name_texts else ''

        desc_texts = response.xpath('//div[@id="type"]/text()').extract()
        desc = desc_texts[0].strip() if desc_texts else ''

        if desc:
            name = name + ' ' + desc

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('shipping_cost', 7.50)
        loader.add_value('url', response.url)

        # NOTE(review): when no price is found, ``price`` stays an empty list
        # and is passed as such to _get_unified_category below -- confirm the
        # helper copes with that.
        price = response.xpath('//span[@id="price1"]/text()').extract()
        if price:
            price = extract_price(price[0])
            loader.add_value('price', price)

        image_url = response.xpath('//img[@id="productImg"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin(base_url, image_url[0]))

        # Breadcrumb labels without the first entry.
        crumbs = response.xpath(
            '//ul[@id="breadCrumbs"]//li/a/text()').extract()
        category = [crumb.strip() for crumb in crumbs][1:]
        category = self._get_unified_category(category, name, price)
        if category and ('Side tables' in category[-1]):
            category = ['Tables', 'Side Tables']
        loader.add_value('category', category)

        # The product counts as in stock unless the "not shoppable online"
        # element is present and visible.
        stock = response.xpath(
            '//*[@id="dispNotShopableOnlineText"]/@style').extract()
        if not stock or 'display:none' in stock[0]:
            stock = 1
        else:
            stock = 0
        loader.add_value('stock', stock)

        loader.add_xpath('sku', '//div[@id="itemNumber"]/text()')
        loader.add_xpath('identifier', '//div[@id="itemNumber"]/text()')

        item = loader.load_item()

        options_data = re.search(r'var jProductData = (.*);', response.body)
        if not options_data:
            if item['identifier']:
                yield item
            return

        product_data = json.loads(options_data.group(1))

        for option in product_data['product']['items']:
            option_item = deepcopy(item)
            option_item['name'] = option['name']
            description = [option['type']]
            description.extend(option['validDesign'])
            description = ' '.join(description).strip()
            if description:
                option_item['name'] += ' ' + description

            # Append the label of the matching <option> unless the name
            # already contains it (case-insensitively).
            option_text = response.xpath('//option[@value="' +
                                         option['catEntryId'] +
                                         '"]/text()').extract()
            option_text = option_text[0].strip() if option_text else ''
            if option_text and option_text.upper(
            ) not in option_item['name'].upper():
                option_item['name'] += ' ' + option_text

            option_item['url'] = urljoin(base_url, option['url'])

            option_item['price'] = Decimal(
                option['prices']['normal']['priceNormal']['rawPrice'])
            # Strip every "S" from the part number and dot-separate the
            # remainder in groups of three characters.
            identifier = option['partNumber'].replace('S', '')
            identifier = '.'.join(identifier[i:i + 3]
                                  for i in range(0, len(identifier), 3))
            option_item['identifier'] = identifier
            option_item['sku'] = identifier
            if option_item['identifier']:
                yield option_item
Beispiel #54
0
    def parse_product(self, response):
        """Yield one item per purchasable option combination of a product page.

        The page embeds four JSON maps (``childMap``, ``prices``, ``skus``
        and ``stockStatuses``) in its script source.  Every combination of
        the option drop-downs is joined with ``_`` and looked up in
        ``childMap``; each hit yields an item whose SKU, price and stock come
        from the other maps.  A log line is written when no combination
        matches.

        Raises AttributeError when one of the embedded maps is missing.
        """
        hxs = HtmlXPathSelector(response)

        def embedded_json(key):
            # Pull the value of "'<key>': ...," out of the page source.
            return json.loads(
                re.search("'%s': (.*)," % key, response.body).group(1))

        childMap = embedded_json('childMap')
        prices = embedded_json('prices')
        skus = embedded_json('skus')
        stockStatuses = embedded_json('stockStatuses')

        # One list of (value, label) pairs per drop-down; options with an
        # empty value and drop-downs left without options are skipped.
        selects = []
        for dropdown in hxs.select('//div[@class="product-options"]//select'):
            pairs = [
                (opt.select('./@value').extract()[0],
                 opt.select('./text()').extract()[0])
                for opt in dropdown.select('.//option')
                if opt.select('./@value').extract()[0]
            ]
            if pairs:
                selects.append(pairs)

        if not selects:
            selects = [[('', ''), ('%', '')]]

        # Make every "..._%" key reachable without its "_%" part as well.
        childMap.update({
            key.replace('_%', ''): value
            for key, value in list(childMap.items())
            if '_%' in key
        })

        image_srcs = hxs.select(
            '//div[@id="product-image"]//img/@src').extract()
        image_url = None
        if image_srcs:
            image_url = urljoin_rfc(get_base_url(response), image_srcs[0])

        found = False
        for combination in itertools.product(*selects):
            option_values = [pair[0] for pair in combination]
            option_labels = [pair[1] for pair in combination]
            code = childMap.get('_'.join(option_values))
            if not code:
                continue
            code = str(code)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
            loader.add_value('name', option_labels)
            loader.add_value('sku', skus[code])
            loader.add_value('identifier', skus[code])
            loader.add_value('price', prices[code][0]['purchase'])
            loader.add_value('url', response.url)
            loader.add_value('brand', 'Le Creuset')

            in_stock = 'In stock' in stockStatuses.get(code, '')
            loader.add_value('stock', '1' if in_stock else '0')

            # Orders below 45 carry a 4.95 shipping charge.
            cheap = loader.get_output_value('price') < 45
            loader.add_value('shipping_cost', '4.95' if cheap else '0')

            loader.add_xpath('category',
                             '//div[@class="crumbs"]/a[position()>2]/text()')
            if image_url is not None:
                loader.add_value('image_url', image_url)

            item = loader.load_item()
            item['metadata'] = LeCreusetMeta()

            found = True
            yield item

        if not found:
            self.log('No products on %s' % response.url)
    def parse_product(self, response):
        """Parse a product page, request its other variants and yield the item.

        The body is re-parsed with ``&amp;`` replaced by ``&``.  Pages
        without a ``pid`` input are logged and skipped.  For every
        ``<select>`` on the page the selected option's label is appended to
        the item name, and every other option is requested (handled by
        ``self.parse_option``) with a snapshot of the item in ``meta``.  The
        item itself is yielded only when it has a price.

        Raises IndexError when the product title is missing.
        """
        html = response.body.replace('&amp;', '&')
        hxs = HtmlXPathSelector(text=html)

        identifier = hxs.select('//input[@id="pid"]/@value').extract()
        if not identifier:
            self.log('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
            return

        loader = ProductLoader(item=Product(), response=response)
        name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
        if name.startswith(':'):
            name = name[1:]
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))

        categories = hxs.select(
            '//a[@class="breadcrumb-element"]/@href/../text()').extract()
        # Drop the "Home" crumb when present; an unconditional remove()
        # raises ValueError on pages whose breadcrumb lacks it.
        if 'Home' in categories:
            categories.remove('Home')
        loader.add_value('category', categories)

        loader.add_value('sku', identifier[0])
        loader.add_value('identifier', identifier[0])
        loader.add_xpath('image_url',
                         '//div[@class="product-primary-image"]/a/@href')

        loader.add_xpath(
            'price',
            '//div[@id="product-content"]//span[@class="price-sales"]/meta/@content'
        )

        out_of_stock = hxs.select('//p[contains(@class, "not-available")]')
        if out_of_stock:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') <= 59.99:
            loader.add_value('shipping_cost', 1.99)

        item = loader.load_item()

        if item.get('price', None) and item['price'] <= 59.99:
            item['shipping_cost'] = 1.99

        options = hxs.select('//select')
        for option in options:
            for variant in option.select('./option'):
                if variant.select('./@selected'):
                    var_name = variant.select(
                        './text()').extract()[0].strip().replace('&amp;', '&')
                    item['name'] += ' ' + var_name
                else:
                    option_url = variant.select(
                        './@value').extract()[0].replace(
                            '&amp;', '&') + '&Quantity=1&uuid=&format=ajax'
                    # The snapshot is taken at this point of the loop, so it
                    # only carries the selected-option labels appended so far.
                    meta = response.meta
                    meta['item'] = deepcopy(item)
                    meta['base_name'] = name
                    yield Request(option_url,
                                  callback=self.parse_option,
                                  meta=meta)

        if item.get('price', None):
            yield item
    def parse_product(self, response):
        """Parse a product page and yield its items and follow-up requests.

        Every "size-boxes" link is requested back into this callback.  When
        the page embeds a ``spConfig`` JSON configuration, each entry of
        ``childProducts`` becomes either a request to the options endpoint
        (handled by ``self.parse_options``) or, for an empty child id, an
        item.  Otherwise the product is built from the markup and expanded
        into one item per combination of its custom-option drop-downs.
        Every yielded item gets a ``net_price`` in its metadata, computed as
        price / 1.2.

        Raises IndexError when the product title or the ``product`` input is
        missing from the page.
        """
        base_url = get_base_url(response)

        def attach_net_price(item, net_price):
            # Store the net price as a string in a fresh Meta object.
            meta_ = Meta()
            meta_['net_price'] = str(net_price)
            item['metadata'] = meta_
            return item

        for url in response.xpath('//a[contains(@class,"size-boxes")]/@href').extract():
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)

        product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()[0]

        product_image = response.xpath('//a[@id="zoom-btn"]/@href').extract()
        if product_image:
            product_image = urljoin_rfc(base_url, product_image[0])

        # Take the whole first matching text node.  Indexing the extracted
        # string a second time kept only its first character, and indexing
        # the list unconditionally raised IndexError when the row is absent.
        product_brand = response.xpath("//table[@id='product-attribute-specs-table']/tbody/"
                                       "tr[th[text()='Manufacturer']]/td/text()").extract()
        product_brand = product_brand[0] if product_brand else ''

        product_config_reg = re.search(r'var spConfig = new Product.Config\((\{.*\})\);', response.body)
        product_identifier = response.xpath('//input[@name="product"]/@value').extract()[0]

        if product_config_reg:
            products = json.loads(product_config_reg.group(1))
            for identifier, product in products['childProducts'].items():
                product_loader = ProductLoader(item=Product(), response=response)
                if identifier:
                    product_loader.add_value('identifier', product_identifier + '-' + identifier)
                else:
                    product_loader.add_value('identifier', product_identifier)
                product_loader.add_value('price', product[u'finalPrice'])
                # Append the label of every attribute option that lists this child.
                option_name = product_name
                for attribute in products[u'attributes'].values():
                    for option in attribute['options']:
                        if identifier in option['products']:
                            option_name += ' ' + option['label']
                product_loader.add_value('name', re.sub(r' \((.+?)\)', r'', option_name))
                product_loader.add_value('url', response.url)
                product_loader.add_value('brand', product_brand)
                product_loader.add_value('image_url', product_image)

                if identifier:
                    yield Request('http://www.bedworld.net/oi/ajax/co/?id=' + identifier + '&pid=' + product_identifier,
                                  meta={'item': product_loader.load_item()},
                                  callback=self.parse_options)
                else:
                    price = product_loader.get_output_value('price')
                    net_price = price / Decimal('1.2')
                    yield attach_net_price(product_loader.load_item(), net_price)
            return

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', re.sub(r' \((.+?)\)', r'', product_name))
        product_loader.add_value('brand', product_brand)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('url', response.url)
        product_loader.add_value('image_url', product_image)
        price = response.xpath('//span[@id="product-price-' + product_identifier + '"]//text()').re(r'([\d.,]+)')
        price = price[0] if price else 0
        product_loader.add_value('price', price)

        # One list of option dicts per custom-option drop-down.
        option_elements = []
        dropdown_elements = response.xpath('//select[contains(@class, "product-custom-options")]')
        for dropdown_options in dropdown_elements:
            options = []
            for dropdown_option in dropdown_options.select('option[@value!=""]'):
                options.append({
                    'identifier': dropdown_option.select('@value').extract()[0],
                    # Keep only the text in front of the "+<surcharge>" part.
                    'desc': dropdown_option.select('.//text()').extract()[0].split('+')[0],
                    'price': dropdown_option.select('@price').extract()[0],
                })
            option_elements.append(options)

        # Fold every combination of drop-down choices into a single option
        # with concatenated description/identifier and summed surcharge.
        final_options = []
        if option_elements:
            for combined_option in itertools.product(*option_elements):
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get('desc', '') + option['desc']
                    final_option['price'] = final_option.get('price', Decimal(0)) + extract_price(option['price'])
                    final_option['identifier'] = final_option.get('identifier', '') + '-' + option['identifier']
                final_options.append(final_option)

        if final_options:
            for opt in final_options:
                opt_product = product_loader.load_item()
                opt_product['name'] += ' ' + normalize_space(opt['desc'])
                opt_product['price'] += opt['price']
                opt_product['identifier'] += opt['identifier']
                net_price = Decimal(opt_product['price']) / Decimal('1.2')
                yield attach_net_price(opt_product, net_price)
        else:
            price = product_loader.get_output_value('price')
            net_price = price / Decimal('1.2')
            yield attach_net_price(product_loader.load_item(), net_price)
    def parse_product(self, response):
        """Parse a product page and yield one item per configurable option.

        The product id is read from the ``product`` input, falling back to
        the add-to-cart form action; when neither is available the same URL
        is requested again.  The price is the displayed price (plus the
        separate pennies element, when present) with any cashback amount
        added on top.  When the page embeds a ``spConfig`` configuration, one
        item is yielded per child product with the option price added;
        otherwise a single item is yielded.

        Raises IndexError when the product name or the price is missing.
        """
        base_url = get_base_url(response)

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()

        id_values = response.xpath('//input[@name="product"]/@value').extract()
        if id_values:
            product_identifier = id_values[0].strip()
        else:
            id_values = response.xpath(
                '//form[@id="product_addtocart_form"]/@action').re(
                    r'/product/(\d+)')
            if not id_values:
                # NOTE(review): this retry is unbounded -- a page that never
                # exposes an id is re-requested indefinitely.
                yield Request(response.url,
                              callback=self.parse_product,
                              dont_filter=True)
                return
            product_identifier = id_values[0]

        product_name = response.xpath(
            '//h2[@itemprop="name"]/text()').extract()[0]

        brand = response.meta.get('brand', '')

        category = 'Used Equipment'
        sku = response.xpath('//div[@class="quickfind"]/text()').extract()
        sku = sku[0].replace('Quick find', '').strip() if sku else ''
        price = response.xpath(
            '//*[@id="product-price-{}"]/div/span[@class="price"]/text()'.
            format(product_identifier)).extract()[0]
        price_pennies = response.xpath(
            '//*[@id="product-price-{}"]/div/span[@class="price"]/span[@class="price-pennies"]/text()'
            .format(product_identifier)).extract()
        if price_pennies:
            price += price_pennies[0]
        price = extract_price(price)
        cashback = response.xpath('//div[@class="cashback"]/text()').extract()
        if cashback:
            price += extract_price(cashback[0])

        def build_item(identifier, name, item_price):
            # Assemble one product item from the fields shared by all variants.
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('identifier', identifier)
            product_loader.add_value('name', name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('sku', sku)
            product_loader.add_value('price', item_price)
            return product_loader.load_item()

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if not options_config:
            yield build_item(product_identifier, product_name, price)
            return

        product_data = json.loads(options_config.group(1))
        # Child product id -> accumulated option labels / option price.
        products = {}
        prices = {}
        for attr in product_data['attributes'].values():
            for option in attr['options']:
                option_price = extract_price(option['price'])
                for product in option['products']:
                    products[product] = ' '.join(
                        (products.get(product, ''), option['label']))
                    # NOTE(review): with several attributes only the price
                    # of the last option seen per product is kept, not the
                    # sum -- confirm this is intended.
                    prices[product] = option_price

        for identifier, option_name in products.items():
            yield build_item(product_identifier + '_' + identifier,
                             product_name + ' ' + option_name,
                             price + prices[identifier])
    def parse_product(self, response):
        """Parse a search-result or product page for a single product.

        Pages that report no results or a discontinued item are logged and
        skipped.  A result grid is followed link by link back into this
        callback.  For a product page an item is built with the upper-cased
        "Part No" as identifier and the manufacturer number stored in the
        item metadata, then handed to ``self.yield_item_with_metadata``.

        Raises IndexError when the "Part No" cell is missing from the page.
        """
        hxs = HtmlXPathSelector(response)

        warning = ''.join(
            hxs.select(
                '//div[@class="InfoBanner" and contains(text(), "has returned 0 results")]//text()'
            ).extract())
        if not warning:
            warning = ''.join(
                hxs.select(
                    '//div[@class="noSearchResultsFound"]/text()').extract())

        if warning:
            self.log(warning)
            return

        many = hxs.select(
            '//div[@id="SearchResults"]//div[@class="categoryGridTitle"]/a/@href'
        ).extract()
        if many:
            for url in many:
                yield Request(urljoin(get_base_url(response), url),
                              callback=self.parse_product)
            return

        if hxs.select(
                '//div[@class="color:red" and contains(text(), "this item is no longer available")]'
        ):
            self.log('Item not available [%s]' % (response.url))
            return

        loader = ProductLoader(item=Product(), selector=hxs)

        comms_no = hxs.select(
            '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()'
        ).extract()[0].upper()

        loader.add_value('identifier', comms_no)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@id="productTitle"]//text()')
        loader.add_xpath(
            'price', '//div[@id="productMainPrice"]/span[@id="price"]/text()')
        loader.add_xpath(
            'sku',
            '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()'
        )

        # Breadcrumb texts without the first and last entries.
        category = hxs.select(
            '//div[@class="newbreadcrumbText"]//text()').extract()[1:-1]
        loader.add_value('category', category)
        img = hxs.select('//span[@id="mainImage"]/a/img/@src').extract()

        # A missing image is treated like an over-long one (empty image_url)
        # instead of raising IndexError on img[0].
        if img and len(img[0]) < 255:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), img[0]))
        else:
            loader.add_value('image_url', '')

        loader.add_xpath('brand', '//div[@id="supplierLogo"]/img/@title')
        if not loader.get_output_value('brand'):
            # No supplier logo: use the first word of the name as brand.
            loader.add_value('brand',
                             loader.get_output_value('name').split()[0])

        if loader.get_output_value('price') < 20:
            loader.add_value('shipping_cost', '2.95')
        else:
            loader.add_value('shipping_cost', '0')
        in_stock = 'IN STOCK' in ''.join(
            hxs.select(
                '//div[@id="stockCheck"]/div/text()').extract()).upper()
        loader.add_value('stock', '1' if in_stock else '0')

        # Try the manufacturer-number locations in order of preference; the
        # last one is the "Part No" cell already read above.
        manufacturers_no = []
        for xpath in (
                '//span[@id="manufactNo"]/text()',
                '//tr[td[contains(text(), "Manufacturer No:")]]/td[not(@class)]/text()',
                '//tr[td[contains(text(), "Manufacturer No:")]]/td[2]//text()',
                '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()',
        ):
            manufacturers_no = hxs.select(xpath).extract()
            if manufacturers_no:
                break
        manufacturers_no = manufacturers_no[0].strip()
        m = sku_regex.search(manufacturers_no)
        if m:
            manufacturers_no = m.group(1)

        product = loader.load_item()

        product['metadata'] = {'manufacturers_no': manufacturers_no}

        # NOTE(review): the return value is discarded.  Since this method is
        # a generator, anything yield_item_with_metadata returns or yields
        # never reaches the engine -- confirm the helper works purely by
        # side effect.
        self.yield_item_with_metadata(product)
Beispiel #59
0
    def parse(self, response):
        """Yield one product item per non-winter tyre tile on a results page.

        Each grid tile is turned into a ``Product`` (via ``ProductLoader``)
        carrying a ``MicheldeverMeta`` metadata record: tyre dimensions,
        ratings, XL / run-flat flags, manufacturer mark and label values.
        Items rejected by ``is_product_correct`` are dropped.

        Tiles flagged as winter tyres are skipped.  Tiles that lack a name,
        brand, identifier or a parsable tyre size are logged and skipped so
        that one malformed tile does not abort the rest of the page.

        :param response: the search-results page response.
        :returns: a generator of product items.
        """
        # Same patterns as before, compiled once per page instead of per tile.
        size_with_load = re.compile(
            r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)', re.I)
        size_without_load = re.compile(r'(\d+)\/(\d+)(\w{1})(\d+)', re.I)

        def first(node, query, default=''):
            """Return the first value ``query`` selects on ``node``, else ``default``."""
            values = node.xpath(query).extract()
            return values[0] if values else default

        tiles = response.xpath(
            '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
        )

        for tile in tiles:
            if first(tile, '@data-filter-season') == 'Winter':
                continue

            # Pull the mandatory fields up front so an incomplete tile is
            # skipped before any item is built.
            name = first(
                tile,
                './/div[contains(@class, "tyre-model text-center")]/text()',
                None)
            brand = first(tile, '@data-filter-brand', None)
            identifier = first(tile, '@data-tyreid', None)
            tyresize_text = first(
                tile, './/div[contains(@class, "tyre-size")]/text()', None)
            if None in (name, brand, identifier, tyresize_text):
                self.log('Skipping tyre with missing name/brand/id/size on %s'
                         % response.url)
                continue
            tyresize_text = tyresize_text.strip()

            # Prefer the form that includes a load rating in parentheses;
            # fall back to the bare size and leave the load rating empty.
            size_match = size_with_load.search(tyresize_text)
            if size_match:
                (width, aspect, speed_rating, rim,
                 load_rating) = size_match.groups()
            else:
                size_match = size_without_load.search(tyresize_text)
                if not size_match:
                    self.log('Skipping tyre %s: cannot parse size %r'
                             % (identifier, tyresize_text))
                    continue
                width, aspect, speed_rating, rim = size_match.groups()
                load_rating = ''

            loader = ProductLoader(item=Product(), selector=tile)
            loader.add_value('name', brand + ' ' + name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('identifier', identifier)
            loader.add_value('url', response.url)

            image_url = tile.xpath(
                './/div[contains(@class, "tyre-image")]//img/@src'
            ).extract()
            if image_url:
                loader.add_value(
                    'image_url',
                    urljoin(get_base_url(response), image_url[0]))

            price = tile.xpath(
                './/div[contains(@class, "tyre-pricing-information")]/div/text()'
            ).re(r'[\d,.]+')
            loader.add_value('price', price[0] if price else '0.00')

            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = aspect
            metadata['rim'] = rim
            metadata['speed_rating'] = speed_rating
            metadata['width'] = width
            metadata['fitting_method'] = 'Fitted'
            metadata['load_rating'] = load_rating
            metadata['alternative_speed_rating'] = ''

            xl = first(tile, '@data-filter-reinforced') == 'Y'
            metadata['xl'] = 'Yes' if xl else 'No'

            # Run-flat is reported if either the page attribute or the
            # product name says so.
            run_flat_found = is_run_flat(loader.get_output_value('name'))
            run_flat = first(tile, '@data-filter-runflat') == 'Y'
            metadata['run_flat'] = (
                'Yes' if run_flat or run_flat_found else 'No')

            manufacturer_mark = tile.xpath(
                './/span[contains(@title, "Homologated for fitment to certai")]/@title'
            ).re(r'Homologated for fitment to certain (.*) cars\.')
            metadata['manufacturer_mark'] = find_man_mark(
                manufacturer_mark[0]) if manufacturer_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'],
                 metadata['rim'], metadata['load_rating'],
                 metadata['speed_rating']))

            # Read each label attribute on its own: an XPath union returns
            # nodes in document order, so unpacking it positionally depended
            # on the attribute order in the markup and failed outright when
            # one attribute was absent.  The r/g/d -> fuel/grip/noise pairing
            # follows the order the previous union expression listed them in.
            metadata['fuel'] = first(tile, '@data-filter-tyreefficiencyr')
            metadata['grip'] = first(tile, '@data-filter-tyreefficiencyg')
            metadata['noise'] = first(tile, '@data-filter-tyreefficiencyd')

            # Separate name for the loaded item so the tile selector is not
            # shadowed inside the loop.
            item = loader.load_item()
            item['metadata'] = metadata

            if not is_product_correct(item):
                continue

            item['metadata']['mts_stock_code'] = find_mts_stock_code(
                item, spider_name=self.name, log=self.log)

            yield item