Ejemplo n.º 1
0
    def parse_product(self, response):
        """Yield one product per matching table cell of the right column.

        Non-HTML responses are ignored.  A cell contributes its link text
        and/or ``h1`` text as the name and its link (joined with the base
        URL) as the product URL; the page URL is used when the cell has no
        link.  Cells for which no price can be found are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)
        cells = selector.select(u'//div[@class="rightcol"]//td[contains(child::text(),"\xa3")] | //div[@class="rightcol"]//td[child::h1]')

        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            for name_xpath in ('./a/text()', './h1/text()'):
                loader.add_xpath('name', name_xpath)

            hrefs = cell.select('./a/@href').extract()
            if hrefs:
                link = urljoin_rfc(get_base_url(response), hrefs[0])
            else:
                link = response.url
            loader.add_value('url', link)

            # First try the pound-prefixed text of the cell itself, then the
            # dedicated price span.
            amount = cell.select('./text()').re('\xa3(.*)')
            if not amount:
                amount = cell.select('.//span[@id="_EKM_PRODUCTPRICE"]/text()').extract()
            if not amount:
                continue
            loader.add_value('price', amount)
            yield loader.load_item()
Ejemplo n.º 2
0
    def parse_item(self, response):
        """Yield a product built from the page price and the request meta.

        ``sku``, ``notes`` and ``name`` are read from ``response.meta``; the
        name is encoded to ASCII with unencodable characters dropped.  When
        the ``offer_price`` span is missing an error is logged and nothing is
        yielded.  If lxml raises ``XMLSyntaxError`` the same URL is requested
        again with the same meta values and ``dont_filter`` set.
        """
        page_url = response.url
        sku = response.meta['sku']
        notes = response.meta['notes']
        name = response.meta['name'].encode('ascii', 'ignore')

        try:
            selector = HtmlXPathSelector(response)
            offer_texts = selector.select("//table[@class='productDetail']//span[@id='offer_price']/text()").extract()
            if not offer_texts:
                logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, page_url))
                return
            price = offer_texts[0].strip()

            loader = ProductLoader(item=Product(), response=response, selector=selector)
            field_values = (
                ('identifier', sku),
                ('url', page_url),
                ('name', name),
                ('price', price),
                ('sku', sku),
            )
            for field, value in field_values:
                loader.add_value(field, value)
            yield loader.load_item()
        except lxml.etree.XMLSyntaxError:
            # NOTE(review): nothing in this method limits how many times the
            # page is re-requested.
            logging.error("Rerequesting")
            retry_meta = {'sku': sku, 'notes': notes, 'name': name}
            yield Request(page_url, callback=self.parse_item, meta=retry_meta, dont_filter=True)
Ejemplo n.º 3
0
    def parse_product(self, response):
        """Yield a product for every ``product-list`` entry on the page.

        Non-HTML responses are ignored.  When the page has no entries and the
        request was not already flagged with ``retry`` in its meta, the same
        URL is requested once more with empty cookies, ``dont_merge_cookies``
        set and the ``retry`` flag raised.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)
        entries = selector.select('//ul[@class="product-list"]/li')

        for entry in entries:
            loader = ProductLoader(item=Product(), selector=entry)

            # The name is the <strong> text, followed by the link's own text
            # when there is any.
            name_parts = [entry.select('.//h2/a/strong/text()').extract()[0]]
            name_parts.extend(entry.select('.//h2/a/text()').extract()[:1])
            loader.add_value('name', ' '.join(name_parts))

            hrefs = entry.select('.//h2/a/@href').extract()
            loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
            loader.add_xpath('price', u'.//p/strong/text()', re='\xa3(.*)')
            yield loader.load_item()

        if entries or response.meta.get('retry'):
            return
        retry_meta = {'dont_merge_cookies': True, 'retry': True}
        yield Request(response.url, callback=self.parse_product, dont_filter=True,
                      cookies={}, meta=retry_meta)
Ejemplo n.º 4
0
    def parse_product(self, response):
        """Follow the next-page link and yield the products of an item grid.

        Non-HTML responses are ignored.  The next-page link, when present, is
        requested with this same callback.  Each ``itemGrid`` block yields a
        product whose name is the ``oesLink`` span text followed by the link
        text.  Blocks missing the link, either name part, or a loaded
        name/price are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # pages
        next_page = hxs.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]), callback=self.parse_product)

        for product in hxs.select(u'//div[contains(@class,"itemGrid")]'):
            link = product.select(u'.//a[@class="oesLink"]/@href').extract()
            name_head = product.select(u'.//a[@class="oesLink"]/span/text()').extract()
            name_tail = product.select(u'.//a[@class="oesLink"]/text()').extract()
            if not (link and name_head and name_tail):
                # Indexing an empty result used to raise IndexError here and
                # abort the remaining blocks of the page; skip the incomplete
                # block instead, like the name/price check below does.
                continue

            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('url', urljoin_rfc(base_url, link[0]))
            product_loader.add_value('name', name_head[0] + ' ' + name_tail[0])
            # Two alternative price locations; both feed the same field.
            product_loader.add_xpath('price', u'.//span[@class="PlistOfferPrice"]/text()',
                                     re=u'\\$(.*)')
            product_loader.add_xpath('price', u'.//div[@class="pricing"]/span/div/span/text()',
                                     re=u'\\$(.*)')
            if not (product_loader.get_output_value('name')
                    and product_loader.get_output_value('price')):
                continue
            yield product_loader.load_item()
Ejemplo n.º 5
0
    def browse_and_parse(self, response):
        """Crawl navigation links and yield the products found on the page.

        Every link of the ``navColumnOne`` block that is not yet in
        ``self.navig_url_set`` is recorded there and requested with this same
        callback, as is the listing's "Neste" link.  One product is yielded
        per listing row, plus one more when the page has a ``productPrices``
        heading (a listing page that shows a single product).
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        def price_from_text(text):
            # Keep the text before the first "-", take its second
            # space-separated token, drop "." and turn "," into ".".
            return text.split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')

        for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
            subsubcat_url = urlparse.urljoin(base_url, subcat_href)
            if subsubcat_url not in self.navig_url_set:
                self.navig_url_set.add(subsubcat_url)
                yield Request(subsubcat_url, callback=self.browse_and_parse)

        next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href").extract()
        if next_page:
            # Resolve against the base URL like the navigation links above; a
            # relative href cannot be requested as-is.  Absolute hrefs are
            # left unchanged by urljoin.
            yield Request(urlparse.urljoin(base_url, next_page[0]), callback=self.browse_and_parse)

        # parse product listing in this page, if any
        for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
            product_loader = ProductLoader(item=Product(), response=response)

            product_href = tr.select(".//td[2]//a/@href").extract()[0]
            product_loader.add_value('url', urlparse.urljoin(base_url, product_href))
            product_loader.add_value('name', tr.select(".//td[2]//a/text()").extract()[0])
            product_loader.add_value('price', price_from_text(tr.select(".//td[3]/text()").extract()[0]))

            yield product_loader.load_item()

        # edge case: product listing page with a single product
        product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
        if product_price:
            # this product listing page contains a single product
            product_loader = ProductLoader(item=Product(), response=response)

            product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', price_from_text(product_price[0]))

            yield product_loader.load_item()
Ejemplo n.º 6
0
    def parse_option_price(self, response):
        """Yield a product whose name and URL come from the request meta.

        The price is the pound-prefixed text of the ``webPriceLabel`` block of
        the response.
        """
        loader = ProductLoader(item=Product(), response=response)
        for field in ('name', 'url'):
            loader.add_value(field, response.meta[field])
        loader.add_xpath(
            'price',
            u'//div[@class="webPriceLabel"]/text()',
            re=u'\xa3(.*)',
        )
        yield loader.load_item()
Ejemplo n.º 7
0
Archivo: katom.py Proyecto: 0--key/lib
    def parse_product(self, response):
        """Yield the product shown on the page.

        URL is the response URL; name, SKU and price are read from the
        ``top_product_info_block_*`` elements of the page.
        """
        # The loader is built from the response, so the selector that used to
        # be created here and never read has been removed.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="top_product_info_block_product_title_text"]/text()')
        loader.add_xpath('sku', '//ul[@id="top_product_info_block_product_data_list"]/li/strong/text()')
        loader.add_xpath('price', '//p[@id="top_product_info_block_product_data_new_low_price"]/text()')
        yield loader.load_item()
Ejemplo n.º 8
0
    def parse_product(self, response):
        """Yield the product shown on the page (name, URL, price and SKU).

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        name_xpath = u'//form/div[not(@class)]/h1[not(@class)]/text()'
        price_xpath = u'//form//div[@class="contentText"]//div[@class="PriceList"]/div[@class="pricenow"]/text()'
        # The cell that follows the "Model Number" label cell.
        sku_xpath = u'//td[@class="ProductPageSummaryTableInfo" and preceding-sibling::td[@class="ProductPageSummaryTable" and contains(text(),"Model Number")]]/text()'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', name_xpath)
        loader.add_value('url', response.url)
        loader.add_xpath('price', price_xpath, re=u'\xa3(.*)')
        loader.add_xpath('sku', sku_xpath)
        yield loader.load_item()
Ejemplo n.º 9
0
    def parse_product(self, response):
        """Return the product shown on the page, or None for non-HTML input.

        Unlike a generator callback this returns the loaded item directly.
        """
        if not isinstance(response, HtmlResponse):
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath(
            'price',
            '//div[@class="club"]/span[@itemprop="Price"]/text()',
            re='.*\$(.*[0-9])',
        )
        loader.add_value('url', response.url)
        item = loader.load_item()
        return item
Ejemplo n.º 10
0
 def parse_products(self, hxs, base_url):
     """Yield a product for every ``productResultInfo`` block under *hxs*.

     The product link is joined with *base_url*; the price is every text
     node of the ``variantprice`` span joined with single spaces.
     """
     for node in hxs.select('//div[@class="productResultInfo"]'):
         loader = ProductLoader(Product(), node)
         loader.add_xpath('name', './/a[@class="ProductNameText"]/text()')

         href = node.select('.//a[@class="ProductNameText"]/@href').extract()[0]
         loader.add_value('url', urljoin_rfc(base_url, href))

         price_parts = node.select('.//span[@class="variantprice"]//text()').extract()
         loader.add_value('price', ' '.join(price_parts))

         loader.add_xpath('sku', './/p[contains(@class, "productSKU")]/text()')
         yield loader.load_item()
Ejemplo n.º 11
0
    def parse_product(self, response):
        """Yield the compound products of the page, or else a single product.

        Everything produced by ``self._parse_compound_product`` is yielded
        first; only when that produces nothing is a single product loaded
        from the page itself.
        """
        compound_items = list(self._parse_compound_product(response))
        for item in compound_items:
            yield item
        if compound_items:
            return

        single = ProductLoader(response=response, item=Product())
        single.add_xpath('name', '//h1[@itemprop="Name"]//text()')
        single.add_xpath('price', '//input[@name="price"]/@value')
        single.add_value('url', response.url)
        single.add_xpath('sku', '//span[@itemprop="model"]/text()')
        yield single.load_item()
Ejemplo n.º 12
0
    def parse_several_products_single_product_page(self, response):
        """Yield the products listed as paragraphs of one product page.

        Each matching paragraph supplies a name (first ``strong``) and a
        price (second ``strong`` or last ``b``).  Paragraphs missing either
        are logged and skipped.  When the price text matches
        ``prices_range_regex`` the page is requested again with
        ``parse_product_list`` instead of yielding an item.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)
        paragraphs = selector.select("//table[@class='product_body']/tr/td[2]/p[not(@class)][*[local-name()='strong']]")

        for paragraph in paragraphs:
            names = paragraph.select("strong[1]//text()").extract()
            if not names:
                logging.error("ERROR!! NO NAME!! %s" % (page_url, ))
                continue
            name = names[0]

            prices = paragraph.select('strong[2]/text() | b[last()]/text()').extract()
            if not prices:
                logging.error("ERROR!! NO PRICE!! %s %s" % (name, page_url))
                continue
            price = prices[0]

            if re.search(prices_range_regex, price):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            for field, value in (('url', page_url), ('name', name),
                                 ('price', price), ('sku', '')):
                loader.add_value(field, value)
            yield loader.load_item()
Ejemplo n.º 13
0
    def parse_item(self, response):
        """Yield the product described by a product detail page.

        Name and price are read from the ``product-shop`` block; when either
        is missing an error is logged and nothing is yielded.  The price text
        is normalised by dropping "." and turning "," into ".".
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select(
            "//div[@class='product-shop']/div[@class='product-name']/h2/text()"
        ).extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select(
            "//div[@class='product-shop']/div[@class='price-box']//span[@class='price']/text()"
        ).extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = price[0].replace(".", "").replace(",", ".")
        # NOTE: a delivery-cost lookup (the 'Spese Spedizione' row of the
        # 'product-attribute-specs-table' table) used to be added to the
        # price here but was disabled; only the listed price is reported.

        loader = ProductLoader(item=Product(), response=response)
        # ``str(name)`` raised UnicodeEncodeError for names containing
        # non-ASCII characters (extracted text is unicode under Python 2 --
        # TODO confirm the target interpreter).  Dropping those characters
        # leaves the identifier of ASCII-only names unchanged.
        loader.add_value('identifier', name.encode('ascii', 'ignore'))
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
Ejemplo n.º 14
0
    def parse_product(self, response):
        """Yield one product, or one per priced option, for a product page.

        Non-HTML responses are ignored.  When the page shows a single price
        and no option list, one product is yielded (the special price wins
        over the regular one).  Otherwise every option after the first whose
        text looks like "<label><pound sign><price>" yields a product named
        "<page name> <label>"; options without such a price are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1/text()').extract()[0]

        multiple_prices = hxs.select('//select[@class="smalltextblk"]/option/text()').extract()
        single_special_price = hxs.select('//span/text()').re('\xa3(.*[0-9]+)')
        single_price = hxs.select('//td[@class="ProductPrice"]/text()').re('\xa3(.*[0-9])')

        products_data = []

        if single_price and not multiple_prices:
            price = single_special_price[0] if single_special_price else single_price[0]
            products_data.append((name, price))
        else:
            option_pattern = re.compile('(.*)\xa3(.*\\.[0-9]+)')
            # The first option is dropped before parsing.
            for option_text in multiple_prices[1:]:
                match = option_pattern.match(option_text)
                if match is None:
                    # An option without a price used to raise AttributeError
                    # on ``None.groups()`` and lose every option of the page.
                    continue
                label, price = match.groups()
                products_data.append((name + ' ' + label, price))

        for product_name, product_price in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', product_name)
            loader.add_value('price', product_price)
            loader.add_value('sku', '')
            yield loader.load_item()
Ejemplo n.º 15
0
    def parse_product(self, response):
        """Yield the product(s) of a page, one per ``price`` block if present.

        Non-HTML responses are ignored.  When the form has ``div#price``
        blocks, each block with a price yields a product named after its
        ``h4``.  Otherwise a single product is read from the ``product``
        block, and nothing is yielded when it has no price.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)
        variants = selector.select(u'//form/div[@id="price"]')

        if variants:
            for variant in variants:
                loader = ProductLoader(item=Product(), selector=variant)
                loader.add_xpath('name', u'./h4/text()')
                loader.add_value('url', response.url)
                amount = variant.select(u'.//p[@class="price1"]/text()').re('\xa3(.*[0-9])')
                if amount:
                    loader.add_value('price', amount)
                    yield loader.load_item()
            return

        # No per-variant blocks: the page describes a single product.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product"]/h1/text()')
        amount = selector.select(u'//div[@class="product"]//p[@class="price1"]/text()').re(u'\xa3(.*)')
        if not amount:
            return
        loader.add_value('price', amount)
        yield loader.load_item()
Ejemplo n.º 16
0
    def parse_product(self, response):
        """Yield the listed products whose first ``nobr`` text matches ``mfrgid``.

        Non-HTML responses are ignored.  The page is parsed with
        BeautifulSoup; every grandparent element of a "ProductDetail" link is
        treated as one product.  A product is yielded only when the text of
        its first ``nobr`` element equals ``response.meta['mfrgid']`` after
        stripping and lower-casing both.  The SKU comes from
        ``response.meta['sku']``.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body)
        detail_href = re.compile('ProductDetail')
        dollar_text = re.compile(r'\$')

        links = soup.findAll('a', href=detail_href)
        products = {link.parent.parent for link in links}

        for product in products:
            fonts = product.findAll('font')
            first_nobr = product.find('nobr')
            if len(fonts) < 2 or first_nobr is None:
                # Indexing/attribute access on a missing element used to
                # raise and abort the remaining products of the page.
                continue

            product_loader = ProductLoader(item=Product(), response=response)
            name = fonts[1].text
            price = product.find('nobr', text=dollar_text)
            url = product.find('a', href=detail_href)
            if url:
                url = urljoin_rfc(get_base_url(response), url['href'])
            else:
                url = response.url
            product_loader.add_value('name', name)
            product_loader.add_value('price', price)
            # The URL used to be added twice; once is enough.
            product_loader.add_value('url', url)
            product_loader.add_value('sku', response.meta['sku'])

            site_mfrgid = first_nobr.text
            if site_mfrgid:
                site_mfrgid = site_mfrgid.strip().lower()
                mfrgid = response.meta['mfrgid'].strip().lower()
                if site_mfrgid == mfrgid:
                    yield product_loader.load_item()
Ejemplo n.º 17
0
    def parse_search(self, response):
        """Yield a product for each search result with name, URL and price.

        Results without a name are skipped silently; results without a URL
        or a price are logged and skipped.  The name is also used as the
        identifier.
        """
        selector = HtmlXPathSelector(response)
        results = selector.select("//div[@id='content']/div[@id='searchcontent']/div[@class='hasResults']/form/div[@id='switchview']/ol/li")

        for result in results:
            names = result.select("ul/li[@class='producttitle']/h4/a[1]/text()").extract()
            if not names:
                continue
            name = names[0]

            links = result.select("ul/li[@class='producttitle']/h4/a[1]/@href").extract()
            if not links:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, name))
                continue

            prices = result.select("ul/li[contains(@class, 'pricing')]/ul/li[contains(@class, 'price')]/text()").extract()
            if not prices:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, name))
                continue

            loader = ProductLoader(item=Product(), response=response)
            for field, value in (('identifier', name), ('name', name),
                                 ('url', links[0]), ('price', prices[0])):
                loader.add_value(field, value)
            yield loader.load_item()
Ejemplo n.º 18
0
    def parse_item(self, response):
        """Yield the product of a detail page, with any eco tax subtracted.

        Name and price are read from the page; when either is missing an
        error is logged and nothing is yielded.  The price text is converted
        with ``extract_price2uk`` to a ``Decimal``; when an eco-tax amount is
        shown it is converted the same way and subtracted from the price.
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//p[@class='price']/span[@class='our_price_display']/span/text()").extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = Decimal(extract_price2uk(price[0]))

        eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
        if eco_tax:
            eco_tax_text = eco_tax[0].encode("ascii", "ignore")
            # Parenthesised so the line is valid as a print statement and as
            # a print() call; the output is unchanged.
            print("Found eco tax %s" % eco_tax_text)
            price -= Decimal(extract_price2uk(eco_tax_text))

        loader = ProductLoader(item=Product(), response=response)
        # ``str(name)`` raised UnicodeEncodeError for names containing
        # non-ASCII characters (this block is Python 2: it uses ``unicode``).
        # Dropping those characters, as done for the eco-tax text above,
        # leaves the identifier of ASCII-only names unchanged.
        loader.add_value("identifier", name.encode("ascii", "ignore"))
        loader.add_value("name", name)
        loader.add_value("url", url)
        loader.add_value("price", unicode(price))
        yield loader.load_item()
Ejemplo n.º 19
0
    def parse_product(self, response):
        """Yield the product of a detail page, identified by its name.

        The name is read from the ``handleBuy`` form title and the price from
        the ``actualPriceRow`` of the price block; when either is missing an
        error is logged and nothing is yielded.
        """
        hxs = HtmlXPathSelector(response)
        url = response.url

        names = hxs.select(
            "//form[@id='handleBuy']/div[@class='buying']/h1[@class='parseasinTitle']/span/text()").extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % url)
            return
        name = names[0]

        prices = hxs.select("//div[@id='priceBlock']//tr[@id='actualPriceRow']//b[@class='priceLarge']/text()").extract()
        if not prices:
            logging.error("ERROR! NO PRICE! %s %s" % (url, name))
            return

        # The unused ``description`` local that used to be assigned here has
        # been removed; it was never read.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', prices[0])
        yield loader.load_item()
Ejemplo n.º 20
0
    def parse_product(self, response):
        """Yield the product of a detail page, identified by its name.

        The price is every text node of the ``pounds`` span joined together,
        or of the ``newPrice`` span when the first is absent.  A missing name
        or price is logged and nothing is yielded.
        """
        page_url = response.url
        hxs = HtmlXPathSelector(response)

        names = hxs.select("//div[@id='productDetail']/form/fieldset/h2/text()").extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        name = names[0]

        price = hxs.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
                              span[@class='pounds']/text()").extract()
        if not price:
            # Fall back to the alternative price span.
            price = hxs.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
                              span[@class='newPrice']/text()").extract()
        if not price:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, name))
            return

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (('identifier', name), ('name', name),
                             ('url', page_url), ('price', "".join(price))):
            loader.add_value(field, value)
        yield loader.load_item()
Ejemplo n.º 21
0
    def parse_search(self, response):
        """Follow pagination links and yield the products of a result page.

        Every pagination href is requested with this same callback.  Result
        blocks without a name are skipped silently; those without a URL or a
        price are logged and skipped.  The name is also the identifier.
        """
        selector = HtmlXPathSelector(response)

        # parse pages
        for page_href in selector.select("//ul[@id='pagination']/li/a/@href").extract():
            yield Request(page_href, callback=self.parse_search)

        # parse products
        for block in selector.select("//div[@class='column_one grid_list']/div"):
            names = block.select("div/div[@class='info']/div/h2/a/text()").extract()
            if not names:
                continue
            name = names[0]

            links = block.select("div/div[@class='info']/div/h2/a/@href").extract()
            if not links:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, name))
                continue

            prices = block.select("div/div[@class='pricebox']/p[@id='product-price']/text()").extract()
            if not prices:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, name))
                continue

            loader = ProductLoader(item=Product(), response=response)
            for field, value in (("identifier", name), ("name", name),
                                 ("url", links[0]), ("price", prices[0])):
                loader.add_value(field, value)
            yield loader.load_item()
Ejemplo n.º 22
0
    def parse_several_products_single_product_page(self, response):
        """Yield the products listed as paragraphs of one product page.

        Each matching paragraph supplies a name (first ``strong``) and a
        price (second ``strong`` or last ``b``).  Paragraphs missing either
        are logged and skipped.  When the price text matches
        ``prices_range_regex`` the page is requested again with
        ``parse_product_list`` instead of yielding an item.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)
        paragraphs = selector.select(
            "//table[@class='product_body']/tr/td[2]/p[not(@class)][*[local-name()='strong']]"
        )

        for paragraph in paragraphs:
            names = paragraph.select("strong[1]//text()").extract()
            if not names:
                logging.error("ERROR!! NO NAME!! %s" % (page_url, ))
                continue
            name = names[0]

            prices = paragraph.select(
                'strong[2]/text() | b[last()]/text()'
            ).extract()
            if not prices:
                logging.error("ERROR!! NO PRICE!! %s %s" % (name, page_url))
                continue
            price = prices[0]

            if re.search(prices_range_regex, price):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            for field, value in (('url', page_url), ('name', name),
                                 ('price', price), ('sku', '')):
                loader.add_value(field, value)
            yield loader.load_item()
Ejemplo n.º 23
0
    def parse_item(self, response):
        """Yield the product described by a product detail page.

        Name and price are read from the ``product-shop`` block; when either
        is missing an error is logged and nothing is yielded.  The price text
        is normalised by dropping "." and turning "," into ".".
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@class='product-shop']/div[@class='product-name']/h2/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//div[@class='product-shop']/div[@class='price-box']//span[@class='price']/text()").extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = price[0].replace(".", "").replace(",", ".")
        # NOTE: a delivery-cost lookup (the 'Spese Spedizione' row of the
        # 'product-attribute-specs-table' table) used to be added to the
        # price here but was disabled; only the listed price is reported.

        loader = ProductLoader(item=Product(), response=response)
        # ``str(name)`` raised UnicodeEncodeError for names containing
        # non-ASCII characters (extracted text is unicode under Python 2 --
        # TODO confirm the target interpreter).  Dropping those characters
        # leaves the identifier of ASCII-only names unchanged.
        loader.add_value('identifier', name.encode('ascii', 'ignore'))
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
Ejemplo n.º 24
0
    def parse_product(self, response):
        """Yield the product of a page marked up with ``itemprop`` attributes.

        The name is the first ``itemprop="name"`` heading text and the price
        the last ``itemprop="price"`` span text; when either is missing a
        message is printed and nothing is yielded.  The SKU is the
        second-to-last "/"-separated piece of the URL.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        names = selector.select('//h1[@itemprop="name"]/text()').extract()
        if not names:
            print("ERROR!! NO NAME!! %s" % page_url)
            return

        prices = selector.select('//span[@itemprop="price"]/text()').extract()
        if not prices:
            print("ERROR!! NO PRICE!! %s" % page_url)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', names[0])
        loader.add_value('price', prices[-1])
        loader.add_value('sku', response.url.split('/')[-2])
        yield loader.load_item()
Ejemplo n.º 25
0
    def parse_item(self, response):
        """Yield the product of a detail page, priced including delivery.

        The name is the text of the last breadcrumb link.  The item price and
        the delivery price are read from the first and second entries of the
        details list, stripped of "," and summed as ``Decimal`` values.  When
        the name or either price is missing an error is logged and nothing is
        yielded.
        """
        url = response.url

        hxs = HtmlXPathSelector(response)
        name = hxs.select("//div[@id='pageContentSub']/div[@class='moduleBox']/\
                             div[@id='top_breadcrumb_link']/a[last()]/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        # adding product
        price = hxs.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
                             div[@class='content']/div[@class='details']/ul/li[1]/span[2]/text()").re(u'€ (.*)')
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = price[0].replace(",", "")
        price_delivery = hxs.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
                             div[@class='content']/div[@class='details']/ul/li[2]/span[2]/text()").re(u'€ (.*)')
        if not price_delivery:
            logging.error("NO PRICE DELIVERY! %s" % url)
            return
        price_delivery = price_delivery[0].replace(",", "")
        price = Decimal(price) + Decimal(price_delivery)

        loader = ProductLoader(item=Product(), response=response)
        # ``str(name)`` raised UnicodeEncodeError for names containing
        # non-ASCII characters (extracted text is unicode under Python 2 --
        # TODO confirm the target interpreter).  Dropping those characters
        # leaves the identifier of ASCII-only names unchanged.
        loader.add_value('identifier', name.encode('ascii', 'ignore'))
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
Ejemplo n.º 26
0
    def parse_product(self, response):
        """Yield the product of a ``product-overview`` page.

        The price is the ``strong`` text of the price span, followed by the
        span's own first text node when there is one.  A missing name or
        price is logged and nothing is yielded.  The name is also the
        identifier.
        """
        page_url = response.url
        selector = HtmlXPathSelector(response)

        names = selector.select("//div[@class='product-overview']/h1[1]/text()").extract()
        if not names:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        name = names[0]

        main_parts = selector.select("//div[@class='product-overview']/div/p/span[@class='price']/strong/text()").extract()
        if not main_parts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, name))
            return

        extra_parts = selector.select("//div[@class='product-overview']/div/p/span[@class='price']/text()").extract()
        price = main_parts[0] + (extra_parts[0] if extra_parts else '')

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (('identifier', name), ('name', name),
                             ('url', page_url), ('price', price)):
            loader.add_value(field, value)
        yield loader.load_item()
Ejemplo n.º 27
0
    def parse_item(self, response):
        """Yield the product of a detail page, priced including delivery.

        Item price and delivery price are each normalised by dropping "." and
        turning "," into ".", then summed as ``Decimal`` values.  When the
        name or either price is missing an error is logged and nothing is
        yielded.  The identifier is the name encoded to ASCII with
        unencodable characters dropped.
        """
        page_url = response.url
        hxs = HtmlXPathSelector(response)

        names = hxs.select("//h1[@class='product-name']/text()").extract()
        if not names:
            logging.error("NO NAME! %s" % page_url)
            return
        name = names[0]

        # adding product
        item_prices = hxs.select("//div[@class='price-box']//span[@class='price']/text()").re(u'€ (.*)')
        if not item_prices:
            logging.error("NO PRICE! %s" % page_url)
            return
        item_price = item_prices[0].replace(".", "").replace(",", ".")

        # Delivery price: text located between the price box and the
        # add-to-holder block.
        price_delivery = hxs.select("//div[@class='product-shop']/\
            text()[(preceding::div[@class='price-box']) and (following::div[@class='add-to-holder'])]"
        ).re(u'€\xa0([\d,.]*)')
        if not price_delivery:
            logging.error("NO PRICE DELIVERY! %s" % page_url)
            return
        delivery_price = price_delivery[0].replace(".", "").replace(",", ".")

        total = Decimal(item_price) + Decimal(delivery_price)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', name.encode("ascii", "ignore"))
        loader.add_value('name', name)
        loader.add_value('url', page_url)
        loader.add_value('price', total)
        yield loader.load_item()
Ejemplo n.º 28
0
    def parse_product(self, response):
        """Yield a product, or one price request per voucher option.

        When the page has no product name it is requested again with
        ``parse_products`` as callback and nothing else is produced.
        Otherwise URL, name, image and category (last breadcrumb text) are
        loaded.  If the page has voucher options, one form request per option
        is sent to the price endpoint carrying the loader in its meta; if it
        has none, identifier, SKU and price are read from the page and the
        item is yielded.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select('//h1/span[@itemprop="name"]/text()').extract()
        if not name:
            yield Request(response.url, dont_filter=True, meta=response.meta, callback=self.parse_products)
            # Stop here: falling through used to send option requests whose
            # loader had no name.
            return

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', urljoin_rfc(base_url, response.url))
        loader.add_value('name', name)
        image_url = hxs.select('//div[@id="carousel-example-generic"]/div/div[1]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        category = hxs.select('//ol[@id="Ol1"]/li/a//text()').extract()
        if category:
            loader.add_value('category', category[-1])

        options = hxs.select('//select[@id="VoucherOption"]/option')

        for option in options:
            value = option.select('@value').extract()[0]
            tries = response.meta.get('try', 0)
            yield FormRequest('https://www.intotheblue.co.uk/Product/PriceandRef', formdata={'SelectedValue': value},
                              dont_filter=True, meta={'loader': loader, 'cookiejar': response.meta['cookiejar'], 'try': tries + 1},
                              headers={'Referer': response.url}, callback=self.parse_options)

        if not options:
            loader.add_xpath('identifier', '//label[@id="lblRef"]/text()')
            loader.add_xpath('sku', '//label[@id="lblRef"]/text()')
            loader.add_xpath('price', '//label[@id="lblProductPrice"]//text()')
            yield loader.load_item()
Ejemplo n.º 29
0
    def parse_product(self, response):
        """Yield a single product built from the page heading and current price.

        The name comes from the product-summary heading, falling back to any
        ``<h1>``.  An error is logged and nothing is yielded when the name or
        the price is missing.  The name doubles as the identifier.
        """
        hxs = HtmlXPathSelector(response)
        page_url = response.url

        headings = (
            hxs.select("//div[@class='primary-content']//div[@id='product-summary']/h1/text()").extract()
            or hxs.select('//h1/text()').extract()
        )
        if not headings:
            logging.error("ERROR! NO NAME! %s" % page_url)
            return
        title = headings[0]

        price_parts = hxs.select(
            "//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()"
        ).extract()
        if not price_parts:
            logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
            return

        loader = ProductLoader(item=Product(), response=response)
        fields = (
            ('identifier', title),
            ('name', title),
            ('url', page_url),
            # All matched text nodes are glued together to form the price.
            ('price', "".join(price_parts)),
        )
        for field, value in fields:
            loader.add_value(field, value)
        yield loader.load_item()
Ejemplo n.º 30
0
    def parse_product(self, response):
        """Yield listed products whose code matches the requested manufacturer id.

        Product rows are located through their ``ProductDetail`` links.  A row
        is emitted only when the text of its first ``<nobr>`` element equals
        ``response.meta['mfrgid']`` (compared stripped and lower-cased).  Rows
        that lack the expected ``<font>``/``<nobr>`` elements are skipped.
        """
        if not isinstance(response, HtmlResponse):
            return

        soup = BeautifulSoup(response.body)
        detail_link = re.compile('ProductDetail')

        # The product row is the grandparent of its detail link; the set
        # collapses rows reached through more than one link.
        rows = {link.parent.parent for link in soup.findAll('a', href=detail_link)}

        for row in rows:
            fonts = row.findAll('font')
            code_tag = row.find('nobr')
            if len(fonts) < 2 or code_tag is None:
                # Incomplete row: skip it instead of aborting the whole page.
                continue

            link = row.find('a', href=detail_link)
            if link:
                url = urljoin_rfc(get_base_url(response), link['href'])
            else:
                url = response.url

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', fonts[1].text)
            product_loader.add_value('price', row.find('nobr', text=re.compile(r'\$')))
            product_loader.add_value('url', url)
            product_loader.add_value('sku', response.meta['sku'])

            site_mfrgid = code_tag.text
            if site_mfrgid:
                site_mfrgid = site_mfrgid.strip().lower()
                mfrgid = response.meta['mfrgid'].strip().lower()
                if site_mfrgid == mfrgid:
                    yield product_loader.load_item()
Ejemplo n.º 31
0
    def parse_table_options_type2_single_product_page(self, response):
        """Yield one product per option row of a single-product page.

        The page-level name is combined with the first cell of every option
        row.  Rows whose price matches ``prices_range_regex`` are not emitted;
        the page is re-requested with ``self.parse_product_list`` instead.
        Missing names or prices are logged and skipped.
        """
        hxs = HtmlXPathSelector(response)
        page_url = response.url

        titles = hxs.select("//div[@id='mainContent']/center/table/tr[1]/td[1]/p[2][not(@class)][*[local-name()='strong']]/strong[1]//text()").extract()
        if not titles:
            logging.error("ERROR!! NO NAME!! %s" % (page_url, ))
            return
        base_name = titles[0]

        # The first row of the table is skipped by the position()>1 filter.
        option_rows = hxs.select("//div[@id='mainContent']/center/table//table[@class='product_body']/tr[position()>1]")
        for row in option_rows:
            labels = row.select("td[1]//text()").extract()
            if not labels:
                logging.error("ERROR!! NO NAME!! %s" % (page_url, ))
                continue

            prices = row.select('td[3]//text()').extract()
            if not prices:
                logging.error("ERROR!! NO PRICE!! %s %s" % (base_name, page_url))
                continue

            price_text = prices[0]
            if re.search(prices_range_regex, price_text):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', page_url)
            loader.add_value('name', "%s %s" % (base_name, labels[0]))
            loader.add_value('price', price_text)
            loader.add_value('sku', '')
            yield loader.load_item()
Ejemplo n.º 32
0
    def parse_product_list_columns(self, response):
        """Yield one product per column of a column-oriented product table.

        Names are read from the cells of the third table row and prices from
        the matching cells of the fourth.  A price matching
        ``prices_range_regex`` causes the page to be re-requested with
        ``self.parse_product_list`` instead of yielding an item.
        """
        hxs = HtmlXPathSelector(response)
        page_url = response.url

        # XPath count() is extracted as text such as "3.0", hence float -> int.
        raw_count = hxs.select("count(//table[@class='product_body']/tr[3]/td)").extract()[0]
        last_column = int(float(raw_count))

        for column in range(1, last_column + 1):
            names = hxs.select("//table[@class='product_body']/tr[3]/td[%d]/p//text()" % column).extract()
            if not names:
                logging.error("ERROR!! NO NAME!! %s" % (page_url, ))
                continue
            title = names[0]

            prices = hxs.select("//table[@class='product_body']/tr[4]/td[%d]/p[1]/strong[last()]//text()" % column).extract()
            if not prices:
                logging.error("ERROR!! NO PRICE!! %s %s" % (title, page_url))
                continue

            price_text = prices[0]
            if re.search(prices_range_regex, price_text):
                yield Request(page_url, callback=self.parse_product_list)
                continue

            loader = ProductLoader(item=Product(), response=response)
            for field, value in (('url', page_url), ('name', title), ('price', price_text), ('sku', '')):
                loader.add_value(field, value)
            yield loader.load_item()
Ejemplo n.º 33
0
    def parse_product(self, response):
        """Yield the page's product, or one product per row of its options table.

        When the ``super-product-table`` has rows, each row becomes a product
        named "<page name> <first cell text>".  Otherwise a single product is
        built from the page-level price box.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_name = hxs.select(u'//div[@class="product-name fn"]/h1/text()').extract()[0]
        option_rows = hxs.select(u'//table[@id="super-product-table"]//tr')

        if option_rows:
            for row in option_rows:
                loader = ProductLoader(item=Product(), selector=row)
                option_label = row.select(u'./td[position()=1]/text()').extract()[0]
                loader.add_value('name', base_name + ' ' + option_label)
                loader.add_value('url', response.url)
                # Regular price is collected first, then the special price.
                loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/span[@class="regular-price"]/span[@class="price"]/text()',
                                 re=u'\xa3(.*)')
                loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                                 re=u'\xa3(.*)')
                yield loader.load_item()
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', base_name)
        loader.add_value('url', response.url)
        loader.add_xpath('price', u'//div[@class="price-box"]/span[contains(@id,"product-price")]/span[@class="price"]/text()',
                         re='\xa3(.*[0-9])')
        loader.add_xpath('price', u'//div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                         re='\xa3(.*[0-9])')
        yield loader.load_item()
Ejemplo n.º 34
0
    def parse_product(self, response):
        """Yield the page's product, or one product per priced ``<option>``.

        If a pound price is found in ``//span/b`` the page is treated as a
        single product named after its ``<h1>``.  Otherwise every
        ``<option>`` text is parsed as "<name> ... £<price>"; options that do
        not follow that pattern are skipped.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1/text()').extract()[0]
        option_texts = hxs.select('//option/text()').extract()
        single_price = hxs.select('//span/b/text()').re('\xa3(.*)')

        products_data = []
        if single_price:
            products_data.append((name, single_price[0]))
        else:
            option_pattern = re.compile('(.*[0-9,a-z,A-Z\)]).*\xa3(.*[0-9])')
            for text in option_texts:
                text = re.sub('[\t\r\n]', '', text).strip()
                match = option_pattern.match(text)
                if match is None:
                    # Option without a recognisable price: ignore it rather
                    # than losing every product on the page to an exception.
                    continue
                products_data.append(match.groups())

        for item_name, item_price in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', item_name)
            loader.add_value('price', item_price)
            loader.add_value('sku', '')
            yield loader.load_item()
Ejemplo n.º 35
0
    def parse_product(self, response):
        """Yield the products found on a page, skipping any without a price.

        Pages with ``<div id="price">`` blocks inside a form yield one product
        per block; other pages yield at most one product taken from the main
        product container.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        variants = hxs.select(u'//form/div[@id="price"]')

        if variants:
            for variant in variants:
                loader = ProductLoader(item=Product(), selector=variant)
                loader.add_xpath('name', u'./h4/text()')
                loader.add_value('url', response.url)
                amount = variant.select(u'.//p[@class="price1"]/text()').re('\xa3(.*[0-9])')
                if amount:
                    loader.add_value('price', amount)
                    yield loader.load_item()
            return

        # No variant blocks: treat the page as a single product.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product"]/h1/text()')
        amount = hxs.select(u'//div[@class="product"]//p[@class="price1"]/text()').re(u'\xa3(.*)')
        if amount:
            loader.add_value('price', amount)
            yield loader.load_item()
Ejemplo n.º 36
0
    def parse_product(self, response):
        """Yield the product if its part number matches ``meta['mfrgid']``.

        The price is taken from the ``conv-box`` block, falling back to the
        ``ssa-price`` list.  SKU and identifier come from ``meta['sku']``.
        Nothing is yielded when the part number differs or no price was found.
        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)

        loader.add_xpath('price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')

        loader.add_value('url', response.url)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'].lower())

        titles = hxs.select(u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()').extract()
        loader.add_value('name', titles[0].strip())

        part_numbers = hxs.select(u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
        part_number = part_numbers[0]
        wanted = response.meta['mfrgid']
        if part_number != wanted:
            return
        if loader.get_output_value('price'):
            yield loader.load_item()
Ejemplo n.º 37
0
    def parse(self, response):
        """Yield the listed product whose model number equals ``meta['sku']``.

        Product tiles are scanned in order; the first tile whose model number
        matches the requested SKU is turned into an item and scanning stops.
        A tile with a missing model number, or a matching tile with a missing
        name, link or price, is logged and ends processing of the page.
        """
        hxs = HtmlXPathSelector(response)

        # Kept separate from the product link so that error messages always
        # show the page that was being parsed.
        page_url = response.url
        sku = response.meta['sku']

        for tile in hxs.select("//ul[@id='categoryproductWrapper']/li/div"):
            model = tile.select(".//div[@class='modelNumber']/text()").extract()
            if not model:
                # NOTE(review): this aborts the whole page, not just the
                # tile -- confirm that `return` (vs `continue`) is intended.
                logging.error('ERROR!! NO SKU!! %s "%s"' % (sku, page_url))
                return
            if model[0].strip() != sku:
                continue

            name = tile.select("header/a/text()").extract()
            if not name:
                logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, page_url))
                return
            name = name[0].strip()

            links = tile.select("header/a/@href").extract()
            if not links:
                logging.error('ERROR!! NO url!! %s "%s"' % (sku, page_url))
                return
            url = links[0].strip()

            price = tile.select(".//div[@class='descPrice']/div/text()").extract()
            if not price:
                logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' %
                              (sku, name, url))
                return
            price = price[0].strip()

            loader = ProductLoader(item=Product(),
                                   response=response,
                                   selector=hxs)
            loader.add_value('identifier', sku)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)

            yield loader.load_item()
            # Only the first matching tile is wanted.
            return
Ejemplo n.º 38
0
    def parse_search(self, response):
        """Follow pagination links and yield every product on a results page.

        Each paging link is requested with this method as the callback.
        Products without a name are skipped silently; products without a URL,
        price or master product id are logged and skipped.  Name and
        identifier are both "<name> <master product id>".
        """
        hxs = HtmlXPathSelector(response)

        # pagination
        for page in hxs.select("//div[@class='searchBarPaging']/ul/li/a/@href").extract():
            yield Request(page, callback=self.parse_search)

        # products
        for form in hxs.select("//ul[@class='productListing clearfix']/li/div/div/form"):
            titles = form.select(
                "div[@class='listItemInnerMost']/div[@class='prodMiniTop']/h4/a/span/text()"
            ).extract()
            if not titles:
                continue
            title = titles[0]

            links = form.select(
                "div[@class='listItemInnerMost']/div[@class='prodMiniTop']/h4/a/@href"
            ).extract()
            if not links:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" %
                              (response.url, title))
                continue

            # Plain price first, then the "newPrice" variant.
            prices = form.select(
                "div[@class='listItemInnerMost']/div[@class='prodMiniBottom']/\
                                 span[@class='productPrice']/span[@class='pounds']/text()"
            ).extract() or form.select(
                "div[@class='listItemInnerMost']/div[@class='prodMiniBottom']/\
                                 span[@class='productPrice']/span[@class='newPrice']/text()"
            ).extract()
            if not prices:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" %
                              (response.url, title))
                continue

            master_ids = form.select(
                "div[@class='hidden']/input[@name='masterproduct_pid']/@value"
            ).extract()
            if not master_ids:
                logging.error("ERROR! NO UNIQUE! URL: %s. NAME: %s" %
                              (response.url, title))
                continue

            label = "%s %s" % (title, master_ids[0])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', label)
            loader.add_value('name', label)
            loader.add_value('url', links[0])
            # The last matched text node is used as the price.
            loader.add_value('price', prices[-1])
            yield loader.load_item()
Ejemplo n.º 39
0
    def parse_product(self, response):
        """Yield one product per row of the ``list`` table, skipping the first row.

        URL and name come from the row's ``<h3>`` link and the price from its
        ``prixPromo`` paragraph.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        all_rows = hxs.select(u'//table[@class="list"]//tr')

        for row in all_rows[1:]:
            hrefs = row.select(u'.//h3/a/@href').extract()
            absolute_url = urljoin_rfc(get_base_url(response), hrefs[0])

            loader = ProductLoader(item=Product(), selector=row)
            loader.add_value('url', absolute_url)
            loader.add_xpath('name', u'.//h3/a/text()')
            loader.add_xpath('price', u'.//p[@class="prixPromo"]/text()',
                             re=u'([\d\.]+)')
            yield loader.load_item()
Ejemplo n.º 40
0
    def parse_image(self, response):
        """Attach an image URL to the product carried in ``meta`` and yield it.

        The first matching ``<img>`` source, made absolute, is added to the
        product.  A non-blank image URL is also stored in ``self.images``
        under the product's URL.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(item=response.meta['product'],
                               selector=response)
        sources = hxs.select('//div[@class="item"]/a/img/@src').extract()
        if sources:
            loader.add_value('image_url', urljoin_rfc(base_url, sources[0]))

        item = loader.load_item()
        has_image = 'image_url' in item and item['image_url'].strip()
        if has_image:
            self.images[item['url']] = item['image_url']

        yield item
Ejemplo n.º 41
0
    def parse(self, response):
        """Follow category links and request every product on a listing page.

        Category anchors from the sitemap and the side navigation are
        followed with their link text stored in ``meta['category']``.  Each
        listed product yields a request for its detail page
        (``self.parse_product``) carrying a loader pre-filled with url, name,
        price and category.  A page with no listing whose URL contains
        ``/product/`` is handed to ``self.parse_product`` directly.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # categories
        categories = hxs.select('//ul[@id="sitemap"]//a')
        categories += hxs.select("//div[@id='side-nav']/ul/li//a")
        for cat in categories:
            href = cat.select('@href').extract()
            if not href:
                # Anchor without a target: nothing to follow.
                continue
            label = cat.select('text()').extract()
            meta = response.meta.copy()
            # Anchors without a direct text node get an empty category
            # instead of crashing the whole callback.
            meta['category'] = label[0].strip() if label else ''
            yield Request(urljoin_rfc(base_url, href[0]), meta=meta)

        # products list
        products = hxs.select('//ul[@id="category-row-items"]/li')
        if not products:
            print("ERROR!! NO PRODUCTS!! %s " % response.url)
            logging.error("ERROR!! NO PRODUCTS!! %s " % response.url)
        for product_el in products:
            name = product_el.select(
                './/div[@class="item-name"]/h2/a/text()').extract()
            if not name:
                print("ERROR!! NO NAME!! %s " % response.url)
                logging.error("ERROR!! NO NAME!! %s " % response.url)
                continue
            name = name[0]

            url = product_el.select(
                './/div[@class="item-name"]/h2/a/@href').extract()
            if not url:
                print("ERROR!! NO URL!! %s" % response.url)
                logging.error("ERROR!! NO URL!! %s " % response.url)
                continue
            url = urljoin_rfc(base_url, url[0])

            price = product_el.select(
                './/div[@class="item-total"]/text()').extract()
            if not price:
                print("ERROR!! NO PRICE!! %s" % response.url)
                logging.error("ERROR!! NO PRICE!! %s " % response.url)
                continue
            price = price[0]

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('category', response.meta.get('category', ''))

            yield Request(url,
                          callback=self.parse_product,
                          meta={'loader': loader})

        if not products and '/product/' in response.url:
            for item in self.parse_product(response):
                yield item
Ejemplo n.º 42
0
    def parse_product(self, response):
        """Yield one product per ``product-container`` block that has a price.

        The name is the link text plus the first info-box paragraph, unless
        ``self.names`` holds an override for the product id found in the URL
        (``ID=prod<digits>``).  Blocks for which no price was collected are
        skipped.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        products = hxs.select(u'//div[contains(@class,"product-container")]')

        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            url = product.select(
                u'.//a[@class="SearchLinkBold"]/@href').extract()[0]
            url = urljoin_rfc(get_base_url(response), url)
            product_loader.add_value('url', url)
            name = product.select(
                u'.//a[@class="SearchLinkBold"]/text()').extract()[0]

            extra_name = product.select(
                u'.//div[contains(@class,"prod-info-box")]/p/text()').extract()
            if extra_name:
                name += ' ' + extra_name[0]

            r = re.search(r'ID=prod(\d+)', url)
            if r:
                product_id = r.group(1)
                log.msg('Found ' + product_id)
                name = self.names.get(product_id, name)
            product_loader.add_value('name', name)

            # Each price source is queried exactly once; which of the
            # collected values ends up in the item is decided by
            # ProductLoader's output processing.
            product_loader.add_xpath(
                'price', './/p[@class="empPrc"]/span[@class="FSprice"]/text()')
            product_loader.add_xpath('price',
                                     './/p[@class="FSprice"]/text()',
                                     re=u'.*?or 1/\$(.*)')
            product_loader.add_xpath('price', './/p[@class="Rprice"]/text()')
            if not product_loader.get_output_value('price'):
                continue
            yield product_loader.load_item()
Ejemplo n.º 43
0
    def parse_product(self, response):
        """Yield one product per ``rec`` block of the ``product-listing-2`` list.

        URL and name come from the description heading link; the price is the
        text following the dollar sign in the ``prod-price`` span.  Non-HTML
        responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        for entry in hxs.select(u'//div[@class="product-listing-2"]/div[contains(@class,"rec")]'):
            href = entry.select(u'.//div[@class="description"]/h2/a/@href').extract()[0]
            absolute_url = urljoin_rfc(get_base_url(response), href)

            loader = ProductLoader(item=Product(), selector=entry)
            loader.add_value('url', absolute_url)
            loader.add_xpath('name', u'.//div[@class="description"]/h2/a/text()')
            loader.add_xpath('price', u'.//span[@class="prod-price"]/text()',
                             re=u'\$(.*)')
            yield loader.load_item()
Ejemplo n.º 44
0
    def parse(self, response):
        """Follow navigation and paging links and yield the listed products.

        Every occurrence of ``%20/`` is removed from the base URL and from
        the URLs that are requested or stored.  For prices of the form
        "<a>-<b>" only the part before the first hyphen is kept.
        """
        URL_BASE = get_base_url(response).replace('%20/', '')
        hxs = HtmlXPathSelector(response)

        # categories, then paging links: both are simply followed
        for link_xpath in ("//div[@id='navigation']//a/@href",
                           "//div[@id='productlistsortfloatleft']//a/@href"):
            for href in hxs.select(link_xpath).extract():
                target = urljoin_rfc(URL_BASE, href)
                yield Request(target.replace('%20/', ''))

        # products list
        products = hxs.select("//div[starts-with(@class, 'grid-product')]")
        if not products:
            print("ERROR!! NO PRODUCTS!! %s " % response.url)
            logging.error("ERROR!! NO PRODUCTS!! %s" % response.url)

        for product_el in products:
            titles = product_el.select(
                ".//div[@class='title']/a//text()").extract()
            if not titles:
                continue
            title = titles[0].strip()

            hrefs = product_el.select(".//div[@class='title']/a/@href").extract()
            if not hrefs:
                print("ERROR!! NO URL!! %s" % response.url)
                continue
            product_url = urljoin_rfc(URL_BASE, hrefs[0].replace('%20/', ''))

            prices = product_el.select(
                ".//div[@class='price ']/text()").extract()
            if not prices:
                # For sale prices the first text node is dropped.
                prices = product_el.select(
                    ".//div[@class='price sale']/text()").extract()[1:]
            if not prices:
                print("ERROR!! NO PRICE!! %s" % response.url)
                continue

            price_text = prices[0]
            range_match = re.search("(.*?)-(.*?)$", price_text)
            if range_match:
                price_text = range_match.group(1)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', product_url.replace('%20/', ''))
            loader.add_value('name', title)
            loader.add_value('price', price_text)
            loader.add_value('sku', '')
            yield loader.load_item()
Ejemplo n.º 45
0
    def parse_product(self, response):
        """Yield the page's product, or follow its option links.

        Pages with a ``skulist`` select are not parsed here: every link in
        the left column is requested with ``self.parse_options`` instead.
        Otherwise a product is yielded when both a name and a price are
        found.  Its SKU is looked up in ``skus.csv`` (column 3 matched
        against the page's "Mfg Part #:" value, column 0 returned) and is
        empty when there is no match.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        options = hxs.select('//select[@id="skulist"]/option/@value').extract()
        if options:
            for url in hxs.select('//div[@class="leftcol"]//a/@href').extract():
                yield Request(urljoin_rfc(base_url, url),
                              callback=self.parse_options)
            return

        name = hxs.select(
            '//div[@id="skuinfo"]/h1[@itemprop="name"]/text()').extract()
        if not name:
            name = hxs.select(
                '//div[@class="details"]/h1/text()').extract()

        price = "".join(
            hxs.select(
                '//div[@class="club"]/span[@itemprop="Price"]/text()').re(
                    r'([0-9\,\. ]+)')).strip()
        if not price:
            price = "".join(
                hxs.select(
                    '//div[@class="details"]/div[@class="special"]/text()'
                ).re(r'([0-9\,\. ]+)')).strip()

        if not (name and price):
            return

        # The last spec labelled "Mfg Part #:" provides the model number.
        model_no = None
        for spec in hxs.select('//div[@id="specs"]/div[@class="special"]'):
            label = spec.select('./span/text()').extract()
            if label and label[0] == 'Mfg Part #:':
                model_no = "".join(spec.select("./text()").extract()).strip()

        sku_ = ''
        if model_no:
            # `with` guarantees the CSV handle is closed after the lookup.
            with open(os.path.join(HERE, 'skus.csv')) as csv_handle:
                for row in UnicodeReader(csv_handle):
                    if row[3] == model_no:
                        sku_ = row[0]
                        break

        product_loader = ProductLoader(item=Product(),
                                       response=response)
        product_loader.add_value('name', name[0])
        product_loader.add_value('sku', sku_)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        yield product_loader.load_item()
Ejemplo n.º 46
0
    def parse_products(self, hxs):
        """Yield one product per ``<li>`` whose ``itemtype`` contains "Product".

        Name, URL and price come from the element's ``itemprop`` markup.  If a
        model is present it becomes the SKU, with everything up to and
        including the first dash removed.
        """
        for node in hxs.select('//li[contains(@itemtype, "Product")]'):
            loader = ProductLoader(Product(), node)
            loader.add_xpath('name', './/a[@itemprop="name"]/text()')
            loader.add_xpath('url', './/a[@itemprop="name"]/@href')
            loader.add_xpath('price',
                             './/span[@itemprop="price"]/text()')

            model = node.select('.//span[@itemprop="model"]/text()')
            if model:
                sku = model.extract()[0]
                _, dash, remainder = sku.partition('-')
                if dash:
                    # Keep only what follows the first dash.
                    sku = remainder
                loader.add_value('sku', sku)

            yield loader.load_item()
Ejemplo n.º 47
0
    def parse(self, response):
        """Follow category links and yield the products of a listing table.

        Category and sub-category links are requested with this method as the
        callback.  The listing table is read in groups of five rows (name,
        image, spacer, price, order); names and prices are paired cell by
        cell.  A trailing group with fewer than five rows is ignored.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        categories = hxs.select("//div[@id='left-bar']/\
                                   div[@class='menu-dialog menu-categories-list']/\
                                   div[@class='content']//a/@href").extract()
        for category in categories:
            yield Request(urljoin_rfc(base_url, category), callback=self.parse)

        sub_categories = hxs.select(
            "//div[@id='center-main']/div[@class='subcategories']/a/@href"
        ).extract()
        for category in sub_categories:
            yield Request(urljoin_rfc(base_url, category), callback=self.parse)

        items_table = hxs.select(
            "//div[@id='center-main']/div[@class='dialog']/div[@class='content']/table"
        )
        rows = items_table.select("tr")

        group_size = 5
        # Stepping by index avoids repeated pop(0) and cannot run past the
        # end of the list when the row count is not a multiple of five.
        for start in range(0, len(rows) - group_size + 1, group_size):
            name_row = rows[start]
            price_row = rows[start + 3]
            for name_cell, price_cell in zip(name_row.select('td'),
                                             price_row.select('td')):
                name = name_cell.select("a/text()").extract()
                if not name:
                    logging.error("%s - ERROR! NO NAME!" % response.url)
                    continue
                name = name[0]

                url = name_cell.select("a/@href").extract()
                if not url:
                    logging.error("%s - ERROR! NO URL!" % response.url)
                    continue
                url = urljoin_rfc(base_url, url[0])

                price = price_cell.select(
                    "div[@class='price-row']/span/text()").extract()
                if not price:
                    logging.error("%s - ERROR! NO PRICE!" % response.url)
                    continue
                price = price[0]

                loader = ProductLoader(item=Product(), response=response)
                # NOTE(review): str() on a non-ASCII unicode name raises on
                # Python 2 -- confirm names are always ASCII here.
                loader.add_value('identifier', str(name))
                loader.add_value('name', name)
                loader.add_value('url', url)
                loader.add_value('price', price)
                yield loader.load_item()
Ejemplo n.º 48
0
    def parse_product(self, response):
        """Yield one product per ``shopprods`` block, with 5 added to the price.

        The listed price text is parsed as a ``Decimal``, increased by 5 and
        stored as a string.
        """
        hxs = HtmlXPathSelector(response)

        for block in hxs.select('//div[@class="shopprods"]'):
            loader = ProductLoader(item=Product(), selector=block)
            loader.add_xpath('name', './/p/strong/a/text()')

            href = block.select('.//p/strong/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            listed = block.select('.//span[@class="price"]/text()').extract()[0]
            total = Decimal(listed) + Decimal(5)
            loader.add_value('price', str(total))

            yield loader.load_item()
Ejemplo n.º 49
0
    def parse_product(self, response):
        """Return the product described by the page's ``itemprop`` markup.

        SKU and identifier are both taken from the ``productID`` meta tag.
        Returns ``None`` for non-HTML responses.
        """
        if not isinstance(response, HtmlResponse):
            return

        product_id_xpath = '//meta[@itemprop="productID"]/@content'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath(
            'name', '//div[@id="sku-detail"]//h1[@itemprop="name"]/text()')
        # Keep what follows the dollar sign, up to the last digit.
        loader.add_xpath(
            'price',
            '//div[@class="club"]/span[@itemprop="Price"]/text()',
            re=r'.*\$(.*[0-9])')
        for field in ('sku', 'identifier'):
            loader.add_xpath(field, product_id_xpath)
        loader.add_value('url', response.url)
        return loader.load_item()
Ejemplo n.º 50
0
    def parse_product(self, response):
        """Build a single product item from a product detail page.

        Returns ``None`` when ``response`` is not an ``HtmlResponse``.
        The product name is the extracted SKU when one is found; only when
        no SKU is available is the on-page product name element used.
        """
        if not isinstance(response, HtmlResponse):
            return

        loader = ProductLoader(item=Product(), response=response)

        # Only the part after the dollar sign, up to the last digit, is kept.
        loader.add_xpath('price', '//font[@class="pricecolor colors_productprice"]/text()', re='.*\$(.*[0-9])')
        loader.add_value('url', response.url)
        loader.add_xpath('sku', '//span[@class="product_code"]/text()')

        extracted_sku = loader.get_output_value('sku')
        if not extracted_sku:
            loader.add_xpath('name', '//font[@class="productnamecolorLARGE colors_productname"]/text()')
        else:
            loader.add_value('name', extracted_sku)

        return loader.load_item()
Ejemplo n.º 51
0
    def parse_product(self, response):
        """Yield a single product item scraped from a product detail page.

        Nothing is yielded when ``response`` is not an ``HtmlResponse``.
        """
        if not isinstance(response, HtmlResponse):
            return

        name_xpath = u'//form/div[not(@class)]/h1[not(@class)]/text()'
        price_xpath = u'//form//div[@class="contentText"]//div[@class="PriceList"]/div[@class="pricenow"]/text()'
        # The info cell that follows the "Model Number" label cell.
        sku_xpath = u'//td[@class="ProductPageSummaryTableInfo" and preceding-sibling::td[@class="ProductPageSummaryTable" and contains(text(),"Model Number")]]/text()'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', name_xpath)
        loader.add_value('url', response.url)
        # Keep only the text that follows the pound sign.
        loader.add_xpath('price', price_xpath, re=u'\xa3(.*)')
        loader.add_xpath('sku', sku_xpath)
        yield loader.load_item()
Ejemplo n.º 52
0
    def parse_product(self, response):
        """Yield a single product item scraped from a product page.

        The url is taken from the response; name, sku and price are read
        from the ``top_product_info_block_*`` elements of the page.

        Every field is populated through the loader's own ``add_xpath`` /
        ``add_value`` calls, so the standalone ``HtmlXPathSelector`` that
        used to be created here (and never read) has been removed.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath(
            'name',
            '//h1[@id="top_product_info_block_product_title_text"]/text()')
        loader.add_xpath(
            'sku',
            '//ul[@id="top_product_info_block_product_data_list"]/li/strong/text()'
        )
        loader.add_xpath(
            'price',
            '//p[@id="top_product_info_block_product_data_new_low_price"]/text()'
        )
        yield loader.load_item()
Ejemplo n.º 53
0
    def parse(self, response):
        """Follow navigation links and yield the products listed on the page.

        Yields, in order:

        * a ``Request`` (no explicit callback) for every category link in
          ``div#nav`` and every subcategory link in ``div#side-nav``;
        * one product item per ``prodListItemWrapIn`` element that has a
          name, a link and a price.  Elements missing any of these are
          reported and skipped.

        Diagnostics are written both to stdout and to the ``logging``
        module.  The ``print`` calls use the parenthesised single-argument
        form, which produces the same output on Python 2 and is valid
        syntax on Python 3 (the bare ``print "..."`` statement is not).
        """
        URL_BASE = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # categories
        category_urls = hxs.select("//div[@id='nav']/ul/li/a/@href").extract()
        for url in category_urls:
            url = urljoin_rfc(URL_BASE, url)
            yield Request(url)

        # subcategories
        subcategory_urls = hxs.select(
            "//div[@id='side-nav']/ul/li//a/@href").extract()
        for url in subcategory_urls:
            url = urljoin_rfc(URL_BASE, url)
            yield Request(url)

        # products list
        products = hxs.select("//div[contains(@class, 'prodListItemWrapIn')]")
        if not products:
            print("ERROR!! NO PRODUCTS!! %s " % response.url)
            logging.error("ERROR!! NO PRODUCTS!! %s " % response.url)
        for product_el in products:
            name = product_el.select(
                ".//h2[@class='prodTitle']/a/text()").extract()
            if not name:
                print("ERROR!! NO NAME!! %s " % response.url)
                logging.error("ERROR!! NO NAME!! %s " % response.url)
                continue
            name = name[0]

            url = product_el.select(
                ".//h2[@class='prodTitle']/a/@href").extract()
            if not url:
                print("ERROR!! NO URL!! %s" % response.url)
                logging.error("ERROR!! NO URL!! %s " % response.url)
                continue
            url = url[0]
            url = urljoin_rfc(URL_BASE, url)

            # Only the first text node under the price container is used.
            price = product_el.select(
                ".//div[@class='prodPrice']//text()").extract()
            if not price:
                print("ERROR!! NO PRICE!! %s" % response.url)
                logging.error("ERROR!! NO PRICE!! %s " % response.url)
                continue
            price = price[0]

            product = Product()
            loader = ProductLoader(item=product, response=response)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            # The listing exposes no SKU; the field is set to an empty value.
            loader.add_value('sku', '')
            yield loader.load_item()
Ejemplo n.º 54
0
    def parse_product(self, response):
        """Yield a single product item scraped from a product page.

        Name and price come from the page's ``<font>`` elements, the url
        from the response, and the sku from the ``product_code`` span.
        """
        hxs = HtmlXPathSelector(response)

        code_parts = hxs.select(
            '//span[@class="product_code"]/text()').extract()
        # Join all text nodes, trim the result, then drop its first three
        # characters (presumably a fixed prefix -- TODO confirm).
        sku = ''.join(code_parts).strip()[3:]

        loader = ProductLoader(response=response, item=Product())
        loader.add_xpath(
            'name',
            '//font[@class="productnamecolorLARGE colors_productname"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'price', '//font[@class="pricecolor colors_productprice"]/text()')
        loader.add_value('sku', sku)

        yield loader.load_item()
Ejemplo n.º 55
0
    def parse_product(self, response):
        """Yield one item per row of the "Items" section of the price table.

        The product name is read from the emphasised span of the product
        cell; when it is missing the error is reported and nothing is
        yielded.  The price table is then scanned for the "Items" and
        "Suggested Accessories" header rows, and every row after "Items"
        (and before "Suggested Accessories", when present) becomes an item
        named ``"<product name> - <row name>"``.  Rows without a name or a
        price are reported and skipped.

        Diagnostics are written both to stdout and to the ``logging``
        module.  The ``print`` calls use the parenthesised single-argument
        form, which produces the same output on Python 2 and is valid
        syntax on Python 3 (the bare ``print "..."`` statement is not).
        """
        hxs = HtmlXPathSelector(response)
        product_el = hxs.select(
            "/html/body/table/tr/td[2]/table[position()=last()-1]/tr/td[1]")
        name = product_el.select(
            "p[@class='normale']/span[@class='emphasis']/text()").extract()
        if not name:
            print("ERROR!! NO NAME!! %s" % response.url)
            logging.error("ERROR!! NO NAME!! %s" % response.url)
            return
        name = name[0]

        url = response.url

        # 1-based positions of the two section header rows, if present.
        items_title_pos, accessories_title_pos = None, None
        prices_table = product_el.select("table[last()]/form")
        for row in prices_table.select("tr"):
            col = row.select("td[@class='normalreverse']")
            if col:
                # Position = number of preceding sibling rows + 1.
                if col.select("strong[text()='Items']"):
                    items_title_pos = len(
                        row.select("preceding-sibling::tr").extract()) + 1
                elif col.select("strong[text()='Suggested Accessories']"):
                    accessories_title_pos = len(
                        row.select("preceding-sibling::tr").extract()) + 1
        if items_title_pos is not None:
            # Bound the selection by the accessories header when it exists.
            if accessories_title_pos is not None:
                sub_cond = " and position()<%d" % accessories_title_pos
            else:
                sub_cond = ""
            sub_items = prices_table.select("tr[position()>%d%s]" %
                                            (items_title_pos, sub_cond))
            for row in sub_items:
                sub_name = row.select("td[1]/text()").extract()
                if not sub_name:
                    print("ERROR!! NO NAME!! %s" % response.url)
                    logging.error("ERROR!! NO NAME!! %s" % response.url)
                    continue
                sub_name = sub_name[0]
                price = row.select("td[3]/text()").extract()
                if not price:
                    print("ERROR!! NO PRICE!! %s" % response.url)
                    logging.error("ERROR!! NO PRICE!! %s" % response.url)
                    continue
                price = price[0]

                product = Product()
                loader = ProductLoader(item=product, response=response)
                loader.add_value('url', url)
                loader.add_value('name', "%s - %s" % (name, sub_name))
                loader.add_value('price', price)
                loader.add_value('sku', '')
                yield loader.load_item()
Ejemplo n.º 56
0
    def parse_items(self, response):
        """Follow pagination and yield one item per hotel with a 2-person room.

        A ``Request`` for the next results page (handled by this same
        method) is yielded first when a "next" link exists.  Then, for each
        matching hotel row, the rooms whose maximum occupancy is exactly 2
        are collected and an item is yielded carrying the hotel name (also
        used as identifier), the hotel URL and the lowest room price.
        Hotels with no priced 2-person room yield nothing.
        """
        hxs = HtmlXPathSelector(response)
        URL_BASE = get_base_url(response)

        # pagination
        next_page = hxs.select("//table[contains(@class, 'prevnextbar')]/tr/td[@class='next']/a/@href").extract()
        if next_page:
            yield Request(
                url=urljoin_rfc(URL_BASE, next_page[0]),
                callback=self.parse_items
            )

        hotel_rows = hxs.select("//table[@class='hotellist']/tr[contains(@class, 'flash_deal_soldout')]")
        for hotel_row in hotel_rows:
            # hotel information
            hotel_name = hotel_row.select("td[2]/h3/a/text()").extract()[0]
            hotel_href = hotel_row.select("td[2]/h3/a/@href").extract()[0]
            hotel_url = urljoin_rfc(URL_BASE, hotel_href)

            # rooms for exactly two persons that show a price
            rooms = []
            for price_row in hotel_row.select("td[2]/div/form/table/tbody/tr"):
                max_persons = price_row.select("td[@class='maxPersons']/div/span[@class='hideme']/text()").extract()[0]
                if int(max_persons) != 2:
                    continue
                room_name = price_row.select("td[@class='roomName']/div/a/text()").extract()[0]
                room_url = price_row.select("td[@class='roomName']/div/a/@href").extract()[0]
                price_match = price_row.select("td[@class='roomPrice']/div/strong[contains(@class, 'price')]/text()").re("[\d.]+")
                if not price_match:
                    logging.error("NO PRICE! '%s' %s" % (response.url, hotel_name))
                    continue
                rooms.append({
                    'name': room_name,
                    'url': room_url,
                    'price': Decimal(price_match[0])
                })

            if not rooms:
                continue

            # min() keeps the first room among equally cheap ones.
            cheapest = min(rooms, key=lambda room: room['price'])

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', hotel_name.encode('ascii', 'replace'))
            loader.add_value('identifier', hotel_name.encode('ascii', 'replace'))
            loader.add_value('url', hotel_url)
            loader.add_value('price', cheapest['price'])
            yield loader.load_item()
Ejemplo n.º 57
0
    def parse_product(self, response):
        """Yield one item per page fragment that shows a pound-sign price.

        Nothing is yielded when ``response`` is not an ``HtmlResponse``.
        Each product is the grandparent of a ``<b>`` element whose text
        contains a pound sign; fragments whose price text does not match
        the price pattern are skipped.  Every item uses the page URL.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for product in hxs.select(u'//b[contains(text(), "\xa3")]/../..'):
            # Text after the pound sign, up to the last digit.
            price = product.select(
                u'.//b[contains(text(), "\xa3")]/text()').re('\xa3(.*[0-9])')
            if not price:
                continue

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './b/font/text()')
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            yield loader.load_item()
Ejemplo n.º 58
0
    def parseProductOption(self, response):
        """Yield a product item for one product option page.

        The name is the part of the page ``<title>`` before the first
        ``|``.  The sku (also used as identifier) and the price are taken
        from ``response.meta``.
        """
        hxs = HtmlXPathSelector(response)

        page_title = hxs.select("//title/text()").extract()[0]
        option_name = page_title.split("|")[0]

        meta = response.meta
        sku = meta['sku']
        price = meta['price']

        loader = ProductLoader(item=Product(), response=response, selector=hxs)
        loader.add_value('identifier', sku)
        loader.add_value('url', response.url)
        loader.add_value('name', option_name)
        loader.add_value('price', price)
        loader.add_value('sku', sku)

        yield loader.load_item()
Ejemplo n.º 59
0
    def parse_product(self, response):
        """Parse a product page, following special-price links and retrying.

        Behaviour, in order:

        * non-``HtmlResponse`` responses are ignored;
        * when the page links to a special-price page, a request for that
          page (handled by this same method) is yielded and parsing stops;
        * when both name and price are found, the product item is yielded;
        * otherwise the same URL is requested again with ``dont_filter``,
          counting attempts per URL in ``self.retry_urls`` and giving up
          once the count exceeds 100.

        Follow-up requests carry over the ``sku``, ``mfrgid`` and
        ``search_q`` entries of ``response.meta``.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        carried_keys = ('sku', 'mfrgid', 'search_q')

        special_links = hxs.select(
            "//td[@class='tdcf10bk']/a/@href").extract()
        if special_links:
            special_url = urljoin_rfc(get_base_url(response),
                                      special_links[0])
            follow_up = Request(url=special_url,
                                callback=self.parse_product)
            for key in carried_keys:
                follow_up.meta[key] = response.meta[key]
            yield follow_up
            return

        product_loader = ProductLoader(item=Product(), response=response)

        name = hxs.select("//h1/font/b/text()").extract()
        price = hxs.select("//font[@color='#990000']/b/text()").extract()

        if name and price:
            product_loader.add_value('name', name[0])
            product_loader.add_value('price', price[0])
            product_loader.add_value('url', response.url)
            product_loader.add_value('sku', response.meta['sku'].lower())
            product_loader.add_xpath('identifier',
                                     '//form/input[@name="PID"]/@value')
            yield product_loader.load_item()
            return

        # Name or price missing: retry the page a bounded number of times.
        attempt = self.retry_urls.get(response.url, 0) + 1
        if attempt > 100:
            self.log("ERROR MAX retry count reached (100), giving up...")
            return

        self.log(
            "ERROR parsing HTML, adding to retry queue (#{})".format(attempt))
        self.retry_urls[response.url] = attempt
        retry = Request(url=response.url,
                        callback=self.parse_product,
                        dont_filter=True)
        for key in carried_keys:
            retry.meta[key] = response.meta[key]
        yield retry
Ejemplo n.º 60
0
    def parse(self, response):
        """Follow pagination links and yield the products listed on the page.

        A ``Request`` (no explicit callback) is yielded for every
        pagination link, followed by one item per product element that has
        a name, a link and a pounds price.  The price is the pounds text,
        with ``.<pence>`` appended when a pence element is present.
        Products missing a name, link or pounds price are logged and
        skipped.
        """
        URL_BASE = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # pagination
        page_links = hxs.select(
            "//div[contains(@class, 'pagination')]/a/@href").extract()
        for href in page_links:
            yield Request(urljoin_rfc(URL_BASE, href))

        # products
        product_nodes = hxs.select(
            "//li[contains(@class, 'product')]/div[@class='product-details']")
        for node in product_nodes:
            name_parts = node.select(
                "div[contains(@class, 'product-name')]/h3/a/text()").extract()
            if not name_parts:
                logging.error('ERROR!! NO NAME!! %s' % response.url)
                continue
            name = " ".join(name_parts)

            hrefs = node.select(
                "div[contains(@class, 'product-name')]/h3/a/@href").extract()
            if not hrefs:
                logging.error('ERROR!! NO URL!! %s %s' % (response.url, name))
                continue
            product_url = urljoin_rfc(URL_BASE, hrefs[0])

            pounds = node.select(
                "div[contains(@class, 'price-spacing')]/p[@class='current-price']/span[@class='pounds']/text()"
            ).extract()
            pence = node.select(
                "div[contains(@class, 'price-spacing')]/p[@class='current-price']/span[@class='pence']/text()"
            ).extract()
            if not pounds:
                logging.error('ERROR!! NO PRICE!! %s %s' %
                              (response.url, name))
                continue
            price = pounds[0]
            if pence:
                price += "." + pence[0]

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', product_url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            # The listing exposes no SKU; the field is set to an empty value.
            loader.add_value('sku', '')

            yield loader.load_item()