コード例 #1
0
 def parse_cart(self, response):
     """Yield the product from the request meta with its cart-page price.

     The price cell is located through the quantity input named
     ``Qty<item_id>``; nothing is yielded when no numeric text is found.
     """
     price_xpath = (
         '//table[@width="980"]//tr[not(@class) and '
         './/input[@name="Qty%s"]]/td[@align="right" and @valign="top"]/text()'
     ) % response.meta['item_id']
     amounts = response.xpath(price_xpath).re(r'[\d\.,]+')
     if not amounts:
         return
     cart_loader = ProductLoader(response.meta['product'], response=response)
     cart_loader.add_value('price', amounts)
     yield cart_loader.load_item()
コード例 #2
0
    def parse_products(self, response):
        """Schedule product-page requests for every product on a listing page.

        Two layouts are handled: ``productResult`` blocks (name, price and
        identifier are forwarded in the request meta) and plain links inside
        the ``ageContent_pnlContent`` table (identifier only).  When neither
        layout yields anything and the page has no ``featuredProductsTable``
        cell, the same URL is requested again, up to 10 times.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select('//div[contains(@class, "productResult")]')
        log.msg(">>>>>>>> FOUND %s ITEMS >>>" % len(products))
        for product in products:
            names = product.select(
                './/h2[@class="productResultName"]/a/text()').extract()
            if names:
                name = names[0]
            else:
                # Reset explicitly: otherwise a missing name would raise
                # NameError on the first product or silently reuse the
                # previous product's name.
                name = None
                self.log('Cannot find name %s' % response.url)
            url = product.select(
                './/h2[@class="productResultName"]/a/@href'
            ).extract()[0]
            url = canonicalize_url(urljoin_rfc(base_url, url))
            price = ' '.join(product.select(
                './/span[@class="variantprice"]//text()').extract())
            identifier = identifier_regex.search(url).group(1)
            yield Request(url, callback=self.parse_product,
                          meta={'name': name,
                                'price': price,
                                'identifier': identifier})

        products2 = hxs.select(
            '//div[contains(@id, "ageContent_pnlContent")]'
            '/table/tr/td/table/tr[2]/td/a/@href').extract()
        for url in products2:
            identifier = identifier_regex.search(url).group(1)
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product,
                          meta={'identifier': identifier})

        if not products and not products2 and not hxs.select('//td[@id="featuredProductsTable"]'):
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                self.log('WARNING: No products and no subcategories, Retry => %s' % response.url)
                retry += 1
                new_meta = response.meta.copy()
                new_meta['retry'] = retry
                # dont_filter is required because the URL was already visited.
                yield Request(
                    response.url,
                    meta=new_meta,
                    cookies={'pagesize': 10000},
                    callback=self.parse_products,
                    dont_filter=True)
            else:
                self.log('ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'.format(response.url))
コード例 #3
0
    def parse_product(self, response):
        """Build one product item from a product detail page.

        Pages without an ``itemprop="sku"`` span yield nothing.  Name, price
        and identifier prefer the values carried in ``response.meta`` and
        fall back to what is found on the page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        sku_texts = hxs.select('.//span[@itemprop="sku"]/text()').extract()
        if not sku_texts:
            return
        sku = sku_texts[0].strip()

        name = meta.get('name', None) or ''.join(
            hxs.select('//span[@itemprop="name"]/text()').extract())

        # Brand: text after "by ", else the text preceding the SKU in the
        # name, else the manufacturer span on the page.
        by_match = re.search(r'by (.*)$', name)
        if by_match:
            brand = by_match.group(1)
        elif sku in name:
            prefix_match = re.search(r'^(.*) %s' % re.escape(sku), name)
            brand = prefix_match.groups()[0].strip() if prefix_match else ''
        else:
            brand = ''

        if not brand:
            manufacturer = response.xpath(
                '//span[@itemprop="manufacturer"]/text()').extract()
            brand = manufacturer[0].strip() if manufacturer else ''

        if 'identifier' in meta:
            identifier_source = meta
        elif 'item' in meta and 'identifier' in meta['item']:
            identifier_source = meta['item']
        else:
            identifier_source = None

        price = (meta.get('price', None)
                 or hxs.select('//div[@itemprop="price"]/span/span/text()').extract()
                 or hxs.select('//div[@itemprop="price"]/span/text()').extract()
                 or '0')

        category_nodes = hxs.select('//span[@class="SectionTitleText"]/li/a/text()')
        category = category_nodes[-1].extract() if category_nodes else ''

        image_srcs = hxs.select(
            '//div[@id="prodImageMediumBox"]//div/div/img/@src').extract()
        image_url = urljoin_rfc(base_url, image_srcs[0]) if image_srcs else ''

        product_loader = TigerChefLoader(Product(), response=response, spider_name=self.name)
        product_loader.add_value('name', name)
        if identifier_source is not None:
            product_loader.add_value('identifier', identifier_source['identifier'])
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', sku)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('image_url', image_url)

        sold_as = hxs.select(
            '//table[@id="prodInfo"]/tr/td[div/div[@itemprop="price"]]'
            '/span[@class="details"]/text()').extract()
        if sold_as:
            # Drop slashes and collapse runs of whitespace.
            sold_as_text = ' '.join(sold_as[0].replace('/', '').split())
        else:
            sold_as_text = ''

        product = product_loader.load_item()
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as_text
        product['metadata'] = metadata

        yield product
コード例 #4
0
    def parse_product(self, response):
        """Complete the product carried in ``response.meta`` with page details.

        Adds price, image, category and brand, overrides the name when the
        page provides one, and attaches the ``sold_as`` metadata (defaulting
        to ``'1 ea'``).
        """
        loader = ProductLoader(Product(response.meta['product']),
                               response=response)
        loader.add_xpath('price',
                         '//meta[@property="og:price:amount"]/@content')
        # Second price value, added after the one taken from the page.
        loader.add_value('price', 0)

        images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
        if images:
            loader.add_value('image_url', images[0])
        else:
            self.log("ERROR img not found")

        breadcrumb = response.xpath(
            '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
        if breadcrumb:
            loader.add_value('category', breadcrumb[-1])
        else:
            self.log("ERROR category not found")

        brands = (
            response.xpath('//div[@class="logo-area"]/a/@title').extract()
            or response.xpath(
                '//td[contains(text(), "Manufacturer")]/following-sibling::td/text()'
            ).extract())
        if brands:
            loader.add_value('brand', brands[0])
        else:
            self.log("ERROR brand not found")

        item = loader.load_item()

        titles = response.xpath(
            '//div[@class="product-info"]/p[@class="h1"]/text()').extract()
        if titles:
            item['name'] = titles[0].strip()

        unit_texts = response.xpath(
            '//strong[@class="price"]/span/text()').extract()
        metadata = TigerChefMeta()
        if unit_texts:
            metadata['sold_as'] = unit_texts[0].split('/ ')[-1]
        else:
            metadata['sold_as'] = '1 ea'
        item['metadata'] = metadata

        yield item
コード例 #5
0
    def parse(self, response):
        """Crawl category pages and schedule a request per listed product.

        Yields requests for main categories, sub-categories (sorted by
        lowest price), the next results page, and one ``parse_product``
        request per product carrying the partially loaded item in
        ``meta['product']``.
        """
        # Main categories
        for cat_url in response.xpath(
                '//ul[@id="main-nav"]/li/a/@href').extract():
            yield Request(response.urljoin(cat_url))

        sub_categories = response.xpath(
            '//div[contains(@class, "sub-categories")]'
            '/div/div//p/a/@href').extract()
        for sub_cat in sub_categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(sub_cat), 'sort',
                                         'lowest'))

        categories = response.xpath(
            '//ul[@class="category"]/li/a/@href').extract()
        categories += response.xpath(
            '//a[contains(@class, "shop-all-button")]/@href').extract()
        categories += response.css('.subcat-panel ::attr(href)').extract()
        for url in categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(url), 'sort',
                                         'lowest'))

        next_page = response.xpath(
            '//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
        if next_page:
            yield Request(url=response.urljoin(next_page[0]))

        products = response.xpath('//div[contains(@class, "product")]')
        for product_xs in products:
            url = product_xs.xpath('a/@href').extract()
            if not url:
                continue
            product_loader = ProductLoader(item=Product(), selector=product_xs)
            product_loader.add_value('url', url)

            try:
                sku = product_xs.xpath('p[@class="product-sku"]/text()').re(
                    'KaTom #: (.*)')[0]
            except IndexError:
                # No "KaTom #:" label in this product block.
                sku = None
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            product_loader.add_xpath('name', 'a/@title')
            product_loader.add_css('image_url', '.img ::attr(src)')
            product_loader.add_xpath('category', '//h1[@class="title"]/text()')

            product = product_loader.load_item()
            # Drop the first dash-separated segment of the SKU; the
            # identifier keeps the full value.
            if len(product.get('sku', '').split('-')) > 1:
                product['sku'] = '-'.join(product['sku'].split('-')[1:])

            yield Request(url=product_loader.get_output_value('url'),
                          meta={"product": product},
                          callback=self.parse_product)
コード例 #6
0
    def parse_products(self, response):
        """Yield one item per product on a listing page, then follow paging.

        Category and brand come from ``response.meta``, which is also
        forwarded unchanged to the next-page request.  ``sold_as`` defaults
        to ``'1 ea'`` when the listing shows no "Sold As" text.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        for product_xs in hxs.select('//li[contains(@itemtype, "Product")]'):
            element_ids = product_xs.select('@id').extract()
            if not element_ids:
                # Without an id there is no identifier; skip this entry
                # instead of aborting the whole page (and its pagination).
                self.log('WARNING: product without id attribute => %s'
                         % response.url)
                continue

            product_loader = ProductLoader(Product(),
                                           product_xs,
                                           spider_name=self.name)
            product_loader.add_xpath('name', './/a[@itemprop="name"]/text()')
            product_loader.add_xpath('url', './/a[@itemprop="name"]/@href')
            product_loader.add_xpath('price',
                                     './/span[@itemprop="price"]/text()')
            product_loader.add_xpath('image_url', 'div/a/img/@src')
            # The element id looks like "...product_<identifier>".
            product_loader.add_value('identifier',
                                     element_ids[0].split('product_')[-1])
            product_loader.add_value('category', meta.get('category'))
            product_loader.add_value('brand', meta.get('brand'))

            sku = product_xs.select(
                './/span[@itemprop="model"]/text()').extract()
            if sku:
                product_loader.add_value('sku', sku[0])

            sold_as = product_xs.select(
                'div/div/div/div/span[contains(text(), "Sold As")]/text()'
            ).extract()
            item = product_loader.load_item()

            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as[0].split(
                'Sold As: ')[-1].strip() if sold_as else '1 ea'
            item['metadata'] = metadata

            yield item

        next_page = hxs.select(
            '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          callback=self.parse_products,
                          meta=meta)
コード例 #7
0
    def parse(self, response):
        """Parse one search-results page.

        Follows the "Next" link when present, otherwise sets
        ``self._search_done``.  Products showing a price are yielded
        directly; products marked with ``see-price-sprite`` are appended to
        ``self._add_to_cart_products`` as ``(item_id, url, product)``.
        """
        next_page = response.xpath(
            '//*[@class="pagelinks"]/following-sibling::td//a[contains(text(), "Next")]/@href'
        ).extract()
        if next_page:
            yield Request(response.urljoin(next_page[0]),
                          meta={'dont_merge_cookies': True},
                          dont_filter=True)
        else:
            self._search_done = True

        products_xs = response.xpath('//td[contains(@class, "search-prod")]')
        for product_xs in products_xs:
            # assumes the file name in the title link has at least three
            # dot-separated parts -- TODO confirm against real URLs
            product_id = product_xs.xpath(
                './/*[@class="search-item-title"]/a/@href').extract()[0].split(
                    '/')[-1].split('.')[2]

            # Exactly two link texts mean (brand, sku); any other count
            # only provides a brand, taken from the first text if present.
            labels = product_xs.xpath(
                './/*[@class="search-item-title"]/following-sibling::div/a/text()'
            ).extract()
            sku = None
            if len(labels) == 2:
                brand, sku = labels
            else:
                brand = labels[0] if labels else None

            # A real list (not a lazy map object) so the truth test and
            # indexing below behave the same on Python 2 and 3.
            image_urls = [response.urljoin(src)
                          for src in product_xs.xpath('.//img/@src').extract()]
            price = product_xs.xpath('.//*[@class="search-item-price"]').re(
                r'[\d\.,]+')
            add_to_cart = bool(
                product_xs.xpath(
                    './/*[@class="search-item-price"]/span[@class="see-price-sprite"]'
                ))
            loader = ProductLoader(item=Product(), selector=product_xs)
            identifier = product_id
            if sku:
                identifier = identifier + ' ' + sku.lower()

            loader.add_value('identifier', identifier)
            if sku:
                loader.add_value('sku', sku)
            loader.add_xpath('url', './/*[@class="search-item-title"]/a/@href')
            if image_urls:
                # Swap the small thumbnail for the medium-size image.
                loader.add_value(
                    'image_url',
                    image_urls[0].replace('/pics/sm/',
                                          '/pics/md/').replace('sm_', 'md_'))
            if brand:
                loader.add_value('brand', brand)
            loader.add_xpath(
                'name', './/*[@class="search-item-title"]/a/strong/text()')
            if price:
                loader.add_value('price', price[0])
                yield loader.load_item()
            elif add_to_cart:
                product = loader.load_item()
                url = response.urljoin(
                    product_xs.xpath(
                        './/a[@class="atc-primary"]/@href').extract()[0])
                item_id = url_query_parameter(url, 'ItemID')
                self._add_to_cart_products.append((item_id, url, product))
コード例 #8
0
    def parse_product(self, response):
        """Build a product item from a detail page.

        Products without a non-blank identifier are dropped.  When the page
        shows a struck-out price, the product form is posted to the cart
        and ``parse_price`` receives the item in ``meta['product']``;
        otherwise the item is yielded directly.
        """
        schema = SpiderSchema(response)
        data = schema.get_product()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', data['Name'])
        loader.add_xpath('category',
                         u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')
        price = response.xpath(
            '//form[@id="productform"]/input[@name="price"]/@value').extract()
        if price:
            loader.add_value('price', price[0])
        else:
            loader.add_value(
                'price',
                data.get('offers', {}).get('properties',
                                           {}).get('price', '0.0'))

        # Comprehension instead of map(unicode.strip, ...): same result on
        # Python 2, and still a list on Python 3.
        sku = [text.strip() for text in response.xpath(
            '//span[contains(@class, "mfr-number")]/text()').extract()]
        loader.add_value('identifier', data['productID'])
        if sku:
            loader.add_value('sku', sku)
        else:
            loader.add_value('sku', data['productID'].replace('#', ''))

        image_url = data.get('image', '').replace('www.example.com',
                                                  'www.webstaurantstore.com')
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))

        brand = data.get('brand', '')
        if not brand:
            brand = response.xpath(
                '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
            ).extract()
            brand = brand[0].strip() if brand else ''

        if brand:
            loader.add_value('brand', brand)

        sold_as = response.xpath(
            '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
        ).extract()

        product = loader.load_item()
        if product.get('identifier', '').strip() != '':
            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as[0].replace('/',
                                                     '') if sold_as else ''
            product['metadata'] = metadata

            # Add to cart to see the price
            if response.xpath(
                    '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'
            ):
                cart_url = 'http://www.webstaurantstore.com/viewcart.html'
                # Read name and value per input: zipping two separately
                # extracted lists misaligns the pairs as soon as one input
                # lacks either attribute.
                formdata = {}
                for form_input in response.xpath(
                        '//form[@id="productform"]/input'):
                    field_name = form_input.xpath('./@name').extract()
                    if not field_name:
                        continue
                    field_value = form_input.xpath('./@value').extract()
                    formdata[field_name[0]] = (field_value[0]
                                               if field_value else '')
                # quantity
                formdata[u'qty'] = '1'
                f_request = FormRequest(url=cart_url,
                                        method='POST',
                                        formdata=formdata,
                                        callback=self.parse_price,
                                        meta={
                                            'product': product,
                                            'dont_merge_cookies': True
                                        },
                                        dont_filter=True)

                yield f_request
            else:
                yield product
コード例 #9
0
ファイル: ckitchen.py プロジェクト: oceancloud82/scraping
    def parse_product(self, response):
        """Yield the main product of a page, then every option variant.

        Core fields come from the page's structured product data; the
        ``sold_as`` unit is taken from the text following a "Priced per",
        "Priced by" or "Price per" marker in the product details, defaulting
        to ``'each'``.
        """
        page_schema = SpiderSchema(response)
        product_data = page_schema.get_product()

        sku = product_data['sku']
        main_name = product_data['name']
        raw_price = product_data['offers']['properties']['price']
        main_price = extract_price(raw_price.replace(' ', ''))
        brand = product_data['brand']
        image_url = product_data['image']
        breadcrumb = page_schema.data['items'][1]['properties']['itemListElement']
        category = [entry['properties']['name'] for entry in breadcrumb][0]

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (('name', main_name),
                             ('identifier', sku),
                             ('price', main_price),
                             ('sku', sku),
                             ('brand', brand),
                             ('category', category),
                             ('url', response.url),
                             ('image_url', image_url)):
            loader.add_value(field, value)

        description = ' '.join(
            response.xpath('//*[@class="product-details"]//text()').extract())

        # Later markers override earlier ones when several are present.
        sold_as = ''
        for marker in ('Priced per', 'Priced by', 'Price per'):
            if marker in description:
                sold_as = description.split(marker)[1]
        # Keep only the text before the first ';', then '.', then ','.
        for separator in (';', '.', ','):
            sold_as = sold_as.split(separator)[0]
        sold_as = sold_as or 'each'

        product = loader.load_item()
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as
        product['metadata'] = metadata

        yield product

        for option in self._parse_options(response, product):
            yield option
コード例 #10
0
    def parse_product(self, response):
        """Build a product item from a detail page.

        Nothing is yielded when the item number cannot be read from the
        wishlist link.  A missing price falls back to ``'0.00'``; a missing
        SKU or "Sold As" entry falls back to an empty string.
        """
        itemno = response.xpath(
            '//div[@id="product-main-info"]//a[contains(@id, '
            '"wishlist_link_")]/@id').re(r'(\d+)')
        if not itemno:
            self.log('ERROR: itemno not found => %s' % response.url)
            return
        itemno = itemno[0]

        # Only the last two numeric fragments of the price text are joined.
        price = ''.join(
            response.xpath('//span[@id="the-price"]//text()').re(r'[\d\.,]+')
            [-2:])
        if not price:
            self.log('WARNING: price not found => %s' % response.url)
            price = '0.00'

        sku_texts = response.xpath('//li[@itemprop="sku"]/text()').extract()
        if sku_texts:
            sku = sku_texts[0].replace('Model #:', '').strip()
        else:
            self.log('WARNING: SKU not found => %s' % response.url)
            # Keep sku a string: the identifier below concatenates it, and
            # an empty list there raised TypeError.
            sku = ''

        brand = response.xpath('//li[@itemprop="name"]/text()').extract()
        image_url = response.xpath(
            '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
        category = response.xpath('//span[@class="breadcrumb-element"]'
                                  '//*[@itemprop="name"]/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
        loader.add_value('price', price)
        if sku:
            loader.add_value('sku', sku)
        if image_url:
            loader.add_value('image_url', image_url)
        if brand:
            loader.add_value('brand', brand)
        # Same format as before, so identifiers of products that do have a
        # SKU are unchanged.
        loader.add_value('identifier', itemno + ' ' + sku)
        if category:
            loader.add_value('category', category[0].strip())

        product = loader.load_item()

        sold_as_texts = response.xpath(
            '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
        metadata = TigerChefMeta()
        # A page without a "Sold As:" entry no longer raises IndexError.
        metadata['sold_as'] = sold_as_texts[0].strip() if sold_as_texts else ''
        product['metadata'] = metadata

        yield product
コード例 #11
0
    def parse_products(self, response, hxs):
        """Yield items, or cart requests, for each product in a result list.

        Products whose price text reads "Instant Rebate" or "Add to Cart"
        are given price ``'0.0'`` and resolved through the cart: one request
        adds the SKU (handled by ``parse_cart`` with the loader and metadata
        in meta) and a second one empties the cart.  All other products are
        yielded directly.  ``hxs`` is accepted for interface compatibility
        and is not used.
        """
        base_url = get_base_url(response)
        for product_xs in response.css('.product-result'):
            loader = ProductLoader(selector=product_xs, item=Product(), spider_name=self.name)
            url = product_xs.select('.//a/@href').extract()
            if not url:
                self.log('ERROR: no product URL found! URL:{}'.format(response.url))
                continue
            url = urljoin_rfc(base_url, url[0])
            loader.add_value('url', url)

            # Raw strings keep the regex escapes literal.
            sku = product_xs.select('.//a/text()').re(r'\((.*?)\)')
            if not sku:
                self.log('ERROR: no SKU found!')
            else:
                loader.add_value('sku', sku[0])
                product_id = product_xs.select('.//a/@href').re(r'p(\d+)\.aspx')
                if not product_id:
                    self.log('ERROR: no product ID found!')
                else:
                    loader.add_value('identifier', product_id[0] + '_' + sku[0])

            # "psrc" is tried first, then a plain "src" inside a div.
            product_image = product_xs.select('.//a/img/@psrc').extract()
            if not product_image:
                product_image = product_xs.select('.//div/img/@src').extract()
                if not product_image:
                    self.log('ERROR: no product Image found!')
            if product_image:
                image = urljoin_rfc(base_url, product_image[0].strip())
                loader.add_value('image_url', image)

            price = ''.join(product_xs.select('./div[contains(@class,"-price")]/text()').extract()).strip()
            check_cart = False
            if 'Instant Rebate' in price or 'Add to Cart' in price:
                price = '0.0'
                check_cart = True
            if not price:
                price = ''.join(product_xs.select('./div[contains(@class,"-price")]/span/text()').extract()).strip()
                if not price:
                    self.log('ERROR: no price found! URL:{} Product URL:{}'.format(response.url, url))
                    continue
            loader.add_value('price', price.strip())

            # Absolute XPath: the category is read from the page heading.
            category = product_xs.select('//div[contains(@class, "content")]/h1/text()').extract()
            if not category:
                self.log("ERROR: category not found")
            else:
                loader.add_value('category', category[0].strip())

            names = product_xs.select('.//a/text()').extract()
            if not names:
                # Previously an IndexError here aborted the whole page.
                self.log('ERROR: no product name found! URL:{} Product URL:{}'.format(response.url, url))
                continue
            name = names[0]
            loader.add_value('name', name)

            # The brand is the part of the name before " (".
            loader.add_value('brand', name.split(' (')[0])

            sold_as = product_xs.select('div//span[@class="unit-of-sale"]/text()').extract()
            sold_as = sold_as[0].split('/')[-1] if sold_as else '1 ea'

            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as

            if check_cart:
                sku_id = product_xs.select('div[@class="adcWinnowedItem"]/button/@atc-skuid').extract()[0]
                add_cart_url = "https://www.foodservicewarehouse.com/ViewCart/AddSkuToCart?skuID=" + sku_id + "&quantity=1"
                req = Request(add_cart_url, dont_filter=True, callback=self.parse_cart, meta={'loader':loader, 'metadata':metadata, 'sku_id': sku_id})
                req.meta['proxy'] = self.CART_PROXY
                yield req
                req = Request('https://www.foodservicewarehouse.com/ViewCart/RemoveAll/', dont_filter=True, callback=self.parse_cart, meta={'clean_cart':True})
                req.meta['proxy'] = self.CART_PROXY
                yield req
            else:
                item = loader.load_item()
                item['metadata'] = metadata
                yield item
コード例 #12
0
 def parse(self, response):
     """Yield one item per entry of the JSON product list embedded in a script.

     Nothing is yielded when no script text matches.  URL, image and
     category are taken from the page; the remaining fields come from each
     JSON entry.
     """
     # Raw string: the pattern text is unchanged, but "\[" and "\]" are no
     # longer invalid string escapes.
     payload = response.xpath('//script/text()').re(r"products', (\[{.+}\])")
     if not payload:
         return

     # The breadcrumb is the same for every entry, so read it once.
     breadcrumb = response.css('.breadcrumb a::text').extract()

     for entry in json.loads(payload[0]):
         loader = ProductLoader(item=Product(), response=response, spider_name=self.name)
         loader.add_xpath('url', '//link[@rel="canonical"]/@href')
         loader.add_value('sku', entry['sku'])
         loader.add_value('identifier', str(entry['sqlProductID']) + '_' + entry['sku'])
         loader.add_value('name', entry['name'])
         loader.add_value('price', entry['price'])
         if breadcrumb:
             # An empty breadcrumb used to raise IndexError here.
             loader.add_value('category', breadcrumb[-1])
         loader.add_value('brand', entry['manufacturer'])
         loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
         # Every inventoryStatus other than 3 is reported as stock 1.
         loader.add_value('stock', int(entry['inventoryStatus'] != 3))
         yield loader.load_item()
コード例 #13
0
    def parse_product(self, response):
        """Build a product item from a detail page.

        The text next to the quantity label is normalised and, unless it is
        "each", added as a further ``name`` value; it is also stored as the
        ``sold_as`` metadata (``'1 ea'`` when absent).
        """
        hxs = HtmlXPathSelector(response)

        titles = hxs.select('//h1[@id="partNameId"]/text()').extract()

        quantity = hxs.select(
            '//label[@class="productdetail-qtytxt"]/../text()[last()]'
        ).extract()
        if quantity:
            unit_text = quantity[0]
            # Turn line breaks and tabs into spaces, then squeeze the runs.
            for whitespace in ('\n', '\r', '\t'):
                unit_text = unit_text.replace(whitespace, ' ')
            quantity = re.sub(' +', ' ', unit_text.strip())

        loader = ProductLoader(response=response,
                               item=Product(),
                               spider_name=self.name)

        if titles:
            loader.add_value('name', titles[0].strip())
        else:
            self.log("ERROR name not found")

        brands = hxs.select(
            '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()'
        ).extract()
        if brands:
            loader.add_value("brand", brands[0].strip())
        else:
            self.log("ERROR brand not found")

        images = hxs.select(
            '//div[@class="productdetail-productimage"]/a/img/@src').extract()
        if images:
            loader.add_value("image_url", images[0])
        else:
            self.log("ERROR img_url not found")

        crumbs = hxs.select(
            '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()'
        ).extract()
        if crumbs:
            loader.add_value("category", crumbs[0].strip())
        else:
            self.log("ERROR category not found")

        if quantity and quantity.lower() != 'each':
            loader.add_value('name', quantity)

        loader.add_value('url', response.url)
        loader.add_xpath('price',
                         '//font[@class="txt-purchaseprice20blue"]/text()')

        model_text = ''.join(
            hxs.select('//b[contains(text(), "Model #:")]/../text()').extract())
        sku = model_text.strip()
        # A model number printed twice collapses to a single copy.
        pieces = sku.split()
        if len(pieces) == 2 and pieces[0] == pieces[1]:
            sku = pieces[0]
        loader.add_value('sku', sku)
        loader.add_xpath('identifier',
                         '//form//input[@name="productId"]/@value')

        item = loader.load_item()

        metadata = TigerChefMeta()
        metadata['sold_as'] = quantity or '1 ea'
        item['metadata'] = metadata

        yield item