Beispiel #1
0
    def spider_closed(self, spider):
        """Yield a product for each stored data row whose key is not in ``seen``.

        For every key of ``self.all_prod_data`` that is not in ``seen``, a
        ``Product`` is built from the stored row (text fields are decoded
        from UTF-8, the price is passed through unchanged).  When the key is
        also present in ``self.sold_as`` the decoded value is attached as
        ``TigerChefMeta`` metadata.  After the loop the crawl's products CSV
        is copied to ``self.all_products_file``.

        NOTE(review): ``seen`` is not defined inside this method; it is
        presumably a module-level collection of already emitted keys --
        confirm against the rest of the file.
        NOTE(review): because the body contains ``yield``, calling this
        method only creates a generator; nothing runs until it is iterated.
        """
        # Row fields that are stored as byte strings and need decoding.
        text_fields = ('identifier', 'brand', 'category', 'url', 'name',
                       'sku', 'image_url')

        self.log('Loading remaining products')
        for key in self.all_prod_data:
            if key not in seen:
                row = self.all_prod_data[key]
                loader = ProductLoader(item=Product(),
                                       selector=HtmlXPathSelector())
                for field in text_fields:
                    loader.add_value(field, row[field].decode('utf8'))
                loader.add_value('price', row['price'])
                item = loader.load_item()

                if key in self.sold_as:
                    metadata = TigerChefMeta()
                    metadata['sold_as'] = self.sold_as[key].decode('utf8')
                    item['metadata'] = metadata

                yield item

        source_csv = 'data/%s_products.csv' % spider.crawl_id
        shutil.copy(source_csv, self.all_products_file)
Beispiel #2
0
    def parse_product(self, response):
        """Yield the page's main product, then every option of it.

        The product fields come from the structured data returned by
        ``SpiderSchema(response).get_product()``.  The ``sold_as`` metadata is
        derived from the text of the ``product-details`` element: the text
        following a "Priced per" / "Priced by" / "Price per" marker, cut at
        the first ``;``, ``.`` or ``,``.  It falls back to ``'each'`` when
        nothing is found.  Options are produced by ``self._parse_options``.
        """
        schema = SpiderSchema(response)
        data = schema.get_product()

        sku = data['sku']
        name = data['name']
        raw_price = data['offers']['properties']['price']
        price = extract_price(raw_price.replace(' ', ''))
        brand = data['brand']
        image_url = data['image']
        list_elements = schema.data['items'][1]['properties']['itemListElement']
        # Every element is read before the first one is picked, so a
        # malformed later element still raises, as it always did.
        category = [entry['properties']['name'] for entry in list_elements][0]

        loader = ProductLoader(item=Product(), response=response)
        field_values = (
            ('name', name),
            ('identifier', sku),
            ('price', price),
            ('sku', sku),
            ('brand', brand),
            ('category', category),
            ('url', response.url),
            ('image_url', image_url),
        )
        for field, value in field_values:
            loader.add_value(field, value)

        details_text = response.xpath(
            '//*[@class="product-details"]//text()').extract()
        description = ' '.join(details_text)

        sold_as = ''
        # Markers are tried in this order; a later match overrides an
        # earlier one.
        for marker in ('Priced per', 'Priced by', 'Price per'):
            if marker in description:
                sold_as = description.split(marker)[1]
        # Trim at the first separator of each kind, in this order.
        for separator in (';', '.', ','):
            if separator in sold_as:
                sold_as = sold_as.split(separator)[0]
        sold_as = sold_as or 'each'

        item = loader.load_item()
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as
        item['metadata'] = metadata

        yield item

        for option in self._parse_options(response, item):
            yield option
    def parse_product(self, response):
        """Parse a product detail page and yield a single ``Product``.

        The item number is taken from the wishlist link id; without it the
        page is logged as an error and nothing is yielded.  Price, SKU,
        brand, image and category are read from the page markup.  A missing
        price is logged and replaced by ``'0.00'``; a missing SKU is logged
        and treated as an empty string.  The identifier is
        ``itemno + ' ' + sku``.

        The "Sold As" text is attached as ``TigerChefMeta`` metadata.  When
        the page has no such entry a warning is logged and the value is left
        empty instead of raising and losing the product.
        """
        itemno = response.xpath(
            '//div[@id="product-main-info"]//a[contains(@id, '
            '"wishlist_link_")]/@id').re(r'(\d+)')
        if not itemno:
            self.log('ERROR: itemno not found => %s' % response.url)
            return
        itemno = itemno[0]

        # Only the last two numeric chunks of the price element are joined.
        price = ''.join(
            response.xpath('//span[@id="the-price"]//text()').re(r'[\d\.,]+')
            [-2:])
        if not price:
            self.log('WARNING: price not found => %s' % response.url)
            price = '0.00'

        sku = response.xpath('//li[@itemprop="sku"]/text()').extract()
        if sku:
            sku = sku[0].replace('Model #:', '').strip()
        else:
            self.log('WARNING: SKU not found => %s' % response.url)
            # Normalise to a string: the identifier below concatenates the
            # SKU, and an empty list would raise TypeError there.
            sku = ''

        brand = response.xpath('//li[@itemprop="name"]/text()').extract()
        image_url = response.xpath(
            '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
        category = response.xpath('//span[@class="breadcrumb-element"]'
                                  '//*[@itemprop="name"]/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
        loader.add_value('price', price)
        if sku:
            loader.add_value('sku', sku)
        if image_url:
            loader.add_value('image_url', image_url)
        if brand:
            loader.add_value('brand', brand)
        # Same format whether or not a SKU was found, so identifiers of
        # products that do have a SKU are unchanged.
        loader.add_value('identifier', itemno + ' ' + sku)
        if category:
            loader.add_value('category', category[0].strip())

        product = loader.load_item()

        sold_as = response.xpath(
            '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
        if sold_as:
            sold_as = sold_as[0].strip()
        else:
            # Previously an IndexError here discarded the whole product.
            self.log('WARNING: sold_as not found => %s' % response.url)
            sold_as = ''
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as
        product['metadata'] = metadata

        yield product
Beispiel #4
0
    def parse_products(self, response):
        """Yield one product per listing entry, then request the next page.

        Each ``<li>`` whose ``itemtype`` contains "Product" becomes an item.
        Name, URL, price and image are read from the entry, the identifier
        from the entry's ``id`` attribute (the part after ``product_``), and
        category and brand from ``response.meta``.  The "Sold As" text is
        attached as ``TigerChefMeta`` metadata, defaulting to ``'1 ea'``.
        When a "next" pager link exists, the same callback is scheduled for
        it with the same meta.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        for node in hxs.select('//li[contains(@itemtype, "Product")]'):
            loader = ProductLoader(Product(), node, spider_name=self.name)
            loader.add_xpath('name', './/a[@itemprop="name"]/text()')
            loader.add_xpath('url', './/a[@itemprop="name"]/@href')
            loader.add_xpath('price', './/span[@itemprop="price"]/text()')
            loader.add_xpath('image_url', 'div/a/img/@src')

            node_id = node.select('@id').extract()[0]
            loader.add_value('identifier', node_id.split('product_')[-1])
            loader.add_value('category', meta.get('category'))
            loader.add_value('brand', meta.get('brand'))

            model = node.select('.//span[@itemprop="model"]/text()')
            if model:
                # The full model text is used as the SKU.  (A disabled step
                # in the original would have dropped everything up to and
                # including the first '-'.)
                loader.add_value('sku', model.extract()[0])

            sold_as_texts = node.select(
                'div/div/div/div/span[contains(text(), "Sold As")]/text()'
            ).extract()
            item = loader.load_item()

            if sold_as_texts:
                sold_as = sold_as_texts[0].split('Sold As: ')[-1].strip()
            else:
                sold_as = '1 ea'
            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as
            item['metadata'] = metadata

            yield item

        next_links = hxs.select(
            '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
        if next_links:
            next_url = urljoin_rfc(base_url, next_links[0])
            yield Request(next_url, callback=self.parse_products, meta=meta)
Beispiel #5
0
    def parse_product(self, response):
        """Complete the product carried in ``response.meta`` and yield it.

        Starts from ``response.meta['product']`` and adds price, image,
        category (last breadcrumb link) and brand from the page.  A missing
        image, category or brand is logged.  The page title, when present,
        overwrites the item's name after loading.  The text after ``'/ '``
        in the price label is stored as ``sold_as`` metadata, defaulting to
        ``'1 ea'``.
        """
        seed = response.meta['product']

        loader = ProductLoader(Product(seed), response=response)
        loader.add_xpath('price',
                         '//meta[@property="og:price:amount"]/@content')
        # A literal 0 is always added after the page price.
        # NOTE(review): presumably a fallback for when the meta tag is
        # missing -- depends on ProductLoader's price processing; confirm.
        loader.add_value('price', 0)

        titles = response.xpath(
            '//div[@class="product-info"]/p[@class="h1"]/text()').extract()

        images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
        if images:
            loader.add_value('image_url', images[0])
        else:
            self.log("ERROR img not found")

        crumbs = response.xpath(
            '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
        if crumbs:
            loader.add_value('category', crumbs[-1])
        else:
            self.log("ERROR category not found")

        # The logo title is preferred; the "Manufacturer" table cell is only
        # queried when the logo yields nothing.
        brands = (
            response.xpath('//div[@class="logo-area"]/a/@title').extract()
            or response.xpath(
                '//td[contains(text(), "Manufacturer")]/following-sibling::td/text()'
            ).extract()
        )
        if brands:
            loader.add_value('brand', brands[0])
        else:
            self.log("ERROR brand not found")

        item = loader.load_item()

        if titles:
            item['name'] = titles[0].strip()

        price_labels = response.xpath(
            '//strong[@class="price"]/span/text()').extract()
        metadata = TigerChefMeta()
        if price_labels:
            metadata['sold_as'] = price_labels[0].split('/ ')[-1]
        else:
            metadata['sold_as'] = '1 ea'
        item['metadata'] = metadata

        yield item
Beispiel #6
0
    def parse_product(self, response):
        """Parse a product page into one ``Product`` with ``sold_as`` metadata.

        Nothing is yielded when the page has no ``itemprop="sku"`` text.
        Name, identifier and price are taken from ``response.meta`` when
        available, otherwise from the page (the price falls back to
        ``'0'``).  The brand is derived from the name: the text after
        "by ", or else the text preceding the SKU.  If that gives nothing,
        the ``manufacturer`` element is used.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta

        sku_texts = hxs.select('.//span[@itemprop="sku"]/text()').extract()
        if not sku_texts:
            return
        sku = sku_texts[0].strip()

        name = meta.get('name', None)
        if not name:
            name_parts = hxs.select('//span[@itemprop="name"]/text()').extract()
            name = ''.join(name_parts)

        brand = ''
        by_match = re.search(r'by (.*)$', name)
        if by_match:
            brand = by_match.group(1)
        elif sku in name:
            # Brand is whatever precedes " <sku>" in the name, if anything.
            before_sku = re.search(r'^(.*) %s' % re.escape(sku), name)
            if before_sku is not None:
                brand = before_sku.groups()[0].strip()

        if not brand:
            makers = response.xpath(
                '//span[@itemprop="manufacturer"]/text()').extract()
            brand = makers[0].strip() if makers else ''

        loader = TigerChefLoader(Product(), response=response,
                                 spider_name=self.name)
        loader.add_value('name', name)
        if 'identifier' in meta:
            loader.add_value('identifier', meta['identifier'])
        elif 'item' in meta and 'identifier' in meta['item']:
            loader.add_value('identifier', meta['item']['identifier'])

        # Price sources in order of preference: meta, nested span, span.
        price = meta.get('price', None)
        if not price:
            price = hxs.select(
                '//div[@itemprop="price"]/span/span/text()').extract()
        if not price:
            price = hxs.select(
                '//div[@itemprop="price"]/span/text()').extract()
        loader.add_value('price', price or '0')

        loader.add_value('url', response.url)
        loader.add_value('sku', sku)

        category_nodes = hxs.select(
            '//span[@class="SectionTitleText"]/li/a/text()')
        if category_nodes:
            category = category_nodes[-1].extract()
        else:
            category = ''
        loader.add_value('category', category)
        loader.add_value('brand', brand)

        image_srcs = hxs.select(
            '//div[@id="prodImageMediumBox"]//div/div/img/@src').extract()
        if image_srcs:
            image_url = urljoin_rfc(base_url, image_srcs[0])
        else:
            image_url = ''
        loader.add_value('image_url', image_url)

        sold_as_texts = hxs.select('//table[@id="prodInfo"]/tr/td[div/div[@itemprop="price"]]/span[@class="details"]/text()').extract()
        item = loader.load_item()

        metadata = TigerChefMeta()
        if sold_as_texts:
            # Drop slashes and collapse whitespace runs to single spaces.
            words = sold_as_texts[0].replace('/', '').split()
            metadata['sold_as'] = ' '.join(words)
        else:
            metadata['sold_as'] = ''
        item['metadata'] = metadata

        yield item
Beispiel #7
0
    def get_products(self, hxs, url):
        """Return the products found on a listing page or a single product page.

        Every ``<li>`` of the ``productListResultOL`` list becomes one item.
        Brand, category and SKU are additionally taken from
        ``self.prod_data`` when the identifier is known there, and
        ``sold_as`` metadata is attached when the identifier is present in
        ``self.sold_as``.

        When the listing produced nothing and the page has a
        ``productName fn`` heading, the page is parsed as a single product.
        That item uses ``url`` as its URL, gets ``sold_as`` metadata
        (default ``'1 ea'``), and is only kept if it has an identifier.

        :param hxs: selector for the page being parsed.
        :param url: URL of the page, used for the single-product fallback.
        :return: list of loaded product items.
        """
        root_url = 'https://www.instawares.com'
        res = []
        listing = hxs.select(
            '//ol[starts-with(@class, "productListResultOL")]/li')
        for node in listing:
            loader = ProductLoader(item=Product(), selector=node)
            loader.add_xpath('name', './/div[@class="listResultsDescriptionDiv"]/a/text()')
            loader.add_xpath('identifier', './/div[@class="listResultsDescriptionDiv"]/dl/dd[1]/text()')
            loader.add_xpath('price', './/div[@class="listResultPrice"]/text()')
            loader.add_xpath('brand', './/div[@class="listResultsDescriptionDiv"]/dl/dt[contains(text(), "By")]/following-sibling::dd/text()')
            # Kept in its own name so the ``url`` parameter is not shadowed.
            product_href = node.select('.//div[@class="listResultsDescriptionDiv"]/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(root_url, product_href))

            identifier = loader.get_output_value('identifier')
            if identifier in self.prod_data:
                row = self.prod_data[identifier]
                loader.add_value('brand', row['brand'].decode('utf8'))
                loader.add_value('category', row['category'].decode('utf8'))
                loader.add_value('sku', row['sku'].decode('utf8'))

            image_url = node.select('.//img[@class="productimagelarge"]/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(root_url, image_url[0]))

            item = loader.load_item()
            if item['identifier'] in self.sold_as:
                metadata = TigerChefMeta()
                metadata['sold_as'] = self.sold_as[item['identifier']]
                item['metadata'] = metadata

            # Append the item that carries the metadata instead of loading
            # the item a second time.
            res.append(item)

        if not res and hxs.select('//h1[@class="productName fn"]/text()'):
            loader = ProductLoader(selector=hxs, item=Product(), spider_name=self.name)
            loader.add_value('url', url)
            loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
            loader.add_xpath('price', '//li[@class="price"]//text()')
            loader.add_xpath('sku', '//div[starts-with(@class, "specificationContent")]' +
                                    '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')
            loader.add_xpath('identifier', '//td[@itemprop="productID"]/text()')

            brand = hxs.select('//td[@class="brand"]/text()').extract()
            if not brand:
                self.log("ERROR brand not found")
            else:
                loader.add_value("brand", brand[0].strip())

            image_url = hxs.select('//div[@class="productImageDiv"]/a/img/@src').extract()
            if not image_url:
                self.log("ERROR image_url not found")
            else:
                loader.add_value("image_url", urljoin_rfc(root_url, image_url[0]))

            category = hxs.select('(//ol[@class="breadcrumbOL"]/a)[last()]/text()').extract()
            if not category:
                self.log("ERROR category not found")
            else:
                loader.add_value("category", category[0].strip())

            sold_as = hxs.select('//dl[@class="soldAsPackedAsDL"]/dd[1]/text()').extract()
            product = loader.load_item()

            metadata = TigerChefMeta()
            metadata['sold_as'] = sold_as[0].strip() if sold_as else '1 ea'
            product['metadata'] = metadata

            if product.get('identifier'):
                # As above: keep the item that has the metadata attached.
                res.append(product)

        return res
Beispiel #8
0
    def parse_product(self, response):
        """Yield the page's product, or a cart request when the price is hidden.

        Fields come from ``SpiderSchema(response).get_product()`` and the
        page markup.  The price is the product form's ``price`` input when
        present, otherwise the schema offer price (default ``'0.0'``).
        Nothing is yielded when the loaded item has a blank identifier.

        When the page shows a struck-out price element, the product form is
        posted to the cart URL with quantity 1 and the item is passed on to
        ``self.parse_price`` through the request meta.  Otherwise the item
        is yielded directly.
        """
        schema = SpiderSchema(response)
        data = schema.get_product()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', data['Name'])
        loader.add_xpath('category',
                         u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')

        form_price = response.xpath(
            '//form[@id="productform"]/input[@name="price"]/@value').extract()
        if form_price:
            price = form_price[0]
        else:
            offer = data.get('offers', {}).get('properties', {})
            price = offer.get('price', '0.0')
        loader.add_value('price', price)

        mfr_numbers = response.xpath(
            '//span[contains(@class, "mfr-number")]/text()').extract()
        sku = map(unicode.strip, mfr_numbers)
        loader.add_value('identifier', data['productID'])
        # Without a manufacturer number the product id (minus '#') is the SKU.
        loader.add_value('sku',
                         sku if sku else data['productID'].replace('#', ''))

        image_url = data.get('image', '').replace('www.example.com',
                                                  'www.webstaurantstore.com')
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))

        brand = data.get('brand', '')
        if not brand:
            maker_cells = response.xpath(
                '//tr[@class="highlight" and .//b[contains(text(), "Manufacturer Name")]]/td[not(b)]/text()'
            ).extract()
            brand = maker_cells[0].strip() if maker_cells else ''
        if brand:
            loader.add_value('brand', brand)

        unit_texts = response.xpath(
            '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
        ).extract()

        product = loader.load_item()
        if product.get('identifier', '').strip() == '':
            # Items without an identifier are dropped.
            return

        metadata = TigerChefMeta()
        if unit_texts:
            metadata['sold_as'] = unit_texts[0].replace('/', '')
        else:
            metadata['sold_as'] = ''
        product['metadata'] = metadata

        struck_out = response.xpath(
            '//*[@itemprop="price" and contains(@class, "strikeOutPrice")][1]'
        )
        if not struck_out:
            yield product
            return

        # Add to cart to see the price.
        cart_url = 'http://www.webstaurantstore.com/viewcart.html'
        inputs = response.xpath('//form[@id="productform"]/input')
        names = inputs.select('./@name').extract()
        values = inputs.select('./@value').extract()
        formdata = dict(zip(names, values))
        formdata[u'qty'] = '1'

        request_meta = {'product': product, 'dont_merge_cookies': True}
        yield FormRequest(url=cart_url,
                          method='POST',
                          formdata=formdata,
                          callback=self.parse_price,
                          meta=request_meta,
                          dont_filter=True)
Beispiel #9
0
    def parse_products(self, response, hxs):
        """Yield products, or cart requests, for each ``.product-result`` node.

        Entries without a link or without any price are logged and skipped.
        A missing SKU, product id, image or category is logged, but the
        entry is still processed.  When the price text contains
        "Instant Rebate" or "Add to Cart", the price is set to ``'0.0'``
        and two requests are yielded instead of the item: one adding the
        SKU to the cart and one emptying the cart, both handled by
        ``self.parse_cart`` through ``self.CART_PROXY``.

        ``hxs`` is accepted but not used by this method.
        """
        for node in response.css('.product-result'):
            loader = ProductLoader(selector=node, item=Product(), spider_name=self.name)

            hrefs = node.select('.//a/@href').extract()
            if not hrefs:
                self.log('ERROR: no product URL found! URL:{}'.format(response.url))
                continue
            url = urljoin_rfc(get_base_url(response), hrefs[0])
            loader.add_value('url', url)

            sku = node.select('.//a/text()').re('\((.*?)\)')
            if sku:
                loader.add_value('sku', sku[0])
                product_id = node.select('.//a/@href').re('p(\d+)\.aspx')
                if product_id:
                    loader.add_value('identifier', product_id[0] + '_' + sku[0])
                else:
                    self.log('ERROR: no product ID found!')
            else:
                self.log('ERROR: no SKU found!')

            # The ``psrc`` attribute is tried first, then a plain ``src``.
            images = node.select('.//a/img/@psrc').extract()
            if not images:
                images = node.select('.//div/img/@src').extract()
            if images:
                image = urljoin_rfc(get_base_url(response), images[0].strip())
                loader.add_value('image_url', image)
            else:
                self.log('ERROR: no product Image found!')

            price = ''.join(node.select('./div[contains(@class,"-price")]/text()').extract()).strip()
            check_cart = 'Instant Rebate' in price or 'Add to Cart' in price
            if check_cart:
                # The real price is only visible in the cart.
                price = '0.0'
            elif not price:
                price = ''.join(node.select('./div[contains(@class,"-price")]/span/text()').extract()).strip()
                if not price:
                    self.log('ERROR: no price found! URL:{} Product URL:{}'.format(response.url, url))
                    continue
            loader.add_value('price', price.strip())

            # This XPath starts with '//', i.e. it is not relative to the
            # product node.
            category = node.select('//div[contains(@class, "content")]/h1/text()').extract()
            if category:
                loader.add_value('category', category[0].strip())
            else:
                self.log("ERROR: category not found")

            name = node.select('.//a/text()').extract()[0]
            loader.add_value('name', name)
            # The brand is the part of the name before the first ' ('.
            loader.add_value('brand', name.split(' (')[0])

            unit_texts = node.select('div//span[@class="unit-of-sale"]/text()').extract()
            metadata = TigerChefMeta()
            metadata['sold_as'] = unit_texts[0].split('/')[-1] if unit_texts else '1 ea'

            if not check_cart:
                item = loader.load_item()
                item['metadata'] = metadata
                yield item
                continue

            sku_id = node.select('div[@class="adcWinnowedItem"]/button/@atc-skuid').extract()[0]
            add_cart_url = "https://www.foodservicewarehouse.com/ViewCart/AddSkuToCart?skuID=" + sku_id + "&quantity=1"
            add_request = Request(add_cart_url, dont_filter=True, callback=self.parse_cart, meta={'loader':loader, 'metadata':metadata, 'sku_id': sku_id})
            add_request.meta['proxy'] = self.CART_PROXY
            yield add_request

            clear_request = Request('https://www.foodservicewarehouse.com/ViewCart/RemoveAll/', dont_filter=True, callback=self.parse_cart, meta={'clean_cart':True})
            clear_request.meta['proxy'] = self.CART_PROXY
            yield clear_request
Beispiel #10
0
    def parse_product(self, response):
        """Parse a product page and yield one ``Product`` with metadata.

        The text next to the quantity label is whitespace-normalised.  When
        it is present and not "each" (case-insensitive) it is added as an
        extra ``name`` value.  It is also stored as the ``sold_as``
        metadata, defaulting to ``'1 ea'``.  A missing name, brand, image or
        category is logged, but the item is still yielded.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1[@id="partNameId"]/text()').extract()

        quantity = hxs.select(
            '//label[@class="productdetail-qtytxt"]/../text()[last()]'
        ).extract()
        if quantity:
            qty_text = quantity[0]
            # Newlines, carriage returns and tabs become spaces, then runs
            # of spaces are collapsed.
            for control_char in ('\n', '\r', '\t'):
                qty_text = qty_text.replace(control_char, ' ')
            quantity = re.sub(' +', ' ', qty_text.strip())

        loader = ProductLoader(response=response,
                               item=Product(),
                               spider_name=self.name)

        if name:
            loader.add_value('name', name[0].strip())
        else:
            self.log("ERROR name not found")

        brand = hxs.select(
            '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()'
        ).extract()
        if brand:
            loader.add_value("brand", brand[0].strip())
        else:
            self.log("ERROR brand not found")

        img_url = hxs.select(
            '//div[@class="productdetail-productimage"]/a/img/@src').extract()
        if img_url:
            loader.add_value("image_url", img_url[0])
        else:
            self.log("ERROR img_url not found")

        category = hxs.select(
            '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()'
        ).extract()
        if category:
            loader.add_value("category", category[0].strip())
        else:
            self.log("ERROR category not found")

        if quantity and quantity.lower() != 'each':
            loader.add_value('name', quantity)

        loader.add_value('url', response.url)
        loader.add_xpath('price',
                         '//font[@class="txt-purchaseprice20blue"]/text()')

        model_texts = hxs.select(
            '//b[contains(text(), "Model #:")]/../text()').extract()
        sku = ''.join(model_texts).strip()
        # If the model text is the same token twice, keep a single copy.
        tokens = sku.split()
        if len(tokens) == 2 and tokens[0] == tokens[1]:
            sku = tokens[0]
        loader.add_value('sku', sku)
        loader.add_xpath('identifier',
                         '//form//input[@name="productId"]/@value')

        item = loader.load_item()

        metadata = TigerChefMeta()
        metadata['sold_as'] = quantity or '1 ea'
        item['metadata'] = metadata

        yield item