Python ProductLoaderWithoutSpaces Beispiele, product_spiders.items.ProductLoaderWithoutSpaces Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: peoplescars.py Projekt: oceancloud82/scraping

    def parse_car(self, response):
        """Build a single product item from a car detail page.

        The identifier is the second-to-last ``/``-separated segment of the
        response URL. The price is read from the cell following the
        "Cash Price" label, falling back to the "Manager's Special Price"
        heading text; a page with neither yields nothing.
        """
        hxs = HtmlXPathSelector(response)

        identifier = response.url.split('/')[-2]

        price = hxs.select(
            '//td[contains(text(), "Cash Price")]/following-sibling::td/text()'
        ).extract()
        if not price:
            price = hxs.select('//h2/text()').re(
                'Manager\'s Special Price (.*)')
        if not price:
            # No price on the page: nothing worth emitting.
            return

        name = hxs.select(
            '//div[@class="textInner"][./h2]/*//strong/text()').extract()
        if name:
            # Only the first matching <strong> text is used as the name.
            name = name[0]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('price', price)

        yield loader.load_item()

Beispiel #2

0

Datei anzeigen

Datei: furniturechoice_co_uk.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page and queue the product tiles it links to.

        Each ``product-tile`` link whose id (the text after the last ``_``
        in the href) is not yet in ``self.parsed_products`` is recorded and
        requested with this same callback. If the page has no ``<h1>`` text,
        ``self.retry`` is asked for a retry request and parsing stops.
        Otherwise one item is yielded when at most one option row exists,
        or one item per option row.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select(
                '//div[@class="product-tile"]//a/@href').extract():
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                yield Request(urljoin_rfc(base_url, url),
                              callback=self.parse_product)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        name = hxs.select('//h1/text()').extract()
        if not name:
            request = self.retry(response,
                                 "No name for product: " + response.url)
            if request:
                yield request
            return
        product_loader.add_value('name', name)
        # The first breadcrumb link is dropped from the category path.
        category = hxs.select(
            '//ol[@class="breadcrumbs"]//a/text()').extract()[1:]
        product_loader.add_value('category', category)
        img = hxs.select('//div[@class="item"]//img/@src').extract()
        if img:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, img[0]))

        product = product_loader.load_item()
        options = hxs.select(u'//div[contains(@class, "MainProds")]/ol/li')
        if not options:
            options = hxs.select(
                u'//div[@class="SingColl"]/div[contains(@class, "Prod")]')

        if len(options) <= 1:
            sku = hxs.select('//div[@class="product-sku"]/text()').re(
                r'Product code: (\w+)')
            if not sku:
                # The product code doubles as the identifier; without it
                # there is nothing to key the item on, so emit nothing.
                return
            prod = Product(product)
            # When several codes match, the last one wins.
            prod['sku'] = sku[-1]
            prod['identifier'] = prod['sku']
            prod['price'] = extract_price(
                hxs.select(
                    '//div[@class="price-current"]/text()').extract()[-1])
            yield prod
            return

        for opt in options:
            prod = Product(product)
            prod['name'] = opt.select(
                u'normalize-space(.//h2/text())').extract()[0]
            prod['sku'] = opt.select(
                u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
            ).extract()[0]
            prod['identifier'] = prod['sku']
            prod['price'] = extract_price(
                opt.select(u'.//span[@class="Price"]/text()').extract()[0])
            yield prod

Beispiel #3

0

Datei anzeigen

    def parse_product(self, response):
        """Collect product fields from a product page and yield one item.

        Fields are gathered into a plain dict first and then fed, key by
        key, into a ``ProductLoaderWithoutSpaces``. ``stock`` is set to 0
        only when no ``span.product-in-store`` element is present. The
        category is the second-to-last breadcrumb text and is omitted when
        fewer than two breadcrumb texts are found.
        """
        hxs = HtmlXPathSelector(response)
        # Keep only the first three '/'-separated parts of the base URL
        # (scheme, empty part, host) for joining the image URL.
        base_url = '/'.join(get_base_url(response).split('/')[:3])

        product = {}

        product['identifier'] = response.xpath(
            '//input[@name="elementID"]/@value').extract_first()

        if not response.css('span.product-in-store'):
            product['stock'] = 0

        product['name'] = response.xpath(
            '//h1[@itemprop="name"]/text()').extract_first()

        product['price'] = response.xpath(
            '//meta[@itemprop="price"]/@content').extract_first()

        product['url'] = response.url

        product['brand'] = hxs.select(
            u'//dt[contains(., "Производитель")]/following-sibling::dd/span/text()'
        ).extract_first()
        if not product['brand']:
            product['brand'] = response.xpath('//span/text()').re_first(
                u'Другие товары бренда (.+)')

        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_url:
            product['image_url'] = urljoin_rfc(base_url, image_url[0].strip())

        product['sku'] = response.xpath(
            u'//span[contains(., "Артикул:")]/following-sibling::span/text()'
        ).extract_first()

        breadcrumbs = hxs.select(
            '//div[contains(@class, "breadcrumbs")]//span/text()').extract()
        if len(breadcrumbs) >= 2:
            product['category'] = breadcrumbs[-2]

        product_loader = ProductLoaderWithoutSpaces(item=Product(),
                                                    selector=hxs)
        # .items() works on both Python 2 and Python 3 dicts.
        for k, v in product.items():
            product_loader.add_value(k, v)

        yield product_loader.load_item()

Beispiel #4

0

Datei anzeigen

Datei: courts.py Projekt: oceancloud82/scraping

 def parse_price_from_cart(self, response):
     """Re-read the price from the cart page and set the shipping cost.

     The item carried in ``response.meta['product']`` is wrapped in a
     loader, its price is replaced from the cart markup, and shipping is
     9.9 when the resulting price is below 200, otherwise 0.
     """
     cart_loader = ProductLoader(item=response.meta['product'],
                                 response=response)
     price_xpath = (
         '//td[@class="right"]/div[@class="prodetail-price"][1]/text()')
     cart_loader.replace_xpath('price', price_xpath)

     if cart_loader.get_output_value('price') < 200:
         shipping_cost = 9.9
     else:
         shipping_cost = 0
     cart_loader.replace_value('shipping_cost', shipping_cost)

     yield cart_loader.load_item()

Beispiel #5

0

Datei anzeigen

Datei: specsavers.py Projekt: oceancloud82/scraping

    def start_requests(self):
        """Yield one request per row of ``data.csv`` located under ``here``.

        Columns 0-2 (brand, name, url) are decoded from UTF-8 and loaded
        into a product loader; dashes in the name become spaces. Columns 3
        and 4 are stored in a ``SpecMeta`` under ``Lenses`` and
        ``Lens_type``. Both objects travel in the request meta.
        """
        csv_path = os.path.join(here, 'data.csv')
        with open(csv_path) as f:
            for row in csv.reader(f):
                brand = row[0].decode('utf-8')
                name = row[1].replace('-', ' ').decode('utf-8')
                url = row[2].decode('utf-8')
                lenses, lens_type = row[3], row[4]

                loader = ProductLoader(item=Product(),
                                       selector=HtmlXPathSelector())
                # The URL doubles as the product identifier.
                for field, value in (('name', name),
                                     ('identifier', url),
                                     ('url', url),
                                     ('brand', brand)):
                    loader.add_value(field, value)

                meta = SpecMeta()
                meta['Lenses'] = lenses
                meta['Lens_type'] = lens_type

                self.log('product url: %s' % url)
                yield Request(url,
                              meta={'m': meta, 'loader': loader},
                              dont_filter=True)

Beispiel #6

0

Datei anzeigen

    def parse_product(self, response):
        """Load a product from its detail page; yield it only if priced.

        The name is the joined, stripped text of the first ``div`` in the
        ``product-info-head`` heading. The identifier is the last
        ``/``-separated segment of the response URL. The image ``src`` is
        prefixed with ``http:``.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        title_parts = hxs.select(
            '//h1[@class="product-info-head"]/div[1]/text()').extract()
        loader.add_value('name', ''.join(title_parts).strip())

        price_xpath = ".//span[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" inline price bold productInfo-orgPrice product-info-price-current \")]/text()"
        loader.add_xpath('price', price_xpath)

        image_xpath = ".//div[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" productPage_image_default \")]/img[1][not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" photo \")]/@src"
        images = hxs.select(image_xpath).extract()
        if images:
            loader.add_value('image_url', 'http:' + images[0])

        brand_xpath = ".//dl[not(@id)][not(@class)][not(@style)]/dd[1][not(@id)][not(@class)][not(@style)]/text()"
        loader.add_xpath('brand', brand_xpath)

        category_xpath = ".//nav[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" breadcrumbs module small \")]/div[2][not(@id)][not(@class)][not(@style)]/a[1][not(@id)][not(@class)][not(@style)]//text()"
        crumbs = hxs.select(category_xpath).extract()
        if crumbs:
            loader.add_value('category', ''.join(crumbs).strip())

        loader.add_value('url', response.url)
        loader.add_value('identifier', response.url.split('/')[-1])

        # Products without a price are dropped.
        if not loader.get_output_value('price'):
            return
        yield loader.load_item()

Beispiel #7

0

Datei anzeigen

Datei: www220volt_ru.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Build one product item from a product detail page.

        The product code is read from ``#product-code`` (nested
        ``span/strong`` first, then the element's own text) and is used as
        both identifier and SKU. A page without a code is logged and
        skipped. The brand comes from a ``/producer/`` breadcrumb link,
        falling back to the first ``modelContainer`` entry. The price is
        taken from the ``product_price`` value in inline scripts, then from
        ``#price_per_m``, and defaults to 0 when neither is found. The
        category is taken from ``response.meta['category']``.
        """
        pd = Selector(response)
        url = response.url
        category = response.meta['category']
        image_url = pd.select('//a[@id="zoom1"]/@href').extract()

        product_identifier = response.xpath(
            '//span[@id="product-code"]/span/strong/text()').extract()
        if not product_identifier:
            product_identifier = response.xpath(
                '//span[@id="product-code"]/text()').extract()
        if not product_identifier:
            # Nothing to key the item on; indexing the empty list below
            # would raise, so stop here after logging.
            log.msg(url + " no Code/ID")
            return
        product_identifier = product_identifier[0].strip()

        product_name = pd.select(
            '//h1[@itemprop="name"]/text()').extract()[0].strip()

        brands = response.css('ul.breadcrumbsList li').xpath(
            './/a[contains(@href, "/producer/")]/text()').extract()
        if not brands:
            brands = pd.select(
                '//div[@class="modelContainer"]//li[@class="first"]/a/text()'
            ).extract()
        brand = ''
        if brands:
            brand = brands[0].strip()
        else:
            log.msg(url + " no BRND")

        price = response.xpath('//script/text()').re('product_price":(.+?),')
        if not price:
            price = response.xpath(
                '//span[@id="price_per_m"]/text()').extract()
        # Clean the text only when a price was found; the numeric fallback
        # has no string methods.
        price = price[0].strip().replace(" ", "") if price else 0

        product_loader = ProductLoader(item=Product(), selector=pd)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        product_loader.add_value('sku', product_identifier)
        if image_url:
            product_loader.add_value('image_url', image_url[0])
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        yield product_loader.load_item()

Beispiel #8

0

Datei anzeigen

Datei: currys.py Projekt: oceancloud82/scraping

 def parse_product(self, response):
     """Load one product from a detail page and yield it.

     Identifier, SKU and price are pulled out of inline script text with
     regular expressions. The image comes from the main product image,
     falling back to the second thumbnail's ``src`` with ``XSmall``
     replaced by ``Large``; when neither exists no image is set. ``stock``
     is set to 0 only when the ``BasketTickOn`` link is absent.
     """
     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('name', '//div[@class="detailstitle"]/text()')
     loader.add_xpath('identifier',
                      '//script/text()',
                      re=r"'productID':'(\w+?)'")
     loader.add_xpath('sku', '//script/text()', re=r"'productID':'(\w+?)'")
     loader.add_value('url', response.url)
     loader.add_xpath('price',
                      '//script/text()',
                      re=r"'productValue':'([\d\.]+?)'")
     loader.add_xpath('category',
                      '//div[@class="breadcrumb"]/a[position()>1]/text()')

     image_url = response.xpath(
         '//div[@class="mainProductImage"]//img/@src').extract()
     if not image_url:
         thumbnails = response.xpath(
             '(//div[@class="thumbnail"])[2]//input[@type="image"]/@src'
         ).extract()
         # Slicing keeps this safe when the thumbnail is missing as well.
         image_url = [src.replace('XSmall', 'Large')
                      for src in thumbnails[:1]]
     if image_url:
         loader.add_value('image_url', response.urljoin(image_url[0]))

     loader.add_xpath(
         'brand',
         '(//td[contains(h5/text(), "Brand")])[1]/following-sibling::td[1]/span/text()'
     )
     if not response.xpath(
             '//div[@id="availDelTick"]//a[@class="BasketTickOn"]'):
         loader.add_value('stock', 0)
     yield loader.load_item()

Beispiel #9

0

Datei anzeigen

Datei: mancitystore.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Yield one item per size variation, plus printed variants.

        Product JSON is taken from inline scripts: ``product<7 digits>``
        blobs ("hero" data) and a ``product<6 word chars>`` blob (base
        data). Without base data the response is handed to ``self.parse``.
        If a ``?cur=GBP`` link exists, that URL is requested instead and
        nothing is emitted for the current response.

        For each entry in ``data['Variations']`` this yields:
          * the plain size item,
          * a custom-printing item (printing type 1) when the page shows
            the base product and that printing exists,
          * a badge item (printing type 3) when that printing exists.
        """
        if 'aspxerrorpath' in response.url:
            # URL carries 'aspxerrorpath' (presumably an error redirect):
            # re-request the first URL of the redirect chain.
            yield Request(response.request.meta['redirect_urls'][0],
                          self.parse_product,
                          dont_filter=True)
            return
        base_product = True

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('category', 'Kits')
        heros_data = response.xpath('//script/text()').re(
            r'product\d{7} =(.+?});var')
        base_product_data = response.xpath('//script/text()').re(
            r'product\w{6} =(.+?});var')
        if not base_product_data:
            for p in self.parse(response):
                yield p
            return
        if not heros_data:
            data = json.loads(base_product_data[0])
        elif len(heros_data) == 1:
            data = json.loads(heros_data[0])
            base_product = False
        else:
            heroes_by_id = {}
            for raw in heros_data:
                parsed = json.loads(raw)
                heroes_by_id[parsed['ProductID']] = parsed
            heros = response.css('select.heroShirts')
            hero = heros.xpath('option[@selected]')
            if not hero:
                data = json.loads(base_product_data[0])
            else:
                data = heroes_by_id[int(hero.xpath('@value').extract_first())]
                base_product = False

        base_product_data = json.loads(base_product_data[0])
        gbp_url = response.xpath(
            '//a[contains(@href, "?cur=GBP")]/@href').extract_first()
        if gbp_url:
            yield Request(response.urljoin(gbp_url),
                          self.parse_product,
                          dont_filter=True)
            return

        # Printing options keyed by type; they do not depend on the size
        # variation, so they are looked up once before the loop.
        printings = {
            p['PrintingTypeID']: p
            for p in base_product_data['printingitems']
        }
        custom_printings = printings.get(1)
        badge_printing = printings.get(3)
        # Custom personalization is only added for the base product.
        add_custom_personalization = bool(custom_printings and base_product)

        loader.add_value('name', data['Description'])
        loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
        if data['Brand']:
            loader.add_value('brand', data['Brand']['Name'])
        loader.add_value('image_url', response.urljoin(data['ImageURL']))
        product = loader.load_item()

        player_from_name = re.search(
            r'with *([\w.\- ]+?) *(\d*|TBC) *printing',
            data['Description'], re.UNICODE)
        if player_from_name:
            player_name, number = player_from_name.groups()

        # Fixed player used for the custom-printing variant.
        team_player_name = 'WILLIAMS'
        team_player_number = '10'
        team_player_id = 'WILLIAMS'

        # Sizes
        for variation in data['Variations']:
            size = variation['Description']
            if player_from_name:
                metadata = {
                    'player': player_name,
                    'number': number,
                    'size': size
                }
            else:
                metadata = {'size': size}

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            loader.add_value('name', size)
            loader.replace_value('price', variation['PriceActual'])
            if variation['PriceActual'] < 75:
                loader.replace_value('shipping_cost', '4.95')
            loader.replace_value('stock', int(variation['IsInStock']))
            item = loader.load_item()
            item['metadata'] = dict(metadata)
            yield item

            # Custom printings
            if add_custom_personalization:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                loader.add_value('name', team_player_name)
                loader.add_value('name', team_player_number)
                price = Decimal(item['price']) + Decimal(
                    str(custom_printings['PriceActual']))
                loader.replace_value('price', price)
                if price >= 75:
                    loader.replace_value('shipping_cost', 0)
                identifier = '-'.join(
                    (item['identifier'], str(custom_printings['PrintingID']),
                     team_player_id))
                loader.replace_value('identifier', identifier)
                custom_item = loader.load_item()
                custom_item['metadata'] = {
                    'player': team_player_name,
                    'number': team_player_number,
                    'size': size
                }
                yield custom_item

            # Badges
            if badge_printing:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                loader.add_value('name',
                                 badge_printing['PrintingDescription'])
                price = (variation['PriceActual'] +
                         badge_printing['PriceActual'])
                loader.replace_value('price', price)
                if price >= 75:
                    loader.replace_value('shipping_cost', 0)
                identifier = str(variation['VariationId']) + '-' + str(
                    badge_printing['PrintingID'])
                loader.replace_value('identifier', identifier)
                badge_item = loader.load_item()
                badge_item['metadata'] = dict(metadata)
                yield badge_item

Beispiel #10

0

Datei anzeigen

    def parse(self, response):
        """Walk category and listing pages, emitting items or requests.

        * Every link in the ``categoryList`` is requested with this same
          callback; ``/prl/results`` is appended to hrefs that contain
          neither ``/prl/results`` nor ``webapp``.
        * Each listing row yields an item straight away when its SKU is in
          ``self.cache_data`` (static fields come from the cache, price and
          stock from the row); otherwise the product page is requested.
          Rows missing the SKU, stock or price inputs are skipped.
        * Pagination links are followed with the current meta.
        * A page with neither rows nor categories is re-requested through
          ``self.parse_product``.
        """
        base_url = get_base_url(response)
        meta = response.meta.copy()
        categories_urls = response.xpath('//ul[@class="categoryList"]/li//a')
        for category in categories_urls:
            url = category.select('@href').extract()[0]
            name = category.select('text()').extract()[0].strip()
            if "/prl/results" not in url and 'webapp' not in url:
                url += "/prl/results"
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse,
                          meta={'category': name})

        products = response.xpath(
            '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
        # Fields copied unchanged from the cache entry; the cache stores
        # byte strings, hence the decode.
        cached_fields = ('name', 'url', 'sku', 'category', 'image_url',
                         'brand')
        for product in products:
            try:
                identifier = product.select(
                    './/a[@class="sku"]/text()').extract()[0].strip()
                stock = int(
                    product.select(
                        './/td[@class="availability"]/input[@class="hVal"]/@value'
                    ).extract()[0])
                price = round(
                    Decimal(
                        product.css(
                            '.price input.hVal::attr(value)').extract()[0]), 2)
            except IndexError:
                continue
            if identifier in self.cache_data:
                product_cached = self.cache_data[identifier]
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('identifier', identifier)
                for field in cached_fields:
                    loader.add_value(field,
                                     product_cached[field].decode('utf-8'))
                loader.add_value('price', price)
                loader.add_value('stock', stock)
                item = loader.load_item()

                try:
                    self.missing_urls.remove(item['url'])
                except ValueError:
                    pass

                yield item
            else:
                url = product.select(
                    './/a[@class="sku"]/@href').extract()[0].strip()
                url = url_query_cleaner(url)
                if url in self.missing_urls:
                    self.missing_urls.remove(url)
                yield Request(url, callback=self.parse_product, meta=meta)

        pages = response.css('.pages .pageIt a::attr(href)').extract()
        for url in pages:
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse,
                          meta=meta)

        if not products and not categories_urls:
            yield Request(url_query_cleaner(response.url),
                          dont_filter=True,
                          callback=self.parse_product,
                          meta=meta)

Beispiel #11

0

Datei anzeigen

    def parse_product(self, response):
        """Parse a product page and hand the item to the reviews callback.

        Every variation link on the page is requested with this same
        callback. A page without a ``productID`` is passed to
        ``self.retry``. Otherwise the item is loaded, given an empty
        ``reviews`` list in its metadata, and sent in the meta of a request
        to the reviews URL handled by ``self.parse_review_page``.
        ``stock`` is set to 0 when the ``SOLD_OUT_ONLINE`` cart button is
        present.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        options = hxs.select(
            '//div[@class="variation-group"]//a/@href').extract()
        for option in options:
            option_url = urljoin_rfc(base_url, option)
            log.msg('INFO >>> OPTION FOUND: ' + option_url)
            yield Request(option_url, callback=self.parse_product)

        identifier = hxs.select(
            '//span[@itemprop="productID"]/text()').extract()

        if not identifier:
            request = self.retry(
                response,
                "ERROR >>> No identifier for product URL: " + response.url)
            if request:
                yield request
            return

        identifier = identifier[0]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        price = ''.join(
            hxs.select('//div[@class="item-price"]/text()').extract()).strip()
        loader.add_value('price', price)
        loader.add_xpath('name', '//div[@itemprop="name"]/h1/text()')
        image_url = hxs.select(
            '//meta[@property="og:image"]/@content').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])

        # Only the last three breadcrumb levels are kept.
        categories = response.xpath(
            '//ol[@id="breadcrumb-list"]/li/a/text()').extract()[-3:]
        loader.add_value('category', categories)

        brand = hxs.select(
            '//div[@itemprop="brand"]/meta[@itemprop="name"]/@content'
        ).extract()
        brand = brand[0].strip() if brand else ''
        # Store the brand read from the page rather than a fixed literal.
        loader.add_value('brand', brand)

        sku = hxs.select('//span[@itemprop="model"]/text()').extract()
        sku = sku[0] if sku else ''
        loader.add_value('sku', sku)

        out_of_stock = hxs.select(
            '//div[@class="cart-button" and @data-button-state-id="SOLD_OUT_ONLINE"]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)

        item = loader.load_item()
        item['metadata'] = {'reviews': []}

        reviews_url = 'http://bestbuy.ugc.bazaarvoice.com/3545w/%s/reviews.djs?format=embeddedhtml'
        yield Request(reviews_url % identifier,
                      meta={'product': item},
                      callback=self.parse_review_page)

Beispiel #12

0

Datei anzeigen

Datei: netlens.py Projekt: oceancloud82/scraping

    def parse_products(self, response):
        """Yield one brand-lookup request per product row of a listing.

        For every row of the ``productListing`` table a loader is filled
        with url, price, name, image, category (the text after the first
        ``=`` in the response URL) and identifier (the ``products_id``
        query value of the product link). The loader travels in the
        request meta to ``self.parse_brand``. Rows missing any of these
        pieces are skipped so that one incomplete row does not abort the
        rest of the page.
        """
        base_url = get_base_url(response)

        products = response.xpath('//table[@class="productListing"]/tr')
        for p in products:
            try:
                url = p.select('.//a/@href').extract()[0]
                name = p.select(
                    './/a[@class="boxtitle"]//text()').extract()[0]
                price = p.select(
                    './/span[@class="boxprice"]/text()').extract()[0]
                image_url = p.select('.//img/@src').extract()[0]
            except IndexError:
                # Not a complete product row (presumably a layout row).
                continue

            id_match = re.search(r'products_id=(\d+)', url)
            if not id_match:
                # The link carries no product id to use as identifier.
                continue

            loader = ProductLoader(item=Product(), selector=p)
            loader.add_value('url', url)
            loader.add_value('price', price)
            loader.add_value('name', name)
            loader.add_value('image_url', urljoin(base_url, image_url))
            loader.add_value('category', response.url.split('=')[1])
            loader.add_value('identifier', id_match.group(1))
            yield Request(url,
                          meta={'loader': loader},
                          callback=self.parse_brand)

Beispiel #13

0

Datei anzeigen

    def parse_product(self, response):
        """Load a product and attach the promotion passed in the meta.

        The price prefers the ``box-price`` text and falls back to the
        ``itemprop="price"`` span. The brand prefers the brand image's
        ``alt`` text and falls back to the name heading; it is skipped when
        its first value consists only of digits. The analytics product id
        serves as both SKU and identifier.
        """
        loader = ProductLoader(item=Product(), response=response)

        name_parts = response.xpath(
            '//div[@itemprop="name"]/*//text()').extract()
        loader.add_value('name', ' '.join(name_parts))
        loader.add_value('url', response.url)

        images = response.xpath('//img[@class="left-image"]/@src').extract()
        if images:
            loader.add_value('image_url', response.urljoin(images[0]))

        # `or` only evaluates the fallback query when the first is empty.
        price = (
            response.xpath('//div[@itemprop="offers"]/p[@class="box-price"]/b/text()').extract()
            or response.xpath('//div[@itemprop="offers"]/span[@itemprop="price"]/text()').extract()
        )
        loader.add_value('price', price)

        brand = (
            response.xpath('//img[@class="brand"]/@alt').extract()
            or response.xpath('//div[@itemprop="name"]/h1/text()').extract()
        )
        if brand and not brand[0].isdigit():
            loader.add_value('brand', brand)

        product_id = response.xpath(
            '//input[@type="hidden" and @name="productIdAnalytics"]/@value'
        ).extract()
        loader.add_value('sku', product_id)
        loader.add_value('identifier', product_id)
        item = loader.load_item()

        promo_meta = SpecSaversMeta()
        promo_meta['promotion'] = response.meta['promotional_data']
        item['metadata'] = promo_meta
        yield item

Beispiel #14

0

Datei anzeigen

Datei: hollandandbarrett.py Projekt: oceancloud82/scraping

 def parse_options(self, response):
     """Turn a per-SKU JSON response into an item based on the parent item.

     The parent item arrives in ``response.meta['item']``. The identifier
     becomes ``<productId>.<skuId>`` using the query parameters of the
     response URL. The size is appended to the name unless the SKU name
     already ends with it (with or without spaces). A shipping cost of
     2.99 is added when the unit sale price is below 20.
     """
     data = json.loads(response.body)
     product_id = url_query_parameter(response.url, 'productId')
     sku = url_query_parameter(response.url, 'skuId')

     loader = ProductLoader(Product(), response=response)
     loader.add_value(None, response.meta['item'])
     loader.replace_value('identifier', '.'.join((product_id, sku)))
     loader.replace_value('sku', sku)

     sku_name = data['skuName']
     loader.replace_value('name', sku_name)
     size = data['size']
     name_has_size = (sku_name.endswith(size)
                      or sku_name.endswith(size.replace(' ', '')))
     if not name_has_size:
         loader.add_value('name', size)

     loader.replace_value('image_url',
                          response.urljoin(data['thumbnail_url']))
     unit_price = data['unit_sale_price']
     loader.replace_value('price', str(unit_price))
     loader.replace_value('stock', data['stock'])
     if Decimal(unit_price) < 20:
         loader.add_value('shipping_cost', '2.99')
     yield loader.load_item()

Beispiel #15

0

Datei anzeigen

Datei: christmasworld.py Projekt: oceancloud82/scraping

 def parse_product(self, response):
     """Load one product from its page and add shipping for cheap items.

     ``response.meta['id']`` is used as both identifier and SKU. ``stock``
     is 1 when a span with the exact text "In Stock" exists, otherwise 0.
     Items priced below 40 get a shipping cost of 4.95.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(selector=hxs, item=Product())
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//div[@id="BuyBoxArea"]//h1[@itemprop="name"]/text()')

     product_id = response.meta['id']
     loader.add_value('identifier', product_id)
     loader.add_value('sku', product_id)
     loader.add_xpath('price', '//span[@itemprop="price"]/text()')

     if hxs.select('//span[text()="In Stock"]'):
         loader.add_value('stock', 1)
     else:
         loader.add_value('stock', 0)

     loader.add_xpath('category', '//div[@class="breadcrumb"]/a[position()>1]/text()')
     loader.add_xpath('brand', '//td[text()="Brand"]/../td[2]/text()')
     loader.add_xpath('image_url', '//img[@class="js-main-image"]/@src')

     item = loader.load_item()
     if item['price'] < 40:
         item['shipping_cost'] = 4.95
     yield item

Beispiel #16

0

Datei anzeigen

Datei: hollandandbarrett.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page driven by its ``universal_variable`` script.

        Links found by ``self.products`` are requested with this same
        callback first. Product data is the JSON after the first ``=`` in
        the script containing ``window.universal_variable``; pages without
        that script yield nothing further. When more than one
        ``data-sku-id`` is present, one options request per SKU is yielded
        (each carrying a copy of the item) instead of the item itself.
        Otherwise the item is yielded, with a 2.99 shipping cost when the
        unit sale price is below 20.
        """
        for link in self.products.extract_links(response):
            yield Request(link.url, self.parse_product)

        script = response.xpath(
            '//script/text()[contains(., "window.universal_variable")]'
        ).extract_first()
        if not script:
            return
        payload = re.search('.+?=(.+)', script, re.DOTALL).group(1)
        pdata = json.loads(payload)['product']

        loader = ProductLoader(Product(), response=response)
        loader.add_value('identifier', pdata['id'])
        loader.add_value('url', response.urljoin(pdata['url']))
        loader.add_value('name', pdata['name'])
        loader.add_value('name', pdata.get('size'))
        loader.add_value('price', str(pdata['unit_sale_price']))
        loader.add_value('sku', pdata['sku_code'])
        crumbs = response.css('div.crumb').xpath(
            './/span[@itemprop="name"]/text()').extract()
        # Drop the first and last crumb, then keep at most the last three.
        loader.add_value('category', crumbs[1:-1][-3:])
        loader.add_value('image_url', response.urljoin(pdata['thumbnail_url']))
        loader.add_value('stock', pdata['stock'])
        item = loader.load_item()

        options_url = 'http://www.hollandandbarrett.com/browse/json/selectSkuForPDP.jsp?skuId=%s&productId=%s'
        sku_ids = response.xpath('//@data-sku-id').extract()
        if len(sku_ids) > 1:
            for sku_id in sku_ids:
                yield Request(options_url % (sku_id, pdata['id']),
                              self.parse_options,
                              meta={'item': Product(item)})
        else:
            if pdata['unit_sale_price'] < 20:
                item['shipping_cost'] = '2.99'
            yield item

Beispiel #17

0

Datei anzeigen

    def parse_product(self, response):
        """Yield one item per option row, or a zero-priced item if none.

        The base identifier/SKU is the first run of four digits found in
        the response URL. The category is built from the breadcrumb links
        after the first one, plus the text of the last breadcrumb entry.
        Each option row reuses the base item's fields and replaces
        identifier, SKU, price and name with the row's own values.
        """
        loader = ProductLoader(Product(), response=response)
        # Raw string keeps the backslashes as regex escapes.
        identifier = re.search(r'\d\d\d\d', response.url).group(0)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')
        category = response.css('.bread li a::text').extract()[1:]
        category += response.css('.bread li:last-child::text').extract()
        loader.add_value('category', category)
        image_url = response.css('.detimg a::attr(href)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        item = loader.load_item()

        options = response.css('.tbl').xpath('.//*[@class="tr"]')
        if not options:
            item['price'] = 0
            yield item
            return
        for option in options:
            loader = ProductLoader(Product(), selector=option)
            loader.add_value(None, item)
            identifier = option.xpath('.//input/@name').extract_first()
            loader.replace_value('identifier', identifier)
            loader.replace_value('sku', identifier)
            # The '.pr-now' price is loaded first; the plain cell text is
            # added after it as a further candidate value.
            loader.replace_css('price', '.tc-price .pr-now::text')
            loader.add_css('price', '.tc-price::text')
            loader.replace_css('name', '.tc-title::text')
            yield loader.load_item()

Beispiel #18

0

Datei anzeigen

    def parse_product(self, response):
        """Build an item from the page's ``utag_data`` JSON and follow options.

        Nothing is yielded when the ``utag_data`` script variable is absent
        or when it has no ``product_name`` key.  Otherwise one item is
        yielded (with a 7.50 shipping cost when its price is below 75),
        followed by requests for every option link and one request to the
        ``s.php`` endpoint for the parent product.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        utag_matches = selector.select(
            '//script[@type="text/javascript"]/text()').re(
                'var utag_data = ({.+})')
        try:
            data = json.loads(utag_matches[0])
        except IndexError:
            # No utag_data blob on this page.
            return

        loader = ProductLoader(item=Product(), selector=selector)
        try:
            loader.add_value('name', data['product_name'])
        except KeyError:
            return
        product_id = data['product_id']
        loader.add_value('identifier', product_id)
        loader.add_value('sku', product_id)
        loader.add_value('brand', data['product_attribute_trademark'])
        loader.add_value('url', urljoin(page_base, data['internal_url']))
        # Price is the first listed price plus the first listed tax amount.
        gross_price = data['product_price'][0] + data['product_taxes'][0]
        loader.add_value('price', gross_price)
        breadcrumb_links = selector.select(
            '//div[@class="breadcrumbs"]//a/text()').extract()
        loader.add_value('category', breadcrumb_links[1:])
        loader.add_xpath('image_url', '//img[@id="image"]/@src')
        item = loader.load_item()
        if item['price'] < 75:
            item['shipping_cost'] = 7.50
        yield item

        option_hrefs = selector.select(
            '//div[@id="slice_options"]//a/@href[.!="#"]').extract()
        for href in option_hrefs:
            yield Request(urljoin(page_base, href), callback=self.parse_product)

        parent_id = data.get("parent_product_id", [''])[0]
        yield Request(
            'http://www.fashionforhome.de/static/s.php?channel=child&limit=199&single_item_type=k&chunk_type=big&params[product_id]=%s&params[lazy]=1'
            % parent_id,
            callback=self.parse_php)

Beispiel #19

0

Datei anzeigen

    def parse_product(self, response):
        """Parse a product page and yield the item when its price is positive.

        The price is taken from the non-blank text nodes of the
        ``regular-price`` span, skipping the first one.  The image is the
        first gallery image whose ``src`` is shorter than 1024 characters.
        """
        hxs = HtmlXPathSelector(response)

        # A list comprehension (rather than filter()) keeps the result
        # sliceable on Python 3, where filter() returns a lazy iterator.
        price_texts = hxs.select(
            "//span[@class='regular-price']//text()").extract()
        price = [text for text in price_texts if text.strip()][1:]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
        # Breadcrumb entries between the first and the last one.
        loader.add_xpath(
            'category',
            "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
        )
        brand = hxs.select(
            "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
        ).extract()
        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', 0)
        loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
        loader.add_xpath(
            'identifier',
            "//div[@class='product-view']//input[@name='product']/@value")
        image_urls = hxs.select(
            '//img[contains(@class, "gallery-image")]/@src').extract()
        for image_url in image_urls:
            # NOTE(review): the length cap presumably skips inline data
            # URIs - confirm against the site.
            if len(image_url) < 1024:
                loader.add_value('image_url', image_url)
                break
        product = loader.load_item()
        if product['price'] > 0:
            yield product

Beispiel #20

0

Datei anzeigen

 def parse_products(self, response):
     """Parse a listing page: follow filters/pagination and yield products.

     Manufacturer filter links and the "next" pagination link are followed
     with this same callback.  Each ``.grid`` element becomes a product
     whose identifier comes, by position, from the ``ecomm_prodid`` list in
     the page scripts.  Products with a listed price are yielded directly;
     the others are sent to ``parse_product`` with the partial item in
     ``meta['product']``.  Nothing beyond the filter links is yielded when
     the ``ecomm_prodid`` list is missing.
     """
     for url in response.css(
             '.leftoption :contains("Filter by Manufacturers")').xpath(
                 'following-sibling::*//a/@href').extract():
         yield Request(response.urljoin(url), callback=self.parse_products)
     # Escape the bare "<" that follows "Estimated" before re-parsing the
     # body with a fresh Selector.
     text = re.sub('Estimated *<', 'Estimated &lt;', response.body)
     selector = Selector(text=text)
     category = selector.css('.crumword').xpath(
         './/*[@itemprop="title"]/text()').extract()
     try:
         # Raw string: '\[' inside a plain literal is an invalid escape.
         identifiers = selector.xpath('//script/text()').re(
             r'ecomm_prodid: *\[(.+)\]')[0].replace("'", '').split(',')
     except IndexError:
         return
     next_page_url = response.xpath(
         '//div[@class="pagination"]/a[@class="next"]/@href').extract()
     if next_page_url:
         yield Request(response.urljoin(next_page_url[0]),
                       callback=self.parse_products)
     for num, product in enumerate(selector.css('.grid')):
         loader = ProductLoader(item=Product(), selector=product)
         identifier = identifiers[num]
         loader.add_value('identifier', identifier)
         url = product.xpath('@href').extract_first()
         loader.add_value('url', response.urljoin(url))
         name = product.css('.gridname').xpath('text()').extract()
         loader.add_value('name', name)
         price = product.css('.gridPriceVat').xpath('text()').extract()
         if not price:
             price = 0
         loader.add_value('price', price)
         loader.add_value('sku', identifier)
         loader.add_value('category', category)
         image_url = product.css('.gridimage').xpath('.//@src').extract()
         loader.add_value('image_url', image_url)
         if price and loader.get_output_value('price') < 200:
             loader.add_value('shipping_cost', '4.99')
         # extract_first() returns None when the stock element is absent;
         # fall back to '' so one product without it cannot abort the page.
         stock_text = product.css('.pItemStock').xpath(
             'text()').extract_first() or ''
         if 'in stock' not in stock_text.strip().lower():
             loader.add_value('stock', 0)
         item = loader.load_item()
         if price:
             yield item
         else:
             # No price on the listing: fetch it from the product page.
             yield Request(response.urljoin(url),
                           self.parse_product,
                           meta={'product': Product(item)})

Beispiel #21

0

Datei anzeigen

 def parse_product(self, response):
     """Yield the product, or one item per variation when variations exist.

     The brand is read from ``response.meta['brand']``.  When the page has
     a ``.variations`` element, the JSON in the form's
     ``data-product_variations`` attribute is decoded and every variation
     yields a copy of the base item with its own name suffixes, price and
     identifier.
     """
     product_ids = response.xpath('//div[@itemscope]/@id').re('product-(.+)')
     base_loader = ProductLoader(item=Product(), response=response)
     base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     base_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
     base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     breadcrumb = response.css('.breadcrumb a::text').extract()
     base_loader.add_value('category', breadcrumb[1:])
     base_loader.add_value('brand', response.meta['brand'])
     base_loader.add_xpath('image_url', '//div/@data-original-img')
     base_loader.add_value('identifier', product_ids)
     base_item = base_loader.load_item()

     if response.css('.variations'):
         raw_variations = response.xpath(
             '//form/@data-product_variations').extract_first()
         for variation in json.loads(raw_variations):
             option_loader = ProductLoader(item=Product(base_item),
                                           response=response)
             option_values = variation['attributes'].values()
             # Restart the name from the base name, then append the label
             # of every selected attribute option.
             option_loader.replace_value('name', base_item['name'])
             for option_value in option_values:
                 option_loader.add_xpath(
                     'name', '//option[@value="%s"]/text()' % option_value)
             option_loader.replace_value('price', variation['display_price'])
             option_loader.replace_value('identifier',
                                         variation['variation_id'])
             yield option_loader.load_item()
     else:
         yield base_item

Beispiel #22

0

Datei anzeigen

    def parse(self, response):
        """Yield one product per ``<item>`` element of the feed.

        Fields in the ``g:`` namespace (registered here as
        ``http://base.google.com/ns/1.0``) supply image, category, brand,
        id, price and availability; title and link come from the plain
        child elements.  Stock is set to 0 only when availability is
        exactly ``out of stock``.
        """
        response.selector.register_namespace("g",
                                             "http://base.google.com/ns/1.0")

        for entry in response.xpath('//item'):
            image_links = entry.xpath('g:image_link/text()').extract()
            if image_links:
                image_url = image_links[0]
            else:
                image_url = ''

            product_types = entry.xpath('g:product_type/text()').extract()
            if product_types:
                # Drop the first level of the ">"-separated category path.
                category = product_types[0].split('>')[1:]
            else:
                category = ''

            brand = entry.xpath('g:brand/text()').extract()
            identifier = entry.xpath('g:id/text()').extract()

            title = entry.xpath('title/text()').extract_first()
            if title:
                title = title.replace('...', '').strip()

            price_texts = entry.xpath('g:price/text()').extract()
            if price_texts:
                price = extract_price(price_texts[0])
            else:
                price = 0

            url = entry.xpath('link/text()').extract()[0]
            availability = entry.xpath('g:availability/text()').extract()[0]

            loader = ProductLoader(item=Product(), response=response)
            field_values = (
                ('identifier', identifier),
                ('sku', identifier),
                ('name', title),
                ('image_url', image_url),
                ('price', price),
                ('url', url),
                ('brand', brand),
                ('category', category),
            )
            for field_name, field_value in field_values:
                loader.add_value(field_name, field_value)
            if availability == 'out of stock':
                loader.add_value('stock', 0)

            yield loader.load_item()

Beispiel #23

0

Datei anzeigen

    def parse_product(self, response):
        """Parse a product page and yield a single item.

        Pages without an ``itemsArray`` input yield nothing.  The category
        comes from ``response.meta['category']`` when present, otherwise
        from the second-to-last breadcrumb link.  Price and stock default
        to 0 when they cannot be read from the page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Prefer the manufacturer markup; fall back to the brand markup.
        manufacturer = response.xpath(
            '//span[@itemprop="http://schema.org/manufacturer"]/text()'
        ).extract_first()
        if manufacturer:
            brand = manufacturer
        else:
            brand = response.xpath(
                '//span[@itemprop="http://schema.org/brand"]/text()'
            ).extract_first()

        item_ids = hxs.select('//input[@id="itemsArray"]/@value').extract()
        if not item_ids:
            return
        mpn_texts = response.xpath('//*[@itemprop="mpn"]/text()').extract()
        mpn = mpn_texts[0].strip()

        loader = ProductLoader(item=Product(), selector=hxs)
        main_image = response.css(
            'img#productMainImage::attr(src)').extract_first()
        if main_image:
            loader.add_value('image_url', response.urljoin(main_image))

        category = response.meta.get('category', '')
        if not category:
            crumbs = hxs.select(
                '//div[@id="breadcrumb"]/ul/li/a/text()').extract()
            category = crumbs[-2].strip()
        loader.add_value('category', category)

        name_words = response.xpath('//div[@id="product"]//h1//text()').re(
            '\S+')
        loader.add_value('name', name_words)
        loader.add_xpath('url', 'link[@rel="canonical"]/@href')
        loader.add_value('url', response.url)
        # The last itemsArray value is used as the identifier.
        loader.add_value('identifier', item_ids.pop())
        loader.add_value('brand', brand)
        loader.add_value('sku', mpn)

        price_cells = hxs.select(
            '//table[contains(@class, "pricing")]//td[@class="threeColTd"][1]/text()'
        ).extract()
        # Keep only the text before any "(" and drop the pound sign.
        price_text = ''.join(price_cells).strip().split('(')[0].strip()
        price_text = price_text.replace(u'\xa3', '')
        if not price_text:
            loader.add_value('price', 0)
        else:
            rounded = extract_price(price_text).quantize(Decimal('.01'))
            loader.add_value('price', rounded)

        stock_digits = response.css('span.availability::text').re('\d+')
        if stock_digits:
            loader.add_value('stock', stock_digits[0])
        else:
            loader.add_value('stock', 0)

        yield loader.load_item()

Beispiel #24

0

Datei anzeigen

Datei: doorsofdistinction.py Projekt: oceancloud82/scraping

    def parse_products(self, response):
        """Parse a door page and yield one item per door size.

        Two page layouts are handled:

        1. Rows containing "Code:" inside tables in ``#E5E5E5`` (or, failing
           that, ``#FFFFFF``) cells.  The sibling rows that follow, up to
           the next "Code:" row, carry sizes (rows containing " x") and
           prices.  If at least one priced size is found the method stops
           after this layout.
        2. A price grid starting at the first ``td.Pricegridlabel`` cell,
           with one column per option and one row per size.  Before this
           layout is tried, every image link ending in ``html`` is followed
           with this same callback.

        ``self.parse_frames`` is used as a fallback when the price grid is
        missing or contains no option codes.  Identifiers are built from the
        SKU, the digits of the size name and the page file name, and are
        suffixed with ``-d`` until unique within ``self.ids_seen``.
        """
        try:
            base_url = get_base_url(response)
        except AttributeError:
            return

        if response.xpath(
                '//font[contains(text(), "Recommended Door Treatment")]'
        ) and not self.treatment:
            for treatment in self.parse_treatment(response):
                yield treatment

        identifiers = []
        price_found = False
        for product in response.xpath(
                '//td[@bgcolor="#E5E5E5"]//table/tr[contains(., "Code:")]'
        ) or response.xpath(
                '//td[@bgcolor="#FFFFFF"]//table/tr[contains(., "Code:")]'):
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', response.url)
            product_name = ' '.join(product.xpath('.//text()').re(r'\S+'))
            sku = re.findall('Code: *([^:]+)$', product_name)
            if not sku:
                self.log('No SKU found for %s on %s' %
                         (product_name, response.url))
                continue
            sku = sku[0].replace(' ', '')
            loader.add_value('sku', sku)
            item = loader.load_item()
            # The last price seen is carried over to later size rows that
            # do not list one of their own.
            price = ''
            for size in product.xpath('./following-sibling::tr'):
                if size.xpath('.//*[contains(.//text(), "Code:")]'):
                    # Next product starts here.
                    break
                if not size.xpath('./td[contains(.//text(), " x")]'):
                    # Not a size row; it may still carry a price.
                    try:
                        price = size.xpath('td[3]//td/text()').extract()[-1]
                    except IndexError:
                        pass
                    continue
                size_name = size.xpath('td[1]//text()').extract()
                if not size_name:
                    continue
                loader = ProductLoader(item=Product(item), selector=size)
                loader.add_value('name', product_name)
                loader.add_value('name', size_name)
                if size.xpath('td[3]'):
                    try:
                        price = size.xpath('td[3]//td/text()').extract()[-1]
                    except IndexError:
                        pass
                if not price:
                    # No price known for this size: skip it.
                    continue
                price_found = True
                loader.add_value('price', price)
                identifier = sku + '-' + '-'.join(
                    re.findall(r'\d+', size_name[0]))
                identifier += '-' + response.url.split('/')[-1].split(
                    '_')[0].split('.')[0]
                # Avoid duplicated identifiers.  Loop (as the price-grid
                # layout below does) because the suffixed value may itself
                # already be taken.
                while identifier in identifiers or identifier in self.ids_seen:
                    identifier += '-d'
                identifiers.append(identifier)
                self.ids_seen.append(identifier)
                loader.add_value('identifier', identifier)
                final_item = loader.load_item()
                image_url = response.xpath(
                    '//*[contains(text(), "Click on")]/../../..//img/@src'
                ).extract() or response.xpath(
                    '//td[@bgcolor="#E5E5E5"]//img/@src').extract()
                # Prefer the image of the "doorname" block whose label
                # appears in the item name.
                for image in response.xpath('//*[@class="doorname"]'):
                    image_name = image.xpath('font/text()').extract()
                    if image_name and image_name[0].strip(
                    ) in final_item['name']:
                        image_url = image.xpath('./../p[2]//img/@src').extract(
                        ) or image.xpath('./../../p[2]//img/@src').extract()
                        if image_url:
                            break
                # Guard the lookup: a page without any matching image must
                # not abort the remaining sizes.
                if image_url:
                    final_item['image_url'] = urljoin(base_url, image_url[0])
                yield loader.load_item()
        if price_found:
            return

        for url in response.xpath('//a[img]/@href').extract():
            if url.endswith('html'):
                yield Request(urljoin(base_url, url),
                              callback=self.parse_products)
        try:
            product = response.xpath('//td[@class="Pricegridlabel"]')[0]
        except IndexError:
            for product in self.parse_frames(response):
                yield product
            return
        identifiers = []
        name = ' '.join(
            product.xpath('./following-sibling::td[1]//text()').extract())
        image_url = response.xpath(
            '//*[contains(text(), "Click on")]/../preceding-sibling::*[1]//img/@src'
        ).extract() or response.xpath(
            '//img[contains(@alt, "door")]/@src').extract()
        found_sku = False
        for i, option in enumerate(
                product.xpath('./../following-sibling::tr[1]/td')):
            option_name = ' '.join(option.xpath('.//text()').extract())
            # The option's code sits in the same column of the next row.
            code = ''.join(
                option.xpath('./../following-sibling::tr[1]/td[%d]//text()' %
                             (i + 1)).extract())
            sku = ''.join(re.findall('CODE: *([^: ]+)$', code))
            if not sku:
                continue
            found_sku = True
            for size in option.xpath('./../following-sibling::tr'):
                if not size.xpath('./td[1][contains(.//text(), " x")]'):
                    continue
                size_name = size.xpath('td[1]//text()').extract()
                loader = ProductLoader(item=Product(), selector=size)
                loader.add_value('name', (name, option_name))
                loader.add_value('name', size_name)
                loader.add_value('sku', sku)
                identifier = sku + '-' + '-'.join(
                    re.findall(r'\d+', size_name[0]))
                identifier += '-' + response.url.split('/')[-1].split(
                    '_')[0].split('.')[0]
                # Avoid duplicated identifiers.
                while identifier in identifiers or identifier in self.ids_seen:
                    identifier += '-d'
                identifiers.append(identifier)
                self.ids_seen.append(identifier)
                loader.add_value('identifier', identifier)
                # Size rows have a leading size cell, so option column i
                # maps to td[i + 2].
                loader.add_xpath('price', 'td[%d]//text()' % (i + 2))
                if image_url:
                    loader.add_value('image_url',
                                     urljoin(base_url, image_url[0]))
                loader.add_value('url', response.url)
                yield loader.load_item()
        if not found_sku:
            for product in self.parse_frames(response):
                yield product

Beispiel #25

0

Datei anzeigen

Datei: westwingnow.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Yield the product, or one item per entry of the option selector.

        Without options, the base item is yielded only when an identifier
        was extracted.  With options, each ``<option>`` yields a copy of the
        base item whose identifier, price, image and stock are replaced by
        the option's own values and whose name gains the option text.
        """
        breadcrumb = response.xpath(
            '//li[@class="blockBreadcrumb__item"]/a/text()').extract()
        categories = breadcrumb[-3:]

        base_loader = ProductLoader(item=Product(), response=response)
        base_loader.add_xpath('identifier',
                              '//input[@name="simpleSku"]/@value')
        base_loader.add_xpath('sku', '//input[@id="configSku"]/@value')
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name',
                              '//h1[contains(@class, "__heading")]/text()')
        base_loader.add_xpath('name',
                              '//input[@name="simpleSku"]/../span/text()')
        base_loader.add_xpath('image_url',
                              '//div[@class="layoutImage"]//img/@src')
        base_loader.add_xpath('price', '//input[@id="price"]/@value')
        base_loader.add_xpath('brand', '//input[@id="brand"]/@value')
        base_loader.add_value('category', categories)
        base_loader.add_xpath('stock', '//@data-instock')
        item = base_loader.load_item()

        price_xpath = '//div[@data-simple-sku="%s"]//span[contains(@class, "actualPrice")]/text()'
        image_xpath = '//div[@data-simple-sku="%s"]/a[contains(@class, "link_selected")]/@data-product-image'

        options = response.xpath('//select[@id="js-simple-selector"]/option')
        if options:
            for option in options:
                option_loader = ProductLoader(item=Product(item),
                                              selector=option)
                option_loader.replace_xpath('identifier', './@value')
                option_loader.add_xpath('name', './text()')
                # Price and image are looked up page-wide by the option's
                # simple SKU.
                simple_sku = option_loader.get_output_value('identifier')
                option_price = response.xpath(
                    price_xpath % simple_sku).extract()
                option_loader.replace_value('price', option_price)
                option_image = response.xpath(
                    image_xpath % simple_sku).extract()
                option_loader.replace_value('image_url', option_image)
                option_loader.replace_xpath('stock', './@data-instock')
                yield option_loader.load_item()
        elif base_loader.get_output_value('identifier'):
            yield item

Beispiel #26

0

Datei anzeigen

    def parse(self, response):
        """Yield one product per row of the CSV response body.

        ``Supplier`` is used as both brand and category, ``lens-name`` as
        the name, ``ID`` (if present) as the identifier, and the column
        named by ``self.price_field`` as the price.  The URL is left empty.
        """
        for row in csv.DictReader(StringIO(response.body)):
            product_id = row.get('ID')
            supplier = row['Supplier'].decode('utf-8')
            lens_name = row['lens-name'].decode('utf-8')

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', lens_name)
            loader.add_value('identifier', product_id)
            loader.add_value('url', '')
            loader.add_value('brand', supplier)
            loader.add_value('category', supplier)
            loader.add_value('price', row[self.price_field])

            yield loader.load_item()

Beispiel #27

0

Datei anzeigen

Datei: powertoolworld_spider.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Parse a product page and yield it, or one item per configuration.

        The brand is the first entry of ``self.brands`` that the lower-cased
        name starts with.  When the page embeds a ``spConfig`` product
        configuration, every configured product yields a deep copy of the
        base item with an extended identifier and name and a price equal to
        the base price plus the option surcharges; otherwise the base item
        itself is yielded.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), response=response)
        name = response.css('.product-name').xpath('h1/text()').extract_first()
        loader.add_value('name', name)
        loader.add_value('url', response.url)

        lowered_name = name.lower()
        matched_brand = next(
            (known for known in self.brands if lowered_name.startswith(known)),
            None)
        if matched_brand is not None:
            loader.add_value('brand', matched_brand.title())

        breadcrumb = response.css('.breadcrumbs').xpath(
            './/a/span/text()').extract()
        loader.add_value('category', breadcrumb[1:])

        sku_texts = hxs.select(
            '//*[@id="product_addtocart_form"]//div[@class="expert-notes "]//span[contains(text(), "SKU: ")]/text()'
        ).extract()
        if sku_texts:
            sku = sku_texts[0].replace("SKU: ", '')
        else:
            sku = ''
        loader.add_value('sku', sku)

        product_id = hxs.select('//input[@name="product"]/@value').extract()[0]
        loader.add_value('identifier', product_id + '-new')

        image_srcs = hxs.select('//img[@id="image-main"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url', urljoin_rfc(base_url, image_srcs[0]))

        script_prices = response.xpath('//script/text()').re('price":"(.+?)"')
        if script_prices:
            price = extract_price(script_prices[0])
        else:
            price = 0
        loader.add_value('price', price)

        # "In stock" and "Back Order" both count as available.
        available = hxs.select(
            '//div[@class="availability in-stock"]//div[@class="value" and contains(text(), "In stock")]'
        ) or hxs.select(
            '//p[@class="availability back-order"]//span[@class="value" and contains(text(), "Back Order")]'
        )
        if not available:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 100:
            loader.add_value('shipping_cost', 6.50)

        item = loader.load_item()

        config_match = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if not config_match:
            yield item
            return

        config = json.loads(config_match.group(1))
        name_suffixes = {}
        surcharges = {}
        for attribute in config['attributes'].itervalues():
            for choice in attribute['options']:
                label = choice['label']
                for child_id in choice['products']:
                    # Each label is appended as " - <label>", so the suffix
                    # always starts with the separator.
                    name_suffixes[child_id] = ' - '.join(
                        (name_suffixes.get(child_id, ''), label))
                    surcharges[child_id] = surcharges.get(
                        child_id, 0) + extract_price(choice['price'])

        base_price = extract_price(config['basePrice'])
        for child_id, suffix in name_suffixes.iteritems():
            child_item = deepcopy(item)
            child_item['identifier'] += '-' + child_id
            child_item['name'] += suffix
            child_item['price'] = base_price + surcharges[child_id]
            yield child_item

Beispiel #28

0

Datei anzeigen

Datei: foxsoccershop.py Projekt: oceancloud82/scraping

    def parse_product(self, response):
        """Build one item per attribute combination and request stock data.

        The page price is multiplied by ``self.exchange_rate``.  Every
        combination of the attribute options found in the ``attributes``
        fieldset becomes an item whose name and identifier are extended
        with the option labels.  The items are not yielded here: they are
        collected, together with their attribute pairs, in
        ``meta['items']`` of a request to the ``InventoryCheck.json``
        endpoint handled by ``self.parse_stock``.
        """
        identifier = response.xpath("//div[@class='item-number']/text()").extract_first()
        sku = identifier
        # The identifier is the item number with every "a"/"A" removed.
        identifier = re.sub(u'a', u'', identifier, flags=re.IGNORECASE)
        name = response.xpath("//div[@class='product-title']/h1/text()").extract_first()
        price = response.xpath("//div[@class='price']//span[@class='disc-price']/text()").extract()
        if not price:
            price = response.xpath("//div[@class='price']/div[@class='regular-price']/span[@class]/text()").extract()
        if price:
            price = price[0].strip('$').replace(",", "")
        else:
            price = '0.00'
        price = Decimal(price)
        # convert using xe.com
        price = price * self.exchange_rate
        image_url = response.xpath("//a[@id='mainImage']/img/@src").extract_first()
        categories = response.xpath('//div[@id="breadcrumbs-"]/ul/li/a//text()')[1:-1].extract()
        try:
            brand = response.xpath('//b[contains(., "BRAND:")]/following-sibling::text()[1]').extract_first().title()
        except AttributeError:
            # extract_first() returned None: no brand on the page.
            brand = ''

        attributes = response.xpath('//fieldset[@class="attributes"]//li')
        options = []
        # Maps option value -> human readable option label.
        option_names = {}
        for option in response.xpath('//select[@name="attrValue_1"]/option[@value!=""]'):
            opt_val = option.xpath('./@value').extract()
            opt_name = option.xpath('./span/text()').extract()
            if opt_val and opt_name:
                option_names[opt_val[0]] = opt_name[0]
        for attr in attributes:
            attr_name = attr.xpath('.//input[@name="attrName_1"]/@value').extract()
            if attr_name:
                attr_name = attr_name[0]
            else:
                continue
            attr_options = []
            attr_values = attr.xpath('.//select/option[@value!=""]/@value').extract()
            for attr_value in attr_values:
                attr_options.append((attr_name, attr_value))
            if not attr_values:
                # No <select>: the attribute has a single fixed value.
                attr_value = attr.xpath('.//input[@name="attrValue_1"]/@value')[0].extract()
                attr_options.append((attr_name, attr_value))
            if attr_options:
                options.append(attr_options)
        # itertools.product() over zero attribute lists yields exactly one
        # empty combination, so a product without attributes still gets one
        # item (with empty attributes) from the loop below.  The former
        # "if not options:" fallback after the loop tested the product
        # iterator itself, which is always truthy, and could never run.
        options = itertools.product(*options)
        items = []
        for option in options:
            opt = [option_names.get(v, '') for _, v in option]
            opt = [o for o in opt if o]
            option_name = ' '.join(opt).strip()
            opt = [SIZES_DICT.get(o.lower(), o) for o in opt]
            option_id = ':'.join(opt).strip()

            option_name = re.sub('size', '', option_name, flags=re.IGNORECASE).strip()
            # The size is the label of the last attribute's value.
            size = option_names.get(option[-1][-1], '') if option and option[-1] else ''
            size = re.sub('size', '', size, flags=re.IGNORECASE).strip()
            if option_name:
                product_name = name + ' (' + option_name + ')'
            else:
                product_name = name
            if option_id:
                product_identifier = identifier + u':' + option_id.strip().lower()
            else:
                product_identifier = identifier

            loader = ProductLoader(Product(), option)
            loader.add_value('name', product_name)
            loader.add_value('url', response.url)
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', sku)
            loader.add_value('price', price)
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            for category in categories:
                loader.add_value('category', category)

            product = loader.load_item()
            product['metadata'] = {'size': size}

            player = [p for p in self.players if p[1].lower() in product_name.lower()]
            if player:
                product['metadata']['player'] = player[0][1].title()
                product['metadata']['number'] = player[0][2]

            if len(self.shipping_requests) < 5:
                self.make_shipping_request(response)
            item = {'item': product}
            item['attributes'] = ()
            for k, v in option:
                item['attributes'] += ((k, v),)
            items.append(item)

        product_id = response.xpath('//input[@name="productId"]/@value')[0].extract()
        yield Request('http://www.foxsoccershop.com/InventoryCheck.json?productId={}'.format(product_id),
                      meta={'items': items},
                      callback=self.parse_stock)

Beispiel #29

0

Datei anzeigen

    def parse_product(self, response):
        """Yield one item per row of the product's price list table.

        Page-level fields (url, name, brand, identifier, SKU, category,
        image) are loaded first.  Each non-heading row of
        ``product_price_list`` then yields a copy extended with the row's
        name, identifier, price, stock and shipping cost.  When the table
        has no such rows, only a log message is emitted.
        """
        base_url = get_base_url(response)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        title_parts = response.xpath(
            '//h1[@class="PrpdocutName"]//text()').extract()
        name = ''.join(title_parts).strip()
        product_loader.add_value('name', name)

        logo_alts = response.xpath(
            '//span[@class="parent_product_manufacture_logo"]/img/@alt'
        ).extract()
        if logo_alts:
            brand = logo_alts[0].strip()
        else:
            brand = ''
        product_loader.add_value('brand', brand)

        parent_ids = response.xpath(
            '//input[@name="products_id"]/@value').extract()
        if not parent_ids:
            # Fall back to the id embedded in the raw page body.
            parent_ids = re.findall('custom_product_id=(\d+)', response.body)
        parent_id = parent_ids[0]
        product_loader.add_value('identifier', parent_id)
        product_loader.add_value('sku', parent_id)

        breadcrumb = response.xpath(
            '//div[@class="breadcrumb"]//span[@itemprop="title"]/text()'
        ).extract()
        product_loader.add_value('category', breadcrumb[1:-1])

        image_srcs = response.xpath(
            '//span[@class="image_container"]/img/@src').extract()
        if image_srcs:
            product_loader.add_value('image_url',
                                     response.urljoin(image_srcs[0]))

        product = product_loader.load_item()

        # Candidate inputs holding the row's own id, in order of preference.
        option_id_xpaths = (
            './/input[@name="sub_products_id[]"]/@value',
            './/input[@name="email_me_products_id"]/@value',
            './/input[@name="products_id"]/@value',
        )

        options = response.xpath(
            '//table[@id="product_price_list"]//tr[not(contains(@class, "HeadingRow"))]'
        )
        if options:
            for option in options:
                option_loader = ProductLoader(item=Product(product),
                                              response=response)
                subproduct_names = option.xpath(
                    'td/div[@class="subproduct_name"]/text()').extract()
                if subproduct_names:
                    option_loader.add_value(
                        'name', name + ' ' + subproduct_names[0].strip())

                option_ids = []
                for id_xpath in option_id_xpaths:
                    option_ids = option.xpath(id_xpath).extract()
                    if option_ids:
                        break

                if option_ids:
                    option_loader.add_value(
                        'identifier',
                        product['identifier'] + '-' + option_ids[0])
                else:
                    log.msg(' >>>>>> Possible wrong identifier: ' +
                            response.url)

                # The SKU mirrors whatever identifier the loader resolves.
                option_loader.add_value(
                    'sku', option_loader.get_output_value('identifier'))

                price_texts = option.xpath(
                    './/span[@class="productSpecialPrice"]/text()').extract()
                if not price_texts:
                    price_texts = option.xpath(
                        './/span[@class="listing-price"]/text()').extract()
                if price_texts:
                    option_loader.add_value('price', price_texts[0])
                else:
                    option_loader.add_value('price', 0)

                in_stock = option.xpath(
                    './/span[@class="instock" and text()="In Stock"]'
                ).extract()
                if not (in_stock and option_loader.get_output_value('price')):
                    option_loader.add_value('stock', 0)
                if option_loader.get_output_value('price') < 70:
                    option_loader.add_value('shipping_cost', Decimal('9.90'))
                yield option_loader.load_item()
        else:
            log.msg(' >>>>> ERROR: NO OPTIONS' + response.url)
            #if product['price'] < 70:
            #    product['shipping_cost'] = Decimal('9.90')
            '''

Beispiel #30

0

Datei anzeigen

    def parse_product(self, response):
        """Build one product item from a product page and store it by sku.

        The item is filled through a ``ProductLoader`` (sku, shipping cost,
        identifier, name, price, category, brand, stock) and appended to
        ``self.products`` under its sku. Pages without an ``h1`` or
        ``h2[@itemprop="name"]`` title are skipped.

        NOTE(review): this snippet ends right after the item is stored; any
        later steps of the method are not visible here.
        """
        hxs = HtmlXPathSelector(response)
        # Not used in the visible part of this method.
        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('sku', '//script/@data-flix-sku')
        # Look up a per-sku shipping cost in self.shipping_costs; only a
        # truthy value is added to the item.
        shipping_cost = self.shipping_costs.get(loader.get_output_value('sku'),
                                                None)
        if shipping_cost:
            loader.add_value('shipping_cost', extract_price(shipping_cost))

        loader.add_xpath('identifier',
                         '//input[contains(@id, "SKUID")]/@value')
        # Prefer the <h1> text; fall back to the itemprop="name" <h2>.
        name = response.xpath('//h1/text()').extract() or response.xpath(
            '//h2[@itemprop="name"]/text()').extract()
        if not name:
            # No title found: nothing is stored for this page.
            return
        name = name[0]
        loader.add_value('name', name)
        loader.add_xpath('price', '//span[@class="TotalPrice"]/text()')
        categories = response.xpath(
            '//a[@class="CMSBreadCrumbsLink"]/text()').extract()
        if not categories:
            categories = ''
        loader.add_value('category', categories)
        # Pick the first listed brand that the product name starts with
        # (compared in title case); the for/else leaves brand empty when
        # no candidate matches.
        for brand in hxs.select(
                '//div[@title="Brand"]/following-sibling::div//span/@title'
        ).extract():
            if name.title().startswith(brand.title()):
                break
        else:
            brand = ''
        loader.add_value('brand', brand)
        # NOTE(review): this is added even when a per-sku cost was added
        # above; which value ends up on the item depends on ProductLoader's
        # output processing — confirm.
        loader.add_value('shipping_cost', 19.99)
        # Mark as out of stock unless one extracted text node is exactly
        # 'In stock'.
        if 'In stock' not in hxs.select(
                '//span[@class="stock available"]/text()').extract():
            loader.add_value('stock', 0)

        product = loader.load_item()
        # presumably self.products maps sku -> list of items; verify against
        # the class definition.
        self.products[product['sku']].append(product)