def parse_book(self, response):
    """Return a ``BooksCrawlerItem`` carrying the book title of this page.

    The title is the first text node of the ``<h1>`` directly under the
    element whose class is exactly ``col-sm-6 product_main``. It is stored
    under the ``Title`` field through an ``ItemLoader``.
    """
    title_nodes = response.xpath(
        '//*[@class="col-sm-6 product_main"]/h1/text()'
    )

    loader = ItemLoader(item=BooksCrawlerItem(), response=response)
    loader.add_value('Title', title_nodes.extract_first())
    return loader.load_item()
def parse_book(self, response):
    """Yield a single ``BooksCrawlerItem`` with the page title and URL.

    ``title`` is the first ``h1`` text node found by CSS selector; ``url``
    is the URL of the request that produced ``response``.
    """
    book = BooksCrawlerItem()
    book['title'] = response.css('h1::text').extract_first()
    book['url'] = response.request.url
    yield book
def parse_book(self, response):
    """Load a book's title, price and image URL into a ``BooksCrawlerItem``.

    Each value is the first match of its selector. The image ``src`` has
    every ``../..`` replaced by the site root so that it becomes an
    absolute URL before it is stored under ``image_urls``.
    """
    heading = response.css('h1::text').extract_first()
    price_text = response.xpath(
        '//*[@class="price_color"]/text()'
    ).extract_first()

    # First <img> on the page; its relative prefix is swapped for the host.
    relative_src = response.xpath('//img/@src').extract_first()
    absolute_src = relative_src.replace('../..', 'http://books.toscrape.com')

    loader = ItemLoader(item=BooksCrawlerItem(), response=response)
    scraped = (
        ('title', heading),
        ('price', price_text),
        ('image_urls', absolute_src),
    )
    for field_name, field_value in scraped:
        loader.add_value(field_name, field_value)
    return loader.load_item()
def parse_book(self, response):
    """Yield the book's title and page URL as a plain dict.

    After the dict has been yielded, the same two values are also loaded
    into a ``BooksCrawlerItem`` through an ``ItemLoader``.
    """
    loader = ItemLoader(item=BooksCrawlerItem(), response=response)
    book_title = response.xpath('//h1/text()').extract_first()
    page_url = response.request.url

    yield {'Title': book_title, 'Url': page_url}

    # NOTE(review): the item loaded below is neither yielded nor returned,
    # so only the dict above leaves this callback -- confirm that is intended.
    for field_name, field_value in (('Title', book_title), ('Url', page_url)):
        loader.add_value(field_name, field_value)
    loader.load_item()
def parse_book(self, response):
    """Collect a book's title, price and image URL into a loaded item.

    The price is read from the ``<p class="price_color">`` that is a
    following sibling of an ``<h1>``. The image ``src`` has every ``../..``
    replaced by the site root before being stored under ``image_urls``.
    """
    price_nodes = response.xpath(
        '//h1/following-sibling::p[@class="price_color"]/text()'
    )
    img_src = response.xpath('//img/@src').extract_first()

    values = {
        'title': response.css('h1::text').extract_first(),
        'price': price_nodes.extract_first(),
        'image_urls': img_src.replace('../..', 'http://books.toscrape.com'),
    }

    loader = ItemLoader(item=BooksCrawlerItem(), response=response)
    for name, value in values.items():
        loader.add_value(name, value)
    return loader.load_item()
def parse_book(self, response):
    """Scrape a book page; yield a loaded item, then a dict of all fields.

    Yields:
        First a ``BooksCrawlerItem`` loaded with ``title``, ``price`` and
        ``image_urls``; then a plain dict carrying those values plus the
        rating, description, product-information entries and page URL.
    """
    loader = ItemLoader(item=BooksCrawlerItem(), response=response)

    title = response.css("h1::text").extract_first()
    price = response.xpath(
        "//*[@class='price_color']/text()").extract_first()

    # Resolve the relative src against the response URL. The former string
    # replace with a host ending in "/" produced a double slash
    # ("http://books.toscrape.com//media/...") and crashed on a missing image.
    image_src = response.xpath('//img/@src').extract_first()
    image_url = response.urljoin(image_src) if image_src else None

    # Drop the "star-rating" token from the class attribute and trim the
    # separator whitespace left behind, so only the remaining word is kept.
    rating_class = response.xpath(
        "//*[contains(@class,'star-rating')]/@class").extract_first()
    rating = (
        rating_class.replace("star-rating", '').strip()
        if rating_class else None
    )

    description = response.xpath(
        "//*[@id='product_description']/following-sibling::p/text()"
    ).extract_first()

    # Product information entries, looked up by their label.
    upc = product_info(response, 'UPC')
    product_type = product_info(response, 'Product Type')
    price_without_tax = product_info(response, 'Price (excl. tax)')
    price_with_tax = product_info(response, 'Price (incl. tax)')
    tax = product_info(response, 'Tax')
    availability = product_info(response, 'Availability')
    number_of_reviews = product_info(response, 'Number of reviews')

    url = response.request.url

    loader.add_value('title', title)
    loader.add_value('price', price)
    loader.add_value('image_urls', image_url)
    yield loader.load_item()

    yield {
        'title': title,
        'price': price,
        'rating': rating,
        'image_url': image_url,
        'description': description,
        'upc': upc,
        'product_type': product_type,
        'price_without_tax': price_without_tax,
        'price_with_tax': price_with_tax,
        'tax': tax,
        'availability': availability,
        'number_of_reviews': number_of_reviews,
        'url': url,
    }
def parse_book(self, response):
    """Scrape a book page; yield an image item, then a dict of book details.

    Yields:
        First a ``BooksCrawlerItem`` loaded with ``image_urls`` only; then a
        plain dict with title, price, rating, description and the
        product-information entries.
    """
    loader = ItemLoader(item=BooksCrawlerItem(), response=response)

    title = response.css('h1::text').extract_first()
    price = response.xpath(
        '//*[@class="price_color"]/text()').extract_first()

    # Resolve the relative src against the response URL. The former string
    # replace with a host ending in "/" produced a double slash
    # ("http://books.toscrape.com//media/...") and crashed on a missing image.
    image_src = response.xpath('//img/@src').extract_first()
    image_urls = response.urljoin(image_src) if image_src else None

    loader.add_value('image_urls', image_urls)
    yield loader.load_item()

    # Drop the "star-rating" token from the class attribute and trim the
    # separator whitespace left behind, so only the remaining word is kept.
    rating_class = response.xpath(
        '//*[contains(@class, "star-rating")]/@class').extract_first()
    rating = (
        rating_class.replace('star-rating', '').strip()
        if rating_class else None
    )

    description = response.xpath(
        '//*[@id="product_description"]/following-sibling::p/text()'
    ).extract_first()

    # Product information data points, looked up by their label.
    upc = product_info(response, 'UPC')
    product_type = product_info(response, 'Product Type')
    # The two tax labels were previously swapped, so each key carried the
    # other row's value; each key now reads the row its name describes.
    price_including_tax = product_info(response, 'Price (incl. tax)')
    price_excluding_tax = product_info(response, 'Price (excl. tax)')
    tax = product_info(response, 'Tax')
    availability = product_info(response, 'Availability')
    number_of_reviews = product_info(response, 'Number of reviews')

    # image_urls is not repeated here; it was emitted in the item above.
    yield {
        'title': title,
        'price': price,
        'rating': rating,
        'description': description,
        'upc': upc,
        'product_type': product_type,
        'price_including_tax': price_including_tax,
        'price_excluding_tax': price_excluding_tax,
        'tax': tax,
        'availability': availability,
        'number_of_reviews': number_of_reviews,
    }
def parse_book(self, response):
    """Scrape a book page and return it as a populated ``BooksCrawlerItem``.

    The item receives the page URL, title, price, absolute image URL,
    rating, description and the product-information entries.
    """
    page_url = response.url
    title = response.css('h1::text').extract_first()
    price = response.xpath('//*[@class="price_color"]/text()').extract_first()

    # Resolve the relative src against the response URL. The former string
    # replace with a host ending in "/" produced a double slash
    # ("http://books.toscrape.com//media/...") and crashed on a missing image.
    image_src = response.xpath('//img/@src').extract_first()
    image_url = response.urljoin(image_src) if image_src else None

    # Strip the "star-rating " prefix from the class attribute; guard the
    # no-match case, where extract_first() gives None.
    rating_class = response.xpath(
        '//*[contains(@class, "star-rating")]/@class').extract_first()
    rating = (
        rating_class.replace('star-rating ', '') if rating_class else None
    )

    description = response.xpath(
        '//*[@id="product_description"]/following-sibling::p/text()'
    ).extract_first()

    # Product information data points, looked up by their label.
    upc = product_info(response, 'UPC')
    product_type = product_info(response, 'Product Type')
    price_without_tax = product_info(response, 'Price (excl. tax)')
    price_with_tax = product_info(response, 'Price (incl. tax)')
    tax = product_info(response, 'Tax')
    availability = product_info(response, 'Availability')
    number_of_reviews = product_info(response, 'Number of reviews')

    item = BooksCrawlerItem()
    item['page_url'] = page_url
    item['title'] = title
    item['price'] = price
    item['image_url'] = image_url
    item['rating'] = rating
    item['description'] = description
    item['upc'] = upc
    item['product_type'] = product_type
    item['price_without_tax'] = price_without_tax
    item['price_with_tax'] = price_with_tax
    item['tax'] = tax
    item['availability'] = availability
    item['number_of_reviews'] = number_of_reviews
    return item
def parse_book(self, response):
    """Scrape a book page and return a loaded ``BooksCrawlerItem``.

    Extracts the title, price, absolute image URL, rating, description
    (with the `` ...more`` marker removed) and the product-information
    entries, and feeds them to an ``ItemLoader``. A value whose selector
    matches nothing is passed on as ``None`` instead of raising.
    """
    loader = ItemLoader(item=BooksCrawlerItem(), response=response)

    title = response.xpath('//h1/text()').extract_first()
    price = response.xpath(
        '//*[@class="price_color"]/text()').extract_first()

    # extract_first() gives None when nothing matches, so every string
    # clean-up below is guarded rather than calling .replace on None.
    image_urls = response.xpath('//img/@src').extract_first()
    if image_urls is not None:
        image_urls = image_urls.replace('../..', 'http://books.toscrape.com')

    rating = response.xpath(
        '//*[contains(@class, "star-rating")]/@class').extract_first()
    if rating is not None:
        rating = rating.replace('star-rating ', '')

    description = response.xpath(
        '//*[@id="product_description"]/following-sibling::p/text()'
    ).extract_first()
    if description is not None:
        description = description.replace(' ...more', '')

    # Product information entries, looked up by their label.
    upc = product_table(response, 'UPC')
    price_excl = product_table(response, 'Price (excl. tax)')
    price_incl = product_table(response, 'Price (incl. tax)')
    tax = product_table(response, 'Tax')
    p_type = product_table(response, 'Product Type')
    stock = product_table(response, 'Availability')
    reviews = product_table(response, 'Number of reviews')

    loader.add_value('title', title)
    loader.add_value('price', price)
    loader.add_value('image_urls', image_urls)
    loader.add_value('rating', rating)
    loader.add_value('description', description)
    loader.add_value('upc', upc)
    loader.add_value('price_excl', price_excl)
    loader.add_value('price_incl', price_incl)
    loader.add_value('tax', tax)
    loader.add_value('p_type', p_type)
    loader.add_value('stock', stock)
    loader.add_value('reviews', reviews)
    return loader.load_item()