def parse(self, response):
    """Build one ``ComercioNew`` item from an article page.

    A single ``ItemLoader`` is reused for the whole page: before each group
    of fields is added, the loader's ``selector`` is re-pointed at the page
    region those fields are read from. ``TakeFirst`` is installed as the
    loader's default output processor.

    Args:
        response: The page response; only its ``css`` method is used.

    Yields:
        The item returned by ``load_item()`` once every field was added.
    """
    # Field names for the reaction counters, paired positionally with the
    # ``div.score`` nodes found on the page.
    reaction_names = (
        'Indignado',
        'Triste',
        'Indiferente',
        'Sorprendido',
        'Contento',
    )

    info_loader = ItemLoader(item=ComercioNew(),
                             selector=response.css('div.date'))
    info_loader.default_output_processor = TakeFirst()
    info_loader.add_css('Date', 'div::text')

    info_loader.selector = response.css('div.title')
    info_loader.add_css('Title', 'h1::text')

    info_loader.selector = response.css('div.social-nav')
    info_loader.add_css('Views', 'div.pageviews::text')

    # zip() stops at the shorter sequence, so a page exposing fewer than
    # five score nodes no longer raises IndexError; extra nodes beyond the
    # five known names are ignored, as before.
    score_nodes = response.css('div.rating>div.score')
    for reaction_name, score_node in zip(reaction_names, score_nodes):
        info_loader.selector = score_node
        info_loader.add_css(reaction_name, '.number::text')

    info_loader.selector = response.css('div.right-col>div.info')
    info_loader.add_css('Editor', 'div.signature>div::text')

    # Category and Tag are both read from the breadcrumbs region.
    info_loader.selector = response.css('div.breadcrumbs')
    info_loader.add_css('Category', 'a::text')
    info_loader.add_css('Tag', 'a.highlighted::text')

    yield info_loader.load_item()
def parse_product(self, response):
    """Fill the item carried in ``response.meta["item"]`` from a product page.

    Fields are read in two passes: first relative to the main product row,
    then relative to the tab-content container holding the accordion
    sections. Size options are collected through a nested loader.

    Args:
        response: The product page response; ``response.meta["item"]`` must
            hold the item to populate.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    summary_root = response.selector.xpath(
        '//section[@id="main"]/div[@class="row"]')
    product = ItemLoader(item=response.meta["item"], selector=summary_root)

    summary_fields = (
        ('detail_name', './/h4[@class="name_detail"]/text()'),
        ('brand', './/div[@class="product_manufacturer->name"]/text()'),
        ('description',
         './/div[@class="product-description-short-detail" and '
         '@itemprop="description"]/p/descendant-or-self::*/text() '),
    )
    for field_name, query in summary_fields:
        product.add_xpath(field_name, query)

    # The option list is addressed with an absolute path via a nested loader.
    size_options = product.nested_xpath('//select[@id="group_1"]/option')
    size_options.add_xpath('size_format', './/text()')
    # NOTE: 'price' is not collected here; its extraction was disabled in
    # the original code.

    # Everything below is read relative to the tab-content container.
    product.selector = response.selector.xpath(
        '//div[@class="tabs"]/div[@class="tab-content" and @id="tab-content"]'
    )
    # NOTE: 'nutritional_facts_img_url' is likewise not collected (disabled
    # in the original code).
    tab_fields = (
        ('detail_description',
         './/div[@class="elementor-accordion-content elementor-clearfix" and '
         '@data-section="1"]/ol/descendant-or-self::*/text()'),
        ('detail_ingredients',
         './/div[@class="elementor-accordion-content elementor-clearfix" and @data-section="2"]/p/text()'),
        ('nutritional_facts',
         './/div[@class="elementor-accordion-content elementor-clearfix" and '
         '@data-section="3"]/descendant-or-self::*/text()'),
        ('feed_guide',
         './/div[@class="elementor-accordion-content elementor-clearfix" and '
         '@data-section="4"]/p/descendant-or-self::*/text()'),
        ('feed_guide_img_url',
         './/div[@class="elementor-accordion-content elementor-clearfix" and @data-section="4"]//img/@src'),
        ('extra_information_keys',
         './/dl[@class="data-sheet"]/dt[@class="name"]/text()'),
        ('extra_information_values',
         './/dl[@class="data-sheet"]/dd[@class="value"]/text()'),
    )
    for field_name, query in tab_fields:
        product.add_xpath(field_name, query)

    self.log(f'finished parsing product page {response.url}')
    return product.load_item()
def parse_product(self, response):
    """Fill the item carried in ``response.meta["item"]`` from a product page.

    Three page regions are visited in turn by re-pointing the loader's
    selector: the product row, the accordion panels, and the review blocks.
    Price and size options are collected through a nested loader.

    Args:
        response: The product page response; ``response.meta["item"]`` must
            hold the item to populate.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    product_row = response.selector.xpath('//section[@class="row"]')
    product = ItemLoader(item=response.meta["item"], selector=product_row)

    for field_name, query in (
        ('detail_name', './/h1[@itemprop="name"]/text()'),
        ('brand', './/h5[@itemprop="brand"]/text()'),
        ('description',
         './/div[@class="col-xs-12 col-sm-12 col-md-12 col-lg-12"]/p/text()'),
    ):
        product.add_xpath(field_name, query)

    # Each <option> of the SKU selector carries a price and a size label.
    sku_options = product.nested_xpath('//select[@id="__sku"]/option')
    sku_options.add_xpath('price', './/@data-priceformat')
    sku_options.add_xpath('size_format', './/text()')

    # Accordion panels.
    product.selector = response.selector.xpath(
        '//div[@id="accordion"]/div[@class="panel panel-default"]')
    for field_name, query in (
        ('detail_description',
         './/div[@id="collapseOne"]/div/descendant-or-self::*/text()'),
        ('detail_ingredients',
         './/div[@id="collapseTwo"]/div/descendant-or-self::*/text()'),
        ('nutritional_facts',
         './/div[@id="collapseThree"]/div/descendant-or-self::*/text()'),
        ('nutritional_facts_img_url',
         './/*[@id="collapseThree"]/div/p/img/@src'),
    ):
        product.add_xpath(field_name, query)

    # Customer review blocks.
    product.selector = response.selector.xpath('//*[@id="review"]/div/div/div')
    product.add_xpath('customer_review_header',
                      './/h3[@class="panel-title"]/text()')
    # One rating string per matched review node: the text of all of its
    # <label> descendants joined together.
    rating_texts = [
        ''.join(review_node.xpath('.//label/text()').getall())
        for review_node in product.selector
    ]
    product.add_value('customer_review_rating', rating_texts)
    product.add_xpath('customer_review',
                      './/blockquote[@class="blockquote-reverse"]/p/text()')

    self.log(f'finished parsing product page {response.url}')
    return product.load_item()
def parse_product_item_meta(self, response):
    """Load ``detail_description`` into the item carried by the response.

    The description text is read from every ``div`` below the element whose
    id is ``collapseOne``. The loaded item is logged between BEGIN/END
    marker lines and then returned.

    Args:
        response: The page response; ``response.meta["item"]`` must hold the
            item to populate.

    Returns:
        The item produced by the loader's ``load_item()``.
    """
    # An XPath location path may not end in "/": the previous expression
    # '//*[@id="collapseOne"]/' was syntactically invalid and made the
    # selector call fail before any field could be loaded.
    selector = response.selector.xpath('//*[@id="collapseOne"]')
    loader = ItemLoader(item=response.meta["item"], selector=selector)
    loader.add_xpath('detail_description',
                     './/div/descendant-or-self::*/text()')

    # Load once and reuse the result for both the log line and the return.
    item = loader.load_item()
    self.log('>>>>>>>>>>>>>>>>>>> BEGIN >>>>>>>>>>>>>>>>>>>')
    self.log(item)
    self.log('>>>>>>>>>>>>>>>>>>> END >>>>>>>>>>>>>>>>>>>')
    self.log('parsing product item meta')
    return item
def parse_category(self, response):
    """Walk a category listing: follow pagination and read product tiles.

    If the page advertises a ``link[rel="next"]`` URL, a request for it is
    yielded with this same method as callback. Then, for every product tile
    on the page, a ``Product`` loader is filled with category, name, brand
    and price fields.

    NOTE(review): the visible body never calls ``l.load_item()`` nor yields
    the loader's item, so the only thing yielded here is the next-page
    request — confirm whether the tail of this method is missing.

    NOTE(review): the source arrived with its indentation collapsed; the
    nesting of the second ``if``/``else`` below is a reconstruction and
    should be checked against the original file.
    """
    next_page = response.css(
        'link[rel="next"]::attr(href)').extract_first()
    if next_page:
        # Resolve the href against the current page URL before requesting it.
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse_category)
    for item in response.css(
            'div[class^="product-grid-item grid__item xlarge-up--one-quarter one-half"]'
    ):
        l = ItemLoader(item=Product(), response=response)
        l.add_value('category', self.get_category(response))
        # Only tiles that contain a product-text block get name/brand/price.
        if text_fields := item.css('a > div[class="product-text"]'):
            l.selector = text_fields
            l.add_css('name', 'p.title *::text')
            l.add_css('brand', 'h2 *::text')
            # A <p> whose class contains "sale" provides two prices: the
            # <s> text is stored as 'price' and the itemprop="price" span
            # as 'sale_price'.
            if product_on_sale := item.css('p[class*="sale"]'):
                l.selector = product_on_sale
                l.add_css('price', 's *::text')
                l.add_css('sale_price', 'span[itemprop="price"] *::text')
            else:
                # No sale block: the price is read relative to the
                # product-text selector set above.
                l.add_css('price', 'p.price > span *::text')
def parse(self, response):
    """Turn every article node of the response into a follow-up request.

    Namespaces are stripped from the response selector first. For each node
    matched by ``xpath.ARTICLE_ITEM`` an ``ArticleItem`` loader is filled
    and handed, together with the article link, to
    ``self._create_article_request``; whatever that returns is yielded.

    Args:
        response: The response whose article nodes are read.

    Yields:
        One object per article node, as returned by
        ``self._create_article_request``.
    """
    response.selector.remove_namespaces()

    for entry in response.xpath(xpath.ARTICLE_ITEM):
        # Values that need post-processing are extracted by hand.
        link = entry.xpath(xpath.ARTICLE_LINK).extract_first()

        author = entry.xpath(xpath.ARTICLE_AUTHOR).extract_first()
        if author is None:
            # Fall back to the alternative author location only when the
            # primary one matched nothing at all.
            author = entry.xpath(
                xpath.ARTICLE_AUTHOR_ALTERNATIVE).extract_first()

        categories = " | ".join(
            entry.xpath(xpath.ARTICLE_CATEGORIES).getall())

        # The loader is created from the response and then narrowed to the
        # current article node.
        entry_loader = ItemLoader(item=ArticleItem(), response=response)
        entry_loader.selector = entry

        entry_loader.add_xpath('title', xpath.ARTICLE_TITLE)
        entry_loader.add_value('link', link)
        entry_loader.add_xpath('description', xpath.ARTICLE_DESCRIPTION)
        entry_loader.add_value('author', author)
        entry_loader.add_xpath('publication_date',
                               xpath.ARTICLE_PUBLICATION_DATE)
        entry_loader.add_value('categories', categories)
        entry_loader.add_xpath('image_url', xpath.ARTICLE_IMAGE_URL)
        entry_loader.add_xpath('credit', xpath.ARTICLE_CREDIT)
        entry_loader.add_xpath('guid', xpath.ARTICLE_GUID)

        yield self._create_article_request(link, entry_loader)