def parse_item(self, response):
    """Populate a ``Product`` item from ``response`` and return it.

    The category, availability text and both price texts are extracted
    first, then used to decide which values are added to the loader.

    Args:
        response: The response object the ``ItemLoader`` extracts from.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    il = ItemLoader(item=Product(), response=response)

    cat = il.get_xpath(
        '//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath(
        '//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
    price = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
    sale = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')

    # If the XPath doesn't return a category, the product belongs to the
    # Bundle category.
    il.add_value("category", cat or "Bundle")

    il.add_css(
        "title",
        "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
    il.add_value("url", response.url)

    # If a product can be added to the cart it is available online,
    # otherwise it is not.
    # NOTE(review): this is a membership test on the extracted values, so
    # one of them must equal "ADD TO CART" exactly -- confirm the link text
    # never carries surrounding whitespace.
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")

    # If there is a sale price but no regular price, the sale price is
    # stored as the regular price, as shown on the website.
    if price:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    else:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)

    return il.load_item()
def parse_product(self, response: HtmlResponse):
    """Yield a single ``LeroyItem`` loaded from ``response``.

    Name, URL, price and photos are collected directly through the
    loader. The ``options`` field is a dict pairing each stripped
    ``dt.def-list__term`` text with the stripped
    ``dd.def-list__definition`` text at the same position.

    Args:
        response: The response object the ``ItemLoader`` extracts from.

    Yields:
        The item produced by ``ItemLoader.load_item()``.
    """
    item_loader = ItemLoader(item=LeroyItem(), response=response)

    item_loader.add_css('name', 'h1.header-2::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css(
        'price',
        'uc-pdp-price-view.primary-price meta[itemprop="price"]::attr(content)',
    )
    item_loader.add_css(
        'photos', 'picture[slot="pictures"] img::attr(data-origin)'
    )

    # Strip whitespace from every extracted term / definition text.
    strip_each = MapCompose(str.strip)
    terms = item_loader.get_css('dt.def-list__term::text', strip_each)
    definitions = item_loader.get_css(
        'dd.def-list__definition::text', strip_each
    )

    # zip() pairs by position and stops at the shorter of the two lists.
    term_to_definition = dict(zip(terms, definitions))
    item_loader.add_value('options', term_to_definition)

    yield item_loader.load_item()
def parse_detail(self, response, char):
    """Fill ``char`` from ``response`` and yield the loaded item.

    Every field is populated from a CSS selector looked up in the
    ``selectors`` mapping (defined outside this block). When the primary
    name or image selector matches nothing, a fallback selector is added
    for that field.

    Args:
        response: The response object the ``ItemLoader`` extracts from.
        char: The item instance handed to the ``ItemLoader``.

    Yields:
        The item produced by ``ItemLoader.load_item()``.
    """
    # (item field, key into ``selectors``), in the order they are loaded.
    css_fields = (
        ("image", "CHARACTER_IMAGE"),
        ("name", "CHARACTER_NAME"),
        ("feature_films", "CHARACTER_FEATURE_FILMS"),
        ("short_films", "CHARACTER_SHORT_FILMS"),
        ("shows", "CHARACTER_SHOWS"),
        ("games", "CHARACTER_GAMES"),
        ("rides", "CHARACTER_RIDES"),
        ("animator", "CHARACTER_ANIMATOR"),
        ("designer", "CHARACTER_DESIGNER"),
        ("voice", "CHARACTER_VOICE"),
        ("portrayed_by", "CHARACTER_PORTRAYED_BY"),
        ("performance_model", "CHARACTER_PERFORMANCE_MODEL"),
        ("inspiration", "CHARACTER_INSPIRATION"),
        ("awards", "CHARACTER_AWARDS"),
        ("fullname", "CHARACTER_FULLNAME"),
        ("other_names", "CHARACTER_OTHER_NAMES"),
        ("occupation", "CHARACTER_OCCUPATION"),
        ("affiliations", "CHARACTER_AFFILIATIONS"),
        ("home", "CHARACTER_HOME"),
        ("likes", "CHARACTER_LIKES"),
        ("dislikes", "CHARACTER_DISLIKES"),
        ("powers", "CHARACTER_POWERS"),
        ("paraphernalia", "CHARACTER_PARAPHERNALIA"),
        ("status", "CHARACTER_STATUS"),
        ("parents", "CHARACTER_PARENTS"),
        ("siblings", "CHARACTER_SIBLINGS"),
        ("family", "CHARACTER_FAMILY"),
        ("partner", "CHARACTER_PARTNER"),
        ("children", "CHARACTER_CHILDREN"),
        ("pets", "CHARACTER_PETS"),
    )

    detail_loader = ItemLoader(item=char, response=response)
    detail_loader.add_value("url", response.url)
    for field_name, selector_key in css_fields:
        detail_loader.add_css(field_name, selectors[selector_key])

    # Fall back to the page header title when the name selector is empty.
    if not detail_loader.get_css(selectors["CHARACTER_NAME"]):
        detail_loader.add_css("name", selectors["PAGE_HEADER_TITLE"])

    # Fall back to the thumbnail when the main image selector is empty.
    if not detail_loader.get_css(selectors["CHARACTER_IMAGE"]):
        detail_loader.add_css("image", selectors["CHARACTER_THUMB_IMAGE"])

    collected_names = detail_loader.get_collected_values("name")
    logging.info("Crawl %s" % collected_names)

    yield detail_loader.load_item()
def parse_item(self, response):
    """Populate a ``Product`` item from ``response`` and return it.

    The category, availability text and both price texts are extracted
    first, then used to decide which values are added to the loader.

    Args:
        response: The response object the ``ItemLoader`` extracts from.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    il = ItemLoader(item=Product(), response=response)

    cat = il.get_xpath(
        '//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath(
        '//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()'
    )
    price = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text'
    )
    sale = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text'
    )

    # If the XPath doesn't return a category, the product belongs to the
    # Bundle category.
    il.add_value("category", cat or "Bundle")

    il.add_css(
        "title",
        "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text"
    )
    il.add_value("url", response.url)

    # If a product can be added to the cart it is available online,
    # otherwise it is not.
    # NOTE(review): this is a membership test on the extracted values, so
    # one of them must equal "ADD TO CART" exactly -- confirm the link text
    # never carries surrounding whitespace.
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")

    # If there is a sale price but no regular price, the sale price is
    # stored as the regular price, as shown on the website.
    if price:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    else:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)

    return il.load_item()
def parse_image_url(self, response):
    """Add ``image_url`` to the item carried in ``response.meta``.

    The first ``<img>`` matched by the CSS path is re-parsed on its own so
    that its ``src`` attribute can be read with a relative XPath. When no
    ``<img>`` matches, the item is returned unchanged instead of raising
    ``IndexError``, so the data already collected on it is not lost.

    Args:
        response: The response object; ``response.meta['current_item']``
            must hold the item to complete.

    Returns:
        The item, with ``image_url`` added when an image was found.
    """
    import logging  # local import keeps this block self-contained

    sign = response.meta['current_item']

    page_loader = ItemLoader(response=response)
    img_tags = page_loader.get_css(
        'div.main > section.section > div.container > div > div > div > img'
    )
    if not img_tags:
        logging.getLogger(__name__).warning(
            "No image element found on %s", response.url)
        return sign

    # Parsing the serialized tag as XML makes '@src' resolve against the
    # <img> element itself.
    img_selector = Selector(text=img_tags[0], type="xml")
    link_loader = ItemLoader(item=sign, selector=img_selector)
    link_loader.add_xpath('image_url', '@src')
    return link_loader.load_item()
def parse(self, response):
    """Yield one detail-page request per sign link found in ``response``.

    Each matched ``<a>`` is re-parsed on its own and loaded into a ``Sign``
    item (category, detail URL, meaning, miniature URL). The item travels
    to ``parse_image_url`` through the request's ``meta``.

    Anchors that yield no ``detail_url`` are skipped rather than raising
    ``KeyError``, which would end this generator and drop every remaining
    link. The URL is resolved against the response so that relative
    ``href`` values produce a valid request.

    Args:
        response: The response object;
            ``response.meta['current_category']`` must hold the category
            assigned to every item.

    Yields:
        ``scrapy.Request`` objects for the detail pages.
    """
    category = response.meta['current_category']

    page_loader = ItemLoader(response=response)
    anchors = page_loader.get_css(
        'div.main > section.section > div.container > div > div > div > a')

    for anchor_html in anchors:
        # Parsing the serialized tag as XML makes '@href' / '@title'
        # resolve against the <a> element itself.
        anchor = Selector(text=anchor_html, type="xml")
        sign_loader = ItemLoader(item=Sign(), selector=anchor)
        sign_loader.add_value('category', category)
        sign_loader.add_xpath('detail_url', '@href')
        sign_loader.add_xpath('meaning', '@title')
        sign_loader.add_xpath('miniature_url', 'img/@src')
        sign = sign_loader.load_item()

        detail_url = sign.get('detail_url')
        if not detail_url:
            # Nothing to follow for an anchor without an href.
            continue

        # NOTE(review): assumes the Sign field yields a single string
        # (e.g. a TakeFirst output processor) -- confirm.
        yield scrapy.Request(url=response.urljoin(detail_url),
                             callback=self.parse_image_url,
                             meta={'current_item': sign})
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') __________________________________________________________ 'add_xpath(field_name, xpath, *processors, **kwargs)' - Giống add_value, nhưng nó nhận 1 biểu thức Xpath thay cho 1 giá trị. Biểu thức Xpath này dùng để trích xuất ra 1 list các chuỗi unicode - Vd: # HTML snippet: <p class="product-name">Color TV</p> loader.add_xpath('name', '//p[@class="product-name"]') # HTML snippet: <p id="price">the price is $1200</p> loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') __________________________________________________________ 'replace_xpath(field_name, xpath, *processors, **kwargs)' - Giống add_xpath(), nhưng nó thay thế dữ liệu cũ thay vì thêm dữ liệu mới vào nó. __________________________________________________________ 'get_css(css, *processors, **kwargs)' - Tương tự get_xpath, nhưng nó nhận 1 CSS selector thay cho biểu thức Xpath, và cũng trích xuất ra 1 list các chuỗi unicode - Vd: # HTML snippet: <p class="product-name">Color TV</p> loader.get_css('p.product-name') # HTML snippet: <p id="price">the price is $1200</p> loader.get_css('p#price', TakeFirst(), re='the price is (.*)') __________________________________________________________ 'add_css(field_name, css, *processors, **kwargs)' - Giống add_value, nhưng nó nhận 1 CSS selector thay cho 1 giá trị. CSS selector này dùng để trích xuất ra 1 list các chuỗi unicode - Vd: # HTML snippet: <p class="product-name">Color TV</p> loader.add_css('name', 'p.product-name') # HTML snippet: <p id="price">the price is $1200</p> loader.add_css('price', 'p#price', re='the price is (.*)') __________________________________________________________ 'replace_css(field_name, css, *processors, **kwargs)' - Giống add_css(), nhưng nó thay thế dữ liệu cũ thay vì thêm dữ liệu mới vào nó. __________________________________________________________