def parse_item(self, response):
    """Build a BooksItem from a fahasa.com product page.

    Loads the product name (raw and passed through ``unidecode``), the
    price, the tag-stripped description and the image URI from the page,
    then adds the information fields (url, project, spider, server, date).

    If the response URL contains a ``cache:`` marker, only the part after
    the marker is stored as ``url``; otherwise the URL is stored unchanged.

    Raises IndexError when the name or price nodes are missing from the
    page, because those values are picked by index.

    @url http://www.fahasa.com/luat-im-lang-mario-puzo.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=BooksItem(), response=response)

    # The last matching text node is used as the product name.
    name = loader.get_xpath('//*[@class="product-name"]/h1/text()')[-1]
    loader.add_value('name', name)
    loader.add_value('name_unidecode', unidecode(name))

    # The second "price" text node is used; the regex keeps only the
    # dotted number (e.g. "12.345") out of the surrounding text.
    loader.add_value(
        'price',
        loader.get_xpath('//*[@class="price"]/text()')[1].strip(),
        TakeFirst(),
        re=r'\d+\.\d+')

    # Strip markup from every description node and drop the empty results.
    stripped_nodes = [
        re.sub('<[^<]+?>', '', node)
        for node in loader.get_xpath('//*[@class="std"]')
    ]
    loader.add_value(
        'description',
        [text for text in stripped_nodes if text],
        Join('\n'))

    loader.add_xpath('image_uri', '//*[@id="image"]/@src')

    # Information fields
    # Only cut the URL when the marker is really present: str.find()
    # returns -1 on a miss, and blindly adding the marker length would
    # chop the first five characters off an ordinary URL.
    cache_marker = 'cache:'
    url = response.url
    marker_pos = url.find(cache_marker)
    if marker_pos != -1:
        url = url[marker_pos + len(cache_marker):]
    loader.add_value('url', url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())

    return loader.load_item()
def parse_item(self, response):
    """Build a BooksItem from a vinabook.com product page.

    Loads name (raw and passed through ``unidecode``), price, author,
    tag-stripped description and image URI, followed by the information
    fields (url, project, spider, server, date).

    @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=BooksItem(), response=response)

    # The last matching title text node is the product name.
    title = loader.get_xpath('//*[@itemprop="title"]/text()')[-1]
    loader.add_value('name', title)
    loader.add_value('name_unidecode', unidecode(title))

    loader.add_xpath(
        'price',
        '//*[contains(@id, "discounted_price")]/span/text()',
        TakeFirst())
    loader.add_xpath('author', '//*[@itemprop="author"]/text()')

    # Remove markup from each description paragraph, then drop empties.
    paragraphs = loader.get_xpath('//*[@class="full-description"]/p')
    plain_paragraphs = [re.sub('<[^<]+?>', '', html) for html in paragraphs]
    loader.add_value(
        'description', filter(None, plain_paragraphs), Join('\n'))

    loader.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

    # Information fields
    info_fields = (
        ('url', response.url),
        ('project', self.settings.get('BOT_NAME')),
        ('spider', self.name),
        ('server', socket.gethostname()),
        ('date', datetime.datetime.now()),
    )
    for field_name, field_value in info_fields:
        loader.add_value(field_name, field_value)

    return loader.load_item()
def parse_item(self, response):
    """Build a BooksItem from a tiki.vn product page.

    Loads name (stripped and title-cased, plus a ``unidecode`` variant),
    author, price, tag-stripped description and image URI, followed by the
    information fields (url, project, spider, server, date).

    @url https://tiki.vn/hieu-ng-canh-buom-p146105.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=BooksItem(), response=response)

    # Both name fields are fed from the same text nodes.
    name_xpath = '//*[@class="item-name"]/text()'
    loader.add_xpath(
        'name', name_xpath, MapCompose(unicode.strip, unicode.title))
    loader.add_xpath(
        'name_unidecode', name_xpath,
        MapCompose(unidecode, str.strip, str.title))

    loader.add_xpath('author', '//*[@class="item-brand"]/p/a/text()')
    loader.add_xpath(
        'price', '//*[@id="span-price"]/text()', TakeFirst(),
        re=r'\d+\.\d+')

    # Remove markup from each introduction paragraph.
    description_lines = []
    for paragraph_html in loader.get_xpath('//*[@id="gioi-thieu"]/p'):
        description_lines.append(re.sub('<[^<]+?>', '', paragraph_html))
    loader.add_value('description', description_lines, Join('\n'))

    loader.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

    # Information fields
    info_fields = (
        ('url', response.url),
        ('project', self.settings.get('BOT_NAME')),
        ('spider', self.name),
        ('server', socket.gethostname()),
        ('date', datetime.datetime.now()),
    )
    for field_name, field_value in info_fields:
        loader.add_value(field_name, field_value)

    return loader.load_item()
def parse_item(self, response):
    """Build a BooksItem from a lazada.vn product page.

    Loads name (stripped and title-cased, plus a ``unidecode`` variant),
    price, tag-stripped description and image URI, followed by the
    information fields (url, project, spider, server, date).

    @url http://www.lazada.vn/tony-buoi-sang-tren-duong-bang-1540897.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=BooksItem(), response=response)

    # Both name fields are fed from the same text nodes.
    title_xpath = '//*[@id="prod_title"]/text()'
    loader.add_xpath(
        'name', title_xpath, MapCompose(unicode.strip, unicode.title))
    loader.add_xpath(
        'name_unidecode', title_xpath,
        MapCompose(unidecode, str.strip, str.title))

    loader.add_xpath('price', '//*[@id="special_price_box"]/text()')

    # Only the first description block is used; its markup is removed and
    # the surrounding whitespace trimmed.
    description_html = loader.get_xpath(
        '//*[@class="product-description__block"]')[0]
    description_text = re.sub('<[^<]+?>', '', description_html).strip()
    loader.add_value('description', description_text)

    # The second image "content" attribute is the one stored.
    image_candidates = loader.get_xpath('//*[@itemprop="image"]/@content')
    loader.add_value('image_uri', image_candidates[1])

    # Information fields
    info_fields = (
        ('url', response.url),
        ('project', self.settings.get('BOT_NAME')),
        ('spider', self.name),
        ('server', socket.gethostname()),
        ('date', datetime.datetime.now()),
    )
    for field_name, field_value in info_fields:
        loader.add_value(field_name, field_value)

    return loader.load_item()