def parse_item(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url='{}/cms/'.format(self._link),
        timezone=self._timezone,
        remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr', 'h7'],
        remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                            '//div[@class="news-single-item"]/br[1]'],
    )
    il.add_value(
        'title',
        response.xpath('//head/title/text()').re_first(r'::: (.*)'))
    il.add_value('link', response.url)
    il.add_value(
        'updated',
        response.xpath('//div[@class="news-single-rightbox"]').
        re_first(r'(\d{2}\.\d{2}\.\d{4})'))
    il.add_value(
        'author_name',
        response.xpath('//head/meta[@name="publisher"]/@content').
        re_first('recht.at, (.*);'))
    il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
    il.add_value('author_name', self.name)
    il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')
    il.add_css('content_html', '.news-single-item h7 font strong')
    il.add_css('content_html', '.news-single-item')
    yield il.load_item()

def _parse_episode(self, response): il = FeedEntryItemLoader( response=response, base_url="https://{}".format(self.name), timezone="Europe/Vienna", dayfirst=True, ) il.add_value("link", response.url) il.add_xpath( "title", '//meta[@name="title"]/@content', re=r"(?s)(.*?)(?: vom .*)? - puls4\.com", ) il.add_value( "updated", "{} {}".format( response.xpath('//meta[@name="title"]/@content').re_first( r".*vom (\d{2}\.\d{2}\.\d{4}).*" ), response.meta["time"] or "00:00", ), ) il.add_value( "content_html", '<img src="{}">'.format( response.xpath('//meta[@property="og:image"]/@content').extract_first() ), ) il.add_css("content_html", ".player-video-description-intro::text") return il.load_item()
def _parse_episode(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url=f"https://{self.name}",
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "title",
        '//meta[@name="title"]/@content',
        re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
    )
    il.add_value(
        "updated",
        "{} {}".format(
            response.xpath('//meta[@name="title"]/@content').re_first(
                r".*vom (\d{2}\.\d{2}\.\d{4}).*"),
            response.meta["time"] or "00:00",
        ),
    )
    il.add_value(
        "content_html",
        '<img src="{}">'.format(
            response.xpath(
                '//meta[@property="og:image"]/@content').extract_first()),
    )
    il.add_css("content_html", ".player-video-description-intro::text")
    return il.load_item()

def parse_release_changelog(self, response):
    il = FeedEntryItemLoader(
        response=response, parent=response.meta["il"], base_url=self._base_url
    )
    il.add_value("content_html", "<h1>Detailed Changelog</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return il.load_item()

def parse_release_changelog(self, response):
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta["il"],
                             base_url=self._base_url)
    il.add_value("content_html", "<h1>Detailed Changelog</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return il.load_item()

def parse_release_changelog(self, response):
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta['il'],
        base_url=self._base_url,
    )
    il.add_value('content_html', '<h1>Detailed Changelog</h1>')
    il.add_xpath('content_html', '//h1/following-sibling::*')
    yield il.load_item()

def _parse_news(self, response):
    il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
    il.add_xpath(
        "content_html",
        '//div[@class="newsheader" and .//a[@id="{}"]]'
        '/following-sibling::div[@class="newsinner"]'.format(
            response.meta["news_id"]
        ),
    )
    return il.load_item()

def parse_item(self, response): il = FeedEntryItemLoader( selector=response.xpath('//div[@class="main"]'), timezone="Europe/Vienna" ) il.add_xpath("title", "h1/text()") il.add_value("link", response.url) il.add_xpath("content_html", "h1/following-sibling::*") il.add_value("updated", response.url.rstrip("/").split("/")[-1].split("_")[0]) il.add_value("author_name", self.name) return il.load_item()
def parse_item(self, response):
    remove_elems = ['h1', '.delayed-image-load']
    change_tags = {'noscript': 'div'}
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             remove_elems=remove_elems,
                             change_tags=change_tags,
                             base_url='http://{}'.format(self.name))
    il.add_xpath('content_html', '//div[@id="main-inner"]')
    yield il.load_item()

def parse_item(self, response): il = FeedEntryItemLoader( selector=response.xpath('//div[@class="main"]'), timezone="Europe/Vienna") il.add_xpath("title", "h1/text()") il.add_value("link", response.url) il.add_xpath("content_html", "h1/following-sibling::*") il.add_value("updated", response.url.rstrip("/").split("/")[-1].split("_")[0]) il.add_value("author_name", self.name) return il.load_item()
def parse_node(self, response, node): url = node.xpath("rss:loc/text()").extract_first() il = FeedEntryItemLoader(selector=node) il.add_value("link", url) il.add_xpath("title", "news:news/news:title/text()") keywords = node.xpath("news:news/news:keywords/text()").extract_first() if keywords: il.add_value("category", keywords.split(", ")) il.add_xpath("updated", "news:news/news:publication_date/text()") return scrapy.Request( url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]} )
def parse_node(self, response, node): il = FeedEntryItemLoader(selector=node) url = node.xpath("link/text()").extract_first() il.add_value("link", url) il.add_xpath("updated", "pubDate/text()") il.add_xpath( "title", "title/text()", # Use re.DOTALL since some titles have newlines in them. re=re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL), ) return scrapy.Request(url, self._parse_article, meta={"il": il})
def parse_item(self, response): remove_elems = ["h1", ".delayed-image-load"] change_tags = {"noscript": "div"} il = FeedEntryItemLoader( response=response, parent=response.meta["il"], remove_elems=remove_elems, change_tags=change_tags, base_url="https://www.{}".format(self.name), ) il.add_xpath("content_html", '//div[@id="main-inner"]') return il.load_item()
def _parse_article(self, response): remove_elems = [ ".FeatureByline", ".GAByline", ".Form", "form", ".MakeALink", "br", ] change_tags = {"div.BigQuote": "blockquote"} il = FeedEntryItemLoader( response=response, parent=response.meta["il"], remove_elems=remove_elems, change_tags=change_tags, base_url=f"https://{self.name}", ) text = response.css(".ArticleText").extract_first() # Remove 'Log in to post comments'. text = re.sub(r'<hr width="60%" align="left">.*to post comments\)', "", text, flags=re.S) il.add_css("title", "h1::text") il.add_value("content_html", text) il.add_css("author_name", ".FeatureByline b ::text") il.add_css("author_name", ".GAByline a ::text") il.add_css( "author_name", ".GAByline p ::text", re="This article was contributed by (.*)", ) il.add_xpath( "updated", '//div[@class="FeatureByline"]/text()[preceding-sibling::br]', TakeFirst(), ) il.add_xpath("updated", '//div[@class="GAByline"]/p[1]/text()') # Last resort if date cannot be extracted and it's a weekly edition. if "updated" in response.meta: il.add_value("updated", response.meta["updated"]) if response.css(".MakeALink"): # Get subscriber link for paywalled content. return scrapy.FormRequest.from_response( response, formcss=".MakeALink form", callback=self._subscriber_link, meta={"il": il}, ) else: il.add_value("link", response.url) return il.load_item()
def parse_node(self, response, node): url = node.xpath("rss:loc/text()").extract_first() il = FeedEntryItemLoader(selector=node) il.add_value("link", url) il.add_xpath("title", "news:news/news:title/text()") keywords = node.xpath("news:news/news:keywords/text()").extract_first() if keywords: il.add_value("category", keywords.split(", ")) il.add_xpath("updated", "news:news/news:publication_date/text()") return scrapy.Request(url, self.parse_item, meta={ "il": il, "handle_httpstatus_list": [404] })
def parse_content(self, response): parts = self._extract_parts(response) il = FeedEntryItemLoader( response=response, timezone="Europe/Vienna", dayfirst=True ) il.add_value("path", self._library) il.add_value("title", " - ".join(parts[: self._find_first_meta(parts)])) il.add_value("link", response.url) il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)") _content = ["<ul>"] for part in parts: _content.append("<li>{}</li>".format(part)) _content.append("</ul>") il.add_value("content_html", "".join(_content)) return il.load_item()
def parse_node(self, response, node):
    # Reuse most of the existing fields.
    il = FeedEntryItemLoader(selector=node, base_url=self.feed_link)
    il.add_xpath("title", "atom:title/text()")
    il.add_xpath("link", "atom:link/@href")
    il.add_xpath("author_name", "atom:author/atom:name/text()")
    il.add_xpath("author_email", "atom:author/atom:email/text()")
    il.add_xpath("updated", "atom:updated/text()")
    # All news items are stored on a single page and may be referred to via
    # an ID. Extract an item's id and use it to subsequently extract the
    # corresponding news text.
    url, news_id = node.xpath("atom:link/@href").extract_first().split("#")
    return scrapy.Request(
        url, self._parse_news, dont_filter=True, meta={"news_id": news_id, "il": il}
    )

def parse_node(self, response, node):
    il = FeedEntryItemLoader(response=response,
                             base_url='http://{}'.format(self.name),
                             dayfirst=True)
    il.add_xpath('updated', '//pubDate/text()')
    il.add_value('author_name',
                 node.xpath('//dc:creator/text()').extract_first())
    il.add_xpath('category', '//category/text()')
    title = node.xpath('(//title)[2]/text()').extract()
    if not title:
        # Fallback to the first category if no title is provided
        # (e.g. comic).
        title = response.xpath('//category/text()').extract_first()
    il.add_value('title', title)
    link = node.xpath('(//link)[2]/text()').extract_first()
    il.add_value('link', link)
    return scrapy.Request(link, self._parse_article, meta={'il': il})

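# The parse_node callbacks in this listing follow Scrapy's XMLFeedSpider
# protocol and address nodes via namespace prefixes such as rss:, news:,
# atom: and dc:. The following is only a minimal sketch of the surrounding
# spider class, assuming a plain XMLFeedSpider and a made-up sitemap URL;
# the real spiders may well derive from a project-specific base class.
from scrapy.spiders import XMLFeedSpider


class ExampleSitemapSpider(XMLFeedSpider):
    name = "example.org"  # hypothetical name
    start_urls = ["https://example.org/sitemap-news.xml"]  # hypothetical URL
    # The "xml" iterator keeps namespace information on the selector.
    iterator = "xml"
    itertag = "rss:url"
    namespaces = [
        ("rss", "http://www.sitemaps.org/schemas/sitemap/0.9"),
        ("news", "http://www.google.com/schemas/sitemap-news/0.9"),
    ]

    def parse_node(self, response, node):
        # One of the parse_node implementations shown above would go here.
        ...
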
def _parse_item(self, response): remove_elems = [ "h1", ".nono", ".acceptance_org", ".state", "script", ".gentics-portletreload-position-notvisibleposition", ] remove_elems_xpath = [ """ //div[ @class='advice' and child::div[@class='advice_text' and ( contains(., 'nicht die aktuelle Rechtslage') or contains(., 'wird nicht laufend aktualisiert') or contains(., 'Übersicht über bisherige "Themen des Monats"') )] ] """, # Remove table of contents. "//li[child::a[starts-with(@href, '#')]]", "//ul[not(li)]", ] change_tags = {"abbr": "span"} il = FeedEntryItemLoader( response=response, timezone="Europe/Vienna", base_url="https://www.{}".format(self.name), remove_elems=remove_elems, remove_elems_xpath=remove_elems_xpath, change_tags=change_tags, dayfirst=True, ) il.add_value("link", response.url) il.add_xpath( "author_name", '//div[@class="acceptance_org"]/text()[preceding-sibling::br]', ) il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)") il.add_value( "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})") ) il.add_css("content_html", ".Content") return il.load_item()
def _parse_episode(self, response):
    il = FeedEntryItemLoader(response=response,
                             base_url='http://{}'.format(self.name),
                             timezone=self._timezone,
                             dayfirst=True)
    il.add_value('link', response.url)
    il.add_xpath('title', '//meta[@name="title"]/@content',
                 re=r'(?s)(.*?)(?: vom .*)? - puls4\.com')
    il.add_value('updated', '{} {}'.format(
        response.xpath('//meta[@name="title"]/@content').
        re_first(r'.*vom (\d{2}\.\d{2}\.\d{4}).*'),
        response.meta['time'] or '00:00'))
    il.add_value('content_html', '<img src="{}">'.format(
        response.xpath('//meta[@property="og:image"]/@content').
        extract_first()))
    il.add_css('content_html', '.player-video-description-intro::text')
    yield il.load_item()

def parse_content(self, response): parts = self._extract_parts(response) il = FeedEntryItemLoader(response=response, timezone="Europe/Vienna", dayfirst=True) il.add_value("path", self._library) il.add_value("title", " - ".join(parts[:self._find_first_meta(parts)])) il.add_value("link", response.url) il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)") _content = ["<ul>"] for part in parts: _content.append(f"<li>{part}</li>") _content.append("</ul>") il.add_value("content_html", "".join(_content)) return il.load_item()
def parse_content(self, response):
    parts = self._extract_parts(response)
    il = FeedEntryItemLoader(response=response,
                             timezone='Europe/Vienna',
                             dayfirst=True)
    il.add_value('path', self._library)
    il.add_value('title', ' - '.join(parts[:self._find_first_meta(parts)]))
    il.add_value('link', response.url)
    il.add_xpath('updated', '//td/span/text()',
                 re='In der Bibliothek seit: (.*)')
    _content = ['<ul>']
    for part in parts:
        _content.append('<li>{}</li>'.format(part))
    _content.append('</ul>')
    il.add_value('content_html', ''.join(_content))
    yield il.load_item()

def _parse_item(self, response): remove_elems = [ "h1", ".nono", ".acceptance_org", ".state", "script", ".gentics-portletreload-position-notvisibleposition", ] remove_elems_xpath = [ """ //div[ @class='advice' and child::div[@class='advice_text' and ( contains(., 'nicht die aktuelle Rechtslage') or contains(., 'wird nicht laufend aktualisiert') or contains(., 'Übersicht über bisherige "Themen des Monats"') )] ] """, # Remove table of contents. "//li[child::a[starts-with(@href, '#')]]", "//ul[not(li)]", ] change_tags = {"abbr": "span"} il = FeedEntryItemLoader( response=response, timezone="Europe/Vienna", base_url="https://www.{}".format(self.name), remove_elems=remove_elems, remove_elems_xpath=remove_elems_xpath, change_tags=change_tags, dayfirst=True, ) il.add_value("link", response.url) il.add_xpath( "author_name", '//div[@class="acceptance_org"]/text()[preceding-sibling::br]', ) il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)") il.add_value("updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")) il.add_css("content_html", ".Content") return il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( response=response, base_url="{}/cms/".format(self.feed_link), timezone="Europe/Vienna", remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"], remove_elems_xpath=[ '//div[@class="news-single-item"]/b[1]', '//div[@class="news-single-item"]/br[1]', ], dayfirst=True, ) il.add_value( "title", response.xpath("//head/title/text()").re_first(r"::: (.*)") ) il.add_value("link", response.url) il.add_value( "updated", response.xpath('//div[@class="news-single-rightbox"]').re_first( r"(\d{2}\.\d{2}\.\d{4})" ), ) il.add_value( "author_name", response.xpath('//head/meta[@name="publisher"]/@content').re_first( "recht.at, (.*);" ), ) il.add_xpath("author_name", '//head/meta[@name="author"]/@content') il.add_value("author_name", self.name) il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content') il.add_css("content_html", ".news-single-item h7 font strong") il.add_css("content_html", ".news-single-item") return il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( response=response, base_url="{}/cms/".format(self.feed_link), timezone="Europe/Vienna", remove_elems=[ ".news-latest-date", ".news-single-rightbox", "hr", "h7" ], remove_elems_xpath=[ '//div[@class="news-single-item"]/b[1]', '//div[@class="news-single-item"]/br[1]', ], dayfirst=True, ) il.add_value( "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")) il.add_value("link", response.url) il.add_value( "updated", response.xpath('//div[@class="news-single-rightbox"]').re_first( r"(\d{2}\.\d{2}\.\d{4})"), ) il.add_value( "author_name", response.xpath('//head/meta[@name="publisher"]/@content').re_first( "recht.at, (.*);"), ) il.add_xpath("author_name", '//head/meta[@name="author"]/@content') il.add_value("author_name", self.name) il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content') il.add_css("content_html", ".news-single-item h7 font strong") il.add_css("content_html", ".news-single-item") return il.load_item()
def parse_program(self, response): if not response.css(r".jsb_video\/FlashPlayer"): return data = json.loads( response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0] ) data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"] il = FeedEntryItemLoader( response=response, base_url="https://{}".format(self.name), timezone="Europe/Vienna", dayfirst=True, ) il.add_value("link", data["clipurl"]) il.add_value("title", data["programname"]) il.add_value("updated", data["airdate"]) il.add_xpath("content_html", '//p[@class="plot_summary"]') item = il.load_item() # Only include videos posted in the last 7 days. if item["updated"] + self._timerange > datetime.now(timezone.utc): return item
def _parse_item(self, response):
    remove_elems = [
        'h1', '.nono', '.acceptance_org', '.state', 'script',
        '.gentics-portletreload-position-notvisibleposition'
    ]
    remove_elems_xpath = [
        """
        //div[
            @class='advice' and
            child::div[@class='advice_text' and (
                contains(., 'nicht die aktuelle Rechtslage') or
                contains(., 'wird nicht laufend aktualisiert') or
                contains(., 'Übersicht über bisherige "Themen des Monats"')
            )]
        ]
        """,
        # Remove table of contents.
        "//li[child::a[starts-with(@href, '#')]]",
        "//ul[not(li)]",
    ]
    change_tags = {
        'abbr': 'span',
    }
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='https://www.{}'.format(self.name),
                             remove_elems=remove_elems,
                             remove_elems_xpath=remove_elems_xpath,
                             change_tags=change_tags,
                             dayfirst=True)
    il.add_value('link', response.url)
    il.add_xpath(
        'author_name',
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    il.add_css('title', 'title::text', re=r'HELP.gv.at:\s*(.*)')
    il.add_value('updated',
                 response.css('.state').re_first(r'(\d{2}\.\d{2}\.\d{4})'))
    il.add_css('content_html', '.Content')
    yield il.load_item()

def parse_program(self, response): if not response.css(r".jsb_video\/FlashPlayer"): return data = json.loads( response.css(r".jsb_video\/FlashPlayer").xpath( "@data-jsb").extract()[0]) data = data["config"]["initial_video"]["parts"][0]["tracking"][ "nurago"] il = FeedEntryItemLoader( response=response, base_url=f"https://{self.name}", timezone="Europe/Vienna", dayfirst=True, ) il.add_value("link", data["clipurl"]) il.add_value("title", data["programname"]) il.add_value("updated", data["airdate"]) il.add_xpath("content_html", '//p[@class="plot_summary"]') item = il.load_item() # Only include videos posted in the last 7 days. if item["updated"] + self._timerange > datetime.now(timezone.utc): return item
def parse_program(self, response):
    if not response.css(r'.jsb_video\/FlashPlayer'):
        return
    data = json.loads(
        response.css(r'.jsb_video\/FlashPlayer').xpath(
            '@data-jsb').extract()[0])
    data = (
        data['config']['initial_video']['parts'][0]['tracking']['nurago']
    )
    il = FeedEntryItemLoader(response=response,
                             base_url='http://{}'.format(self.name),
                             timezone=self._timezone,
                             dayfirst=True)
    il.add_value('link', data['clipurl'])
    il.add_value('title', data['programname'])
    il.add_value('updated', data['airdate'])
    il.add_xpath('content_html', '//p[@class="plot_summary"]')
    item = il.load_item()
    # Only include videos posted in the last 7 days.
    if (item['updated'] + self._timerange >
            delorean.utcnow().shift(self._timezone)):
        yield item

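# The parse_program variants above only emit items whose "updated" timestamp
# is at most self._timerange old. A minimal sketch of the assumed setup,
# using a hypothetical class name: _timerange is a datetime.timedelta and the
# loader yields timezone-aware datetimes, so comparing against
# datetime.now(timezone.utc) is well defined.
from datetime import datetime, timedelta, timezone


class ExampleProgramSpider:  # hypothetical host class for the snippets above
    _timerange = timedelta(days=7)

    def _is_recent(self, updated):
        # True if the episode aired within the last 7 days.
        return updated + self._timerange > datetime.now(timezone.utc)
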
def _parse_article(self, response):
    remove_elems = [
        '.FeatureByline', '.GAByline', '.Form', 'form', '.MakeALink', 'br'
    ]
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             remove_elems=remove_elems,
                             base_url='https://{}'.format(self.name),
                             dayfirst=True)
    text = response.css('.ArticleText').extract_first()
    # Remove 'Log in to post comments'.
    text = re.sub(r'<hr width="60%" align="left">.*to post comments\)',
                  '', text, flags=re.S)
    il.add_css('title', 'h1::text')
    il.add_value('content_html', text)
    il.add_css('author_name', '.FeatureByline b ::text')
    il.add_xpath(
        'updated',
        '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
        TakeFirst())
    il.add_xpath('updated', '//div[@class="GAByline"]/p[1]/text()')
    # Last resort if date cannot be extracted and it's a weekly edition.
    if 'updated' in response.meta:
        il.add_value('updated', response.meta['updated'])
    if response.css('.MakeALink'):
        # Get subscriber link for paywalled content.
        yield scrapy.FormRequest.from_response(
            response,
            formcss='.MakeALink form',
            callback=self._subscriber_link,
            meta={'il': il})
    else:
        il.add_value('link', response.url)
        yield il.load_item()

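# Both _parse_article variants hand the partially filled loader to a
# _subscriber_link callback when they hit paywalled content. That callback is
# not part of this listing; the following is only a sketch, assuming it merely
# records the tokenized subscriber URL that the form submission redirects to
# and then emits the finished item.
def _subscriber_link(self, response):
    il = response.meta["il"]
    il.add_value("link", response.url)
    return il.load_item()
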
def parse_release_notes(self, response): il = FeedEntryItemLoader( response=response, timezone="Europe/Berlin", base_url=self._base_url ) il.add_xpath("title", "//h1/text()") il.add_value("link", response.url) il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by") il.add_value("content_html", "<h1>Release Notes</h1>") il.add_xpath("content_html", "//h1/following-sibling::*") return scrapy.Request( response.url.replace("notes-", "changelog-"), self.parse_release_changelog, meta={"il": il}, )
def parse_release_notes(self, response): il = FeedEntryItemLoader( response=response, timezone="Europe/Berlin", base_url=self.feed_link, remove_elems=[".cookielaw-banner"], ) il.add_xpath("title", "//h1/text()") il.add_value("link", response.url) il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by") il.add_value("content_html", "<h1>Release Notes</h1>") il.add_xpath("content_html", "//h1/following-sibling::*") return il.load_item()
def parse_release_notes(self, response):
    il = FeedEntryItemLoader(
        response=response,
        timezone=self._timezone,
        base_url=self._base_url,
    )
    il.add_xpath('title', '//h1/text()')
    il.add_value('link', response.url)
    il.add_xpath('updated', '//div[@class="docInfo"]',
                 re='Last modified: (.*) by')
    il.add_value('content_html', '<h1>Release Notes</h1>')
    il.add_xpath('content_html', '//h1/following-sibling::*')
    yield scrapy.Request(response.url.replace('notes-', 'changelog-'),
                         self.parse_release_changelog,
                         meta={'il': il})

def parse_release_notes(self, response): il = FeedEntryItemLoader(response=response, timezone="Europe/Berlin", base_url=self._base_url) il.add_xpath("title", "//h1/text()") il.add_value("link", response.url) il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by") il.add_value("content_html", "<h1>Release Notes</h1>") il.add_xpath("content_html", "//h1/following-sibling::*") return scrapy.Request( response.url.replace("notes-", "changelog-"), self.parse_release_changelog, meta={"il": il}, )
def parse_item(self, response): il = FeedEntryItemLoader( selector=response.xpath('//div[@id="maincontentbook"]'), base_url=self.feed_link, ) il.add_xpath("title", '//h1[@class="p_book_title"]/text()') il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()') il.add_value("link", response.url) il.add_value("author_name", self.feed_title) il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()') il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()') il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()') il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()') il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()') il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()') il.add_xpath("content_html", '//div[@class="bookcontent"]//text()') il.add_xpath("content_html", '//div[@class="p_book_image"]/img') il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()') return il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( selector=response.xpath('//div[@id="maincontentbook"]')) il.add_xpath('title', '//h1[@class="p_book_title"]/text()') il.add_xpath('title', '//h3[@class="p_book_title_ebook"]/text()') il.add_value('link', response.url) il.add_value('author_name', self._title) il.add_xpath('content_html', '//h1[@class="p_book_title"]/text()') il.add_xpath('content_html', '//h2[@class="p_book_author"]/text()') il.add_xpath('content_html', '//p[@class="p_book_publisher"]/text()') il.add_xpath('content_html', '//p[@class="p_book_isbn"]/text()') il.add_xpath('content_html', '(//span[@class="txt10px"])[1]/text()') il.add_xpath('content_html', '(//span[@class="txt10px"])[3]/text()') il.add_xpath('content_html', '//div[@class="bookcontent"]//text()') il.add_xpath('content_html', '//div[@class="p_book_image"]/img') il.add_xpath('content_html', '//span[@style="color:red;"]/b/text()') # NOTE: The page does not provide any usable timestamp so we convert # the bok_id parameter to unix epoch. bok_id = w3lib.url.url_query_parameter(response.url, 'bok_id', '0') timestamp = datetime.datetime.utcfromtimestamp(int(bok_id)) il.add_value('updated', timestamp.isoformat()) yield il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( selector=response.xpath('//div[@id="maincontentbook"]'), base_url=self.feed_link, ) il.add_xpath("title", '//h1[@class="p_book_title"]/text()') il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()') il.add_value("link", response.url) il.add_value("author_name", self.feed_title) il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()') il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()') il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()') il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()') il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()') il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()') il.add_xpath("content_html", '//div[@class="bookcontent"]//text()') il.add_xpath("content_html", '//div[@class="p_book_image"]/img') il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()') return il.load_item()