def parse(self, response):
    html = hext.Html(response.text)
    quotes = self.rule_quotes.extract(html)
    yield from quotes
    next_page_url = self.rule_next_page_url.extract(html)
    url = next(iter(next_page_url), None)
    if url is not None:
        yield scrapy.Request(response.urljoin(url["next_page_url"]))
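# The parse() callback above relies on two hext rules being defined on the spider.
# A minimal sketch of such a spider, assuming the quotes.toscrape.com page layout;
# the rule strings and class/attribute values here are illustrative, not taken from
# the original project.
import hext
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    # Yields one dict per quote, e.g. {'text': ..., 'author': ...}
    rule_quotes = hext.Rule(
        '<div class="quote">'
        '<span class="text" @text:text />'
        '<small class="author" @text:author />'
        '</div>')
    # Yields at most one dict with the key used by parse(): {'next_page_url': ...}
    rule_next_page_url = hext.Rule('<li class="next"><a href:next_page_url /></li>')

    # parse() is the method shown above.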
def test_rule_extracts_list_of_dict():
    html = hext.Html("<html><body>💩</body></html>")
    rule = hext.Rule("<body @text:body />")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 1
    assert type(result[0]) is dict
    assert "body" in result[0]
    assert type(result[0]["body"]) is str
    assert result[0]["body"] == "💩"
def test_no_side_effect():
    rule = hext.Rule("<div id:id><a href:x /></div>")
    html = hext.Html("""
        <div id="div">
          <a href="one.html">
            <img src="one.jpg" />
          </a>
          <a href="two.html">
            <img src="two.jpg" />
          </a>
          <a href="three.html"><img src="three.jpg" /></a>
        </div>""")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 1
    assert len(result[0]) == 2
    assert len(result[0]['x']) == 3
def test_rule_extracts_empty_list():
    html = hext.Html("<html><body>💩</body></html>")
    rule = hext.Rule("<a @text:nope />")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 0
def test_rule_extract_throws_on_max_search_error():
    html = hext.Html("""
        <html><body><div><span><span></span></span></div></body></html>""")
    rule = hext.Rule("<*>{<*>{<*/>}</*>}</*>")
    with pytest.raises(RuntimeError):
        rule.extract(html, 1)
def test_html_accepts_unicode_str():
    html = hext.Html(u"<html><body>💩</body></html>")
if __name__ == "__main__":
    docopt_args = docopt(__doc__)

    option = None
    if "build-template" in docopt_args.keys():
        option = "build-template"
        docopt_args.pop("build-template")
    elif "extract" in docopt_args.keys():
        option = "extract"
        docopt_args.pop("extract")

    # strip the -- and convert - to _, remove <>
    args = {}
    for key in docopt_args:
        args[key[2:].replace('<', '').replace('>', '').replace('-', '_')] = docopt_args[key]

    if option == "build-template":
        ...
    elif option == "extract":
        # TODO: walk directory, feed files to below:
        rule = hext.Rule(strhext)
        document = hext.Html(strhtml)
        result = rule.extract(document)
import html

import hext
import requests

from ircrssfeedbot import config
from ircrssfeedbot.feed import ensure_list
from ircrssfeedbot.util.urllib import url_to_netloc

# Customize:
URL = 'https://ergo-log.com'
HEXT = '<p id="kopindex"><a href:prepend("https://ergo-log.com/"):link @text:title @text=~".+"/></p>'

user_agent = config.USER_AGENT_OVERRIDES.get(url_to_netloc(URL), config.USER_AGENT_DEFAULT)
content = requests.Session().get(URL, timeout=config.REQUEST_TIMEOUT,
                                 headers={'User-Agent': user_agent}).content
entries = hext.Rule(HEXT).extract(hext.Html(content.decode()))

for index, entry in enumerate(entries):
    title, link = html.unescape(entry['title'].strip()), entry['link'].strip()
    post = f'#{index + 1}: {title}\n{link}\n'
    categories = ', '.join(html.unescape(c.strip()) for c in ensure_list(entry.get('category', [])))
    if categories:
        post += f'{categories}\n'
    print(post)
def extract_json(template, html):
    rule = hext.Rule(template)
    document = hext.Html(html)
    return rule.extract(document)
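# Hypothetical usage of extract_json(); the template and markup below are made up
# purely to show the list-of-dicts result shape that hext.Rule.extract() returns.
template = '<li><a href:url @text:label /></li>'
markup = '<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>'
print(extract_json(template, markup))
# Expected shape (roughly): [{'url': '/a', 'label': 'A'}, {'url': '/b', 'label': 'B'}]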
def __post_init__(self):
    self.html = hext.Html(self.content.decode())
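# __post_init__ above implies a dataclass with a bytes `content` field whose decoded
# value is parsed once into a hext.Html document. A hypothetical container of that
# shape (the class name and field layout are assumptions, not the original):
from dataclasses import dataclass, field

import hext

@dataclass
class Page:
    content: bytes
    html: hext.Html = field(init=False)

    def __post_init__(self):
        self.html = hext.Html(self.content.decode())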
def _entries(self) -> List[FeedEntry]:
    feed_config = self.config

    # Retrieve URL content
    content = URLReader.url_content(self.url)

    # Parse entries
    log.debug('Parsing entries for %s.', self.url)
    if feed_config.get('jmes'):
        raw_entries = jmespath.search(feed_config['jmes'], json.loads(content)) or []  # search can return None
        entries = [FeedEntry(title=e['title'].strip(),
                             long_url=e['link'].strip(),
                             categories=[c.strip() for c in ensure_list(e.get('category', []))])
                   for e in raw_entries]
    elif feed_config.get('hext'):
        raw_entries = hext.Rule(feed_config['hext']).extract(hext.Html(content.decode()))
        entries = [FeedEntry(title=html.unescape(e['title'].strip()),
                             long_url=e['link'].strip(),
                             categories=[html.unescape(c.strip()) for c in ensure_list(e.get('category', []))])
                   for e in raw_entries]
    else:
        content = sanitize_xml(content)  # e.g. for unescaped & char in https://deepmind.com/blog/feed/basic/
        raw_entries = feedparser.parse(content.lstrip())['entries']
        entries = [FeedEntry(title=e['title'],
                             long_url=e['link'],
                             categories=[t['term'] for t in getattr(e, 'tags', [])])
                   for e in raw_entries]
    log_msg = f'Parsed {len(entries)} entries for {self}.'
    if entries:
        log.debug(log_msg)
    else:
        if feed_config.get('alerts', {}).get('empty', True):
            log_msg += (' Either check the feed configuration, or wait for its next read, '
                        'or set `alerts/empty` to `false` for it.')
            config.runtime.alert(log_msg)
        else:
            log.warning(log_msg)

    # Deduplicate entries
    entries = self._dedupe_entries(entries, after_what='reading feed')

    # Remove blacklisted entries
    blacklist = feed_config.get('blacklist', {})
    if blacklist:
        log.debug('Filtering %s entries using blacklist for %s.', len(entries), self)
        entries = [entry for entry in entries if not entry.listing(blacklist)]
        log.debug('Filtered to %s entries using blacklist for %s.', len(entries), self)

    # Keep only whitelisted entries
    whitelist = feed_config.get('whitelist', {})
    if whitelist:
        log.debug('Filtering %s entries using whitelist for %s.', len(entries), self)
        explain = whitelist.get('explain')
        whitelisted_entries: List[FeedEntry] = []
        for entry in entries:
            listing = entry.listing(whitelist)
            if listing:
                key, match = listing
                if explain and (key == 'title'):
                    span0, span1 = match.span()
                    title = entry.title
                    entry.title = title[:span0] + '*' + title[span0:span1] + '*' + title[span1:]
                whitelisted_entries.append(entry)
        entries = whitelisted_entries
        log.debug('Filtered to %s entries using whitelist for %s.', len(entries), self)

    # Enforce HTTPS URLs
    if feed_config.get('https', False):
        log.debug('Enforcing HTTPS for URLs in %s.', self)
        for entry in entries:
            if entry.long_url.startswith('http://'):
                entry.long_url = entry.long_url.replace('http://', 'https://', 1)
        log.debug('Enforced HTTPS for URLs in %s.', self)

    # Substitute entries
    sub = feed_config.get('sub')
    if sub:
        log.debug('Substituting entries for %s.', self)
        re_sub: Callable[[Dict[str, str], str], str] = lambda r, v: re.sub(r['pattern'], r['repl'], v)
        title_sub = sub.get('title')
        if title_sub:
            for entry in entries:
                entry.title = re_sub(title_sub, entry.title)
        url_sub = sub.get('url')
        if url_sub:
            for entry in entries:
                entry.long_url = re_sub(url_sub, entry.long_url)
        log.debug('Substituted entries for %s.', self)

    # Format entries
    format_config = feed_config.get('format')
    if format_config:
        log.debug('Formatting entries for %s.', self)
        format_re = format_config.get('re', {})
        format_str = format_config['str']
        for entry in entries:
            params = {'title': entry.title, 'url': entry.long_url}
            for key, val in params.copy().items():
                if key in format_re:
                    match = re.search(format_re[key], val)
                    if match:
                        params.update(match.groupdict())
            entry.title = format_str.get('title', '{title}').format_map(params)
            entry.long_url = format_str.get('url', '{url}').format_map(params)
        log.debug('Formatted entries for %s.', self)

    # Strip HTML tags from titles
    for entry in entries:
        # e.g. for http://rss.sciencedirect.com/publication/science/08999007 (Elsevier Nutrition journal)
        entry.title = html_to_text(entry.title)

    # Strip unicode quotes around titles
    quote_begin, quote_end = '“”'  # e.g. for https://www.sciencedirect.com/science/article/abs/pii/S0899900718307883
    for entry in entries:
        title = entry.title
        if (len(title) > 2) and (title[0] == quote_begin) and (title[-1] == quote_end):
            title = title[1:-1]
            if (quote_begin not in title) and (quote_end not in title):
                entry.title = title

    # Replace all-caps titles
    for entry in entries:
        if entry.title.isupper():  # e.g. for https://www.biorxiv.org/content/10.1101/667436v1
            entry.title = entry.title.capitalize()

    # Truncate titles
    feed_styled = style(self.name, **feed_config.get('style', {}).get('name', {}))
    for entry in entries:
        base_bytes_use = len(config.PRIVMSG_FORMAT.format(identity=config.runtime.identity,
                                                          channel=self.channel,
                                                          feed=feed_styled,
                                                          title='',
                                                          url=entry.post_url).encode())
        title_bytes_width = max(0, config.QUOTE_LEN_MAX - base_bytes_use)
        entry.title = shorten_to_bytes_width(entry.title, title_bytes_width)

    # Deduplicate entries again
    entries = self._dedupe_entries(entries, after_what='processing feed')

    log.debug('Returning %s entries for %s.', len(entries), self)
    return entries
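# _entries() above reads a number of optional keys from feed_config. An illustrative,
# non-exhaustive example of the dict shape the code expects; every value below is a
# placeholder, not a configuration from the original project:
feed_config_example = {
    'hext': '<p><a href:link @text:title /></p>',  # or 'jmes': '...' for JSON feeds
    'https': True,                                 # upgrade http:// links to https://
    'sub': {'title': {'pattern': r'\s+\|.*$', 'repl': ''}},
    'format': {'re': {'url': r'(?P<slug>[^/]+)/?$'},
               'str': {'url': 'https://example.com/{slug}'}},
    'alerts': {'empty': False},
}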
def html_to_text(text: str) -> str:
    # Ref: https://stackoverflow.com/a/56894409/
    return _HTML_TEXT_RULE.extract(hext.Html(f'<html>{text}</html>'))[0]['text']
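# _HTML_TEXT_RULE is referenced above but not defined in the snippet. Given that the
# extraction result is indexed as [0]['text'], a plausible definition is a rule that
# captures the text content of the wrapping <html> element (assumption, not verbatim):
import hext

_HTML_TEXT_RULE = hext.Rule('<html @text:text />')

print(html_to_text('Fish &amp; <b>chips</b>'))  # roughly: Fish & chips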