    def parse(self, response):
        # Parse the response with Hext and yield one item dict per extracted quote.
        html = hext.Html(response.text)
        quotes = self.rule_quotes.extract(html)
        yield from quotes

        # Follow the pagination link, if the rule matched one.
        next_page_url = self.rule_next_page_url.extract(html)
        url = next(iter(next_page_url), None)
        if url is not None:
            yield scrapy.Request(response.urljoin(url["next_page_url"]))
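The parse() method above assumes a surrounding Scrapy spider that defines the two Hext rules. A minimal sketch of such a spider follows, assuming the classic quotes.toscrape.com layout; the class name, start URL, and rule templates are illustrative assumptions, not part of the original snippet.

import hext
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    # One dict per quote: the quoted text and its author (template assumed).
    rule_quotes = hext.Rule(
        '<div class="quote">'
        '<span class="text" @text:text />'
        '<small class="author" @text:author />'
        '</div>')

    # The pagination link, extracted as {"next_page_url": "..."} (template assumed).
    rule_next_page_url = hext.Rule('<li class="next"><a href:next_page_url /></li>')

    # ...plus the parse() method shown above.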
Example #2
def test_rule_extracts_list_of_dict():
    html = hext.Html("<html><body>💩</body></html>")
    rule = hext.Rule("<body @text:body />")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 1
    assert type(result[0]) is dict
    assert "body" in result[0]
    assert type(result[0]["body"]) is str
    assert result[0]["body"] == "💩"
Example #3
def no_side_effect():
    rule = hext.Rule("<div id:id><a href:x /></div>")
    html = hext.Html("""
        <div id="div">
            <a href="one.html">  <img src="one.jpg" />  </a>
            <a href="two.html">  <img src="two.jpg" />  </a>
            <a href="three.html"><img src="three.jpg" /></a>
        </div>""")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 1
    assert len(result[0]) == 2
    assert len(result[0]['x']) == 3
Example #4
def test_rule_extracts_empty_list():
    html = hext.Html("<html><body>💩</body></html>")
    rule = hext.Rule("<a @text:nope />")
    result = rule.extract(html)
    assert type(result) is list
    assert len(result) == 0
Example #5
def test_rule_extract_throws_on_max_search_error():
    html = hext.Html("""
        <html><body><div><span><span></span></span></div></body></html>""")
    rule = hext.Rule("<*>{<*>{<*/>}</*>}</*>")
    with pytest.raises(RuntimeError):
        rule.extract(html, 1)
Example #6
def test_html_accepts_unicode_str():
    html = hext.Html(u"<html><body>💩</body></html>")
Example #7

from docopt import docopt

import hext

if __name__ == "__main__":
    docopt_args = docopt(__doc__)

    # Determine which sub-command was requested.
    option = None
    if "build-template" in docopt_args.keys():
        option = "build-template"
        docopt_args.pop("build-template")
    elif "extract" in docopt_args.keys():
        option = "extract"
        docopt_args.pop("extract")

    # Strip the leading "--", remove "<" and ">", and convert "-" to "_"
    # so the remaining docopt keys become valid identifiers.
    args = {}
    for key in docopt_args:
        args[key[2:].replace('<', '').replace('>', '').replace('-', '_')] = docopt_args[key]

    if option == "build-template":
        pass  # build-template handling is not shown in this snippet
    elif option == "extract":
        # TODO: walk directory, feed files to below:
        rule = hext.Rule(strhext)      # strhext: Hext template source, not defined in this snippet
        document = hext.Html(strhtml)  # strhtml: HTML input, not defined in this snippet
        result = rule.extract(document)

Example #8

import html

import hext
import requests

from ircrssfeedbot import config
from ircrssfeedbot.feed import ensure_list
from ircrssfeedbot.util.urllib import url_to_netloc

# Customize:
URL = 'https://ergo-log.com'
HEXT = '<p id="kopindex"><a href:prepend("https://ergo-log.com/"):link @text:title @text=~".+"/></p>'

user_agent = config.USER_AGENT_OVERRIDES.get(url_to_netloc(URL),
                                             config.USER_AGENT_DEFAULT)
content = requests.Session().get(URL,
                                 timeout=config.REQUEST_TIMEOUT,
                                 headers={
                                     'User-Agent': user_agent
                                 }).content
entries = hext.Rule(HEXT).extract(hext.Html(content.decode()))

for index, entry in enumerate(entries):
    title, link = html.unescape(entry['title'].strip()), entry['link'].strip()
    post = f'#{index+1}: {title}\n{link}\n'
    categories = ', '.join(
        html.unescape(c.strip())
        for c in ensure_list(entry.get('category', [])))
    if categories:
        post += f'{categories}\n'
    print(post)
Example #9
def extract_json(template, html):
    rule = hext.Rule(template)
    document = hext.Html(html)
    return rule.extract(document)
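A brief usage sketch for extract_json; the template, markup, and expected output below are illustrative assumptions in the style of the earlier test cases.

template = '<a href:url @text:label />'
markup = '<html><body><a href="one.html">One</a></body></html>'
result = extract_json(template, markup)
# Expected shape: a list of dicts, e.g. [{'url': 'one.html', 'label': 'One'}]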
Example #10
    def __post_init__(self):
        self.html = hext.Html(self.content.decode())
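The __post_init__ above implies a dataclass holding raw response bytes in a content field; a minimal sketch of such a class follows, where the class name is an assumption and only the content and html fields come from the snippet.

import dataclasses

import hext


@dataclasses.dataclass
class FetchedPage:  # class name is an assumption
    content: bytes
    html: hext.Html = dataclasses.field(init=False)

    def __post_init__(self):
        # Decode the raw response bytes and parse them once, up front.
        self.html = hext.Html(self.content.decode())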
Example #11
    def _entries(self) -> List[FeedEntry]:
        feed_config = self.config

        # Retrieve URL content
        content = URLReader.url_content(self.url)

        # Parse entries
        log.debug('Parsing entries for %s.', self.url)
        if feed_config.get('jmes'):
            # jmespath.search can return None, hence the fallback to [].
            raw_entries = jmespath.search(feed_config['jmes'],
                                          json.loads(content)) or []
            entries = [
                FeedEntry(title=e['title'].strip(),
                          long_url=e['link'].strip(),
                          categories=[
                              c.strip()
                              for c in ensure_list(e.get('category', []))
                          ]) for e in raw_entries
            ]
        elif feed_config.get('hext'):
            raw_entries = hext.Rule(feed_config['hext']).extract(
                hext.Html(content.decode()))
            entries = [
                FeedEntry(title=html.unescape(e['title'].strip()),
                          long_url=e['link'].strip(),
                          categories=[
                              html.unescape(c.strip())
                              for c in ensure_list(e.get('category', []))
                          ]) for e in raw_entries
            ]
        else:
            # e.g. for unescaped & char in https://deepmind.com/blog/feed/basic/
            content = sanitize_xml(content)
            raw_entries = feedparser.parse(content.lstrip())['entries']
            entries = [
                FeedEntry(
                    title=e['title'],
                    long_url=e['link'],
                    categories=[t['term'] for t in getattr(e, 'tags', [])])
                for e in raw_entries
            ]
        log_msg = f'Parsed {len(entries)} entries for {self}.'
        if entries:
            log.debug(log_msg)
        else:
            if feed_config.get('alerts', {}).get('empty', True):
                log_msg += ' Either check the feed configuration, or wait for its next read, ' \
                           'or set `alerts/empty` to `false` for it.'
                config.runtime.alert(log_msg)
            else:
                log.warning(log_msg)

        # Deduplicate entries
        entries = self._dedupe_entries(entries, after_what='reading feed')

        # Remove blacklisted entries
        blacklist = feed_config.get('blacklist', {})
        if blacklist:
            log.debug('Filtering %s entries using blacklist for %s.',
                      len(entries), self)
            entries = [
                entry for entry in entries if not entry.listing(blacklist)
            ]
            log.debug('Filtered to %s entries using blacklist for %s.',
                      len(entries), self)

        # Keep only whitelisted entries
        whitelist = feed_config.get('whitelist', {})
        if whitelist:
            log.debug('Filtering %s entries using whitelist for %s.',
                      len(entries), self)
            explain = whitelist.get('explain')
            whitelisted_entries: List[FeedEntry] = []
            for entry in entries:
                listing = entry.listing(whitelist)
                if listing:
                    key, match = listing
                    if explain and (key == 'title'):
                        span0, span1 = match.span()
                        title = entry.title
                        entry.title = (title[:span0] + '*' +
                                       title[span0:span1] + '*' +
                                       title[span1:])
                    whitelisted_entries.append(entry)
            entries = whitelisted_entries
            log.debug('Filtered to %s entries using whitelist for %s.',
                      len(entries), self)

        # Enforce HTTPS URLs
        if feed_config.get('https', False):
            log.debug('Enforcing HTTPS for URLs in %s.', self)
            for entry in entries:
                if entry.long_url.startswith('http://'):
                    entry.long_url = entry.long_url.replace(
                        'http://', 'https://', 1)
            log.debug('Enforced HTTPS for URLs in %s.', self)

        # Substitute entries
        sub = feed_config.get('sub')
        if sub:
            log.debug('Substituting entries for %s.', self)
            re_sub: Callable[[Dict[str, str], str], str] = (
                lambda r, v: re.sub(r['pattern'], r['repl'], v))
            title_sub = sub.get('title')
            if title_sub:
                for entry in entries:
                    entry.title = re_sub(title_sub, entry.title)
            url_sub = sub.get('url')
            if url_sub:
                for entry in entries:
                    entry.long_url = re_sub(url_sub, entry.long_url)
            log.debug('Substituted entries for %s.', self)

        # Format entries
        format_config = feed_config.get('format')
        if format_config:
            log.debug('Formatting entries for %s.', self)
            format_re = format_config.get('re', {})
            format_str = format_config['str']
            for entry in entries:
                params = {'title': entry.title, 'url': entry.long_url}
                for key, val in params.copy().items():
                    if key in format_re:
                        match = re.search(format_re[key], val)
                        if match:
                            params.update(match.groupdict())
                entry.title = format_str.get('title',
                                             '{title}').format_map(params)
                entry.long_url = format_str.get('url',
                                                '{url}').format_map(params)
            log.debug('Formatted entries for %s.', self)

        # Strip HTML tags from titles
        for entry in entries:
            # e.g. for http://rss.sciencedirect.com/publication/science/08999007  (Elsevier Nutrition journal)
            entry.title = html_to_text(entry.title)

        # Strip unicode quotes around titles
        quote_begin, quote_end = '“”'
        # e.g. for https://www.sciencedirect.com/science/article/abs/pii/S0899900718307883
        for entry in entries:
            title = entry.title
            if (len(title) > 2 and title[0] == quote_begin
                    and title[-1] == quote_end):
                title = title[1:-1]
                if (quote_begin not in title) and (quote_end not in title):
                    entry.title = title

        # Replace all-caps titles
        for entry in entries:
            # e.g. for https://www.biorxiv.org/content/10.1101/667436v1
            if entry.title.isupper():
                entry.title = entry.title.capitalize()

        # Truncate titles
        feed_styled = style(self.name,
                            **feed_config.get('style', {}).get('name', {}))
        for entry in entries:
            base_bytes_use = len(
                config.PRIVMSG_FORMAT.format(identity=config.runtime.identity,
                                             channel=self.channel,
                                             feed=feed_styled,
                                             title='',
                                             url=entry.post_url).encode())
            title_bytes_width = max(0, config.QUOTE_LEN_MAX - base_bytes_use)
            entry.title = shorten_to_bytes_width(entry.title,
                                                 title_bytes_width)

        # Deduplicate entries again
        entries = self._dedupe_entries(entries, after_what='processing feed')

        log.debug('Returning %s entries for %s.', len(entries), self)
        return entries
Example #12
def html_to_text(text: str) -> str:
    # Ref: https://stackoverflow.com/a/56894409/
    return _HTML_TEXT_RULE.extract(hext.Html(f'<html>{text}</html>'))[0]['text']
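The module-level _HTML_TEXT_RULE used above is not shown in the snippet; a minimal sketch of a definition that fits this usage follows, where the exact template is an assumption.

import hext

# Capture the text content of the wrapping <html> element under the key "text".
_HTML_TEXT_RULE = hext.Rule('<html @text:text />')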