Example #1
def get_data_from_page(self, page, url):
    try:
        # page + '' raises TypeError unless page is already a str
        feed = parse_feed(StringIO(page + ''))
    except TypeError:
        feed = parse_feed(StringIO(str(page)))
    # remember when this URL was last fetched
    self.fact.cache[url] = time.time()
    return feed
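The try/except TypeError above is a duck-typing check: concatenating '' succeeds only when page is already a string; otherwise the value is coerced with str(). If parse_feed is feedparser.parse (an assumption; the import is not shown), the StringIO wrapper is not required, since the parser accepts raw feed text directly. A minimal sketch:

import feedparser

# feedparser.parse accepts a URL, a file-like object, or raw feed text
raw = "<rss version='2.0'><channel><title>demo</title></channel></rss>"
feed = feedparser.parse(raw)
print(feed.feed.title)  # 'demo'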
Example #2
def get_data_from_page(self, page_content, url):
    if not page_content:
        # empty result from ConditionalGetPage when the Last-Modified
        # header has not changed
        return
    self.fact.cache[url] = time.time()
    if self.fact.name == "pages":
        return page_content
    try:
        # page_content + '' raises TypeError unless it is already a str
        feed = parse_feed(StringIO(page_content + ''))
    except TypeError:
        feed = parse_feed(StringIO(str(page_content)))
    return feed
Example #3
def parse_tweets(raw_tweets, source, now=None):
    """
    Parses a list of raw tweet lines from a twtxt file
    and returns a list of :class:`Tweet` objects.

    :param list raw_tweets: list of raw tweet lines
    :param Source source: the source of the given tweets
    :param Datetime now: the current datetime

    :returns: a list of parsed :class:`Tweet` objects
    :rtype: list
    """
    if now is None:
        now = datetime.now(timezone.utc)

    dom = parse_feed('\n'.join(raw_tweets))
    if dom.bozo:
        # the joined lines are not a well-formed feed, so parse them
        # one by one as plain twtxt lines instead
        tweets = []
        for line in raw_tweets:
            try:
                tweet = parse_tweet(line, source, now)
            except (ValueError, OverflowError) as e:
                logger.debug("{0} - {1}".format(source.url, e))
            else:
                tweets.append(tweet)
    else:
        tweets = [
            Tweet(
                click.unstyle(m.title.strip()) + ' ' + m.links[0].href,
                parse_iso8601(m.updated),
                source
            ) for m in dom.entries
        ]

    return tweets
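The dom.bozo branch above leans on feedparser's error handling: parse_feed (assumed here to be feedparser.parse) does not raise on malformed input; it sets the bozo flag and keeps the underlying error in bozo_exception. A minimal sketch:

import feedparser

feed = feedparser.parse("this is not a feed")
if feed.bozo:
    # the original parser error is preserved for inspection
    print("malformed feed:", feed.bozo_exception)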
Example #4
def feed(self) -> Feed:
    if self._feed:
        return self._feed
    if self.feed_config.save_bandwith:
        # conditional GET: reuse the ETag / Last-Modified values saved
        # from the previous fetch
        raw_feed = parse_feed(
            self.feed_config.source,
            etag=self.feed_config.etag,
            modified=self.feed_config.modified,
        )
    else:
        raw_feed = parse_feed(self.feed_config.source)
    if raw_feed.status == 304:
        # nothing changed on the server since the last fetch
        return Feed()
    self.feed_config.etag = raw_feed.etag
    self.feed_config.modified = raw_feed.modified
    self._feed = Feed.from_feedparser(raw_feed)
    return self._feed
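The example above uses feedparser's built-in HTTP conditional GET: passing the etag and modified values saved from a previous fetch lets the server reply 304 Not Modified, which feedparser exposes as the result's status attribute. A minimal sketch against a placeholder URL, assuming parse_feed is feedparser.parse:

import feedparser

url = "https://example.com/feed.xml"  # placeholder

first = feedparser.parse(url)
# replay the validators from the first response; absent keys stay None
second = feedparser.parse(url, etag=first.get("etag"),
                          modified=first.get("modified"))
if getattr(second, "status", None) == 304:
    print("not modified since last fetch")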
Example #5
File: rss.py Project: prologic/kdb
    def __call__(self):
        d = parse_feed(self.url)

        if self.title == "" and self.link == "":
            self.title = getattr(d.feed, "title", "")
            self.link = getattr(d.feed, "link", "")

        new = []
        for v in d.entries:
            e = {
                "time": mktime(v.updated_parsed),
                "title": v.title,
                "summary": html2text(v.summary).strip().split("\n")[0],
                "link": v.links[0].href
            }

            if e not in self.entries:
                self.entries.append(e)
                new.append(e)

        if new:
            s = []
            s.append("RSS: {0:s} ({1:s})".format(self.title, self.link))

            for e in new[:3]:
                # truncate the summary so title + summary + link stay
                # under 450 characters per line
                x = sum([len(e["title"]), len(e["summary"]), len(e["link"])])
                if x > 450:
                    y = sum([len(e["title"]), len(e["link"])])
                    s.append(
                        " * {0:s}: {1:s} ... <{2:s}>".format(
                            e["title"],
                            e["summary"][:(450 - y)],
                            e["link"]
                        )
                    )
                else:
                    s.append(
                        " * {0:s}: {1:s} <{2:s}>".format(
                            e["title"],
                            e["summary"],
                            e["link"]
                        )
                    )
            return s
        else:
            return []
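One caveat in the example above: feedparser returns updated_parsed as a time.struct_time in UTC, while time.mktime interprets its argument as local time, so the stored timestamps drift by the local UTC offset. calendar.timegm is the UTC counterpart; a minimal sketch against a placeholder URL:

import calendar
import feedparser

d = feedparser.parse("https://example.com/feed.xml")  # placeholder
for entry in d.entries:
    if entry.get("updated_parsed"):
        # timegm treats the struct_time as UTC, matching feedparser's output
        print(entry.title, calendar.timegm(entry.updated_parsed))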
Example #6
    async def fetch(self) -> FeedParserDict:
        headers = ({} if self.user_agent is None else {
            'User-Agent': self.user_agent
        })
        async with ClientSession(headers=headers) as session:
            async with session.get(self.url) as response:
                if response.status != 200:
                    raise FeedError(f'Feed {self.name!r}: error sending '
                                    f'request to {self.url!r}')
                text = await response.text()

        rss = parse_feed(text)
        if rss['bozo']:
            raise FeedError(
                f'Feed {self.name!r}: error parsing url {self.url!r}'
            ) from rss['bozo_exception']

        # note: 'logging' must be an async logger (e.g. aiologger); the
        # stdlib logging module's info() is not awaitable
        await logging.info(f'Feed {self.name!r}: downloaded url {self.url!r}')
        return rss
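A sketch of driving the coroutine above. FeedWatcher is a hypothetical stand-in for whatever class defines fetch(), assumed to take name, url, and user_agent attributes:

import asyncio

async def main():
    # hypothetical class exposing the fetch() coroutine shown above
    watcher = FeedWatcher(name="demo", url="https://example.com/feed.xml",
                          user_agent="demo-bot/1.0")
    rss = await watcher.fetch()
    print(len(rss.entries), "entries")

asyncio.run(main())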
Example #7
def make_site_with_rssfeed_readable_again(url, filename, is_clean):
    """Convert a feed to an HTML file."""
    with open(filename, 'w', encoding='utf-8') as file_object:
        print("\nOPENING URL: " + url + "\n\n")
        headers = {
            'User-Agent':
            APP_BRANDNAME + '/' + APP_RELEASE +
            ' (Unix; Intel OS Nine 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
        }

        response = requests.get(url, headers=headers)
        mystr = response.text

        # remove height and width attributes on images because CSS handles sizing
        mystr = mystr.replace("height=", "whatever=")
        mystr = mystr.replace("width=", "whatever=")

        # remove unwanted strings in output
        mystr = mystr.replace('<hr id=', '<hr class="spenden" id=')
        mystr = mystr.replace("<p><strong>Hilf mit!</strong>", "")
        mystr = mystr.replace(
            "Mit Deiner finanziellen Hilfe unterstützt Du unabhängigen Journalismus.",
            "")

        if APP_DEBUG:
            print("FEED:\n" + mystr + "\n****************************")

        feedtitle = None
        entries = []  # default in case parsing fails below
        try:
            root = parse_feed(mystr)
            entries = root.entries

            # access feedtitle
            feedtitle = root.feed.title
        except Exception as e:
            print("PARSING-ERROR: " + str(e))
            print(traceback.format_exc())

        if not feedtitle:
            feedtitle = DEFAULT_TITLE

        if is_clean:
            template = APP_PATH + '/' + 'template_clean.html'
        else:
            template = APP_PATH + '/' + 'template_readable.html'

        if APP_DEBUG:
            print("\n ENTRIES TO RENDER: " + str(len(entries)) + "\n")

        last_entry_link = entries[-1].link if entries else ""
        html_footer = site_footer_html()
        html_content = Template(filename=template).render(
            last_entry_link=last_entry_link,
            num_of_entries=len(entries),
            feedurl=url,
            entries=entries,
            feedtitle=feedtitle,
            footer=html_footer)

        if APP_DEBUG:
            print("HTML:\n" + html_content + "\n****************************")

        if is_clean:
            clean = Document(html_content)
            file_object.write(clean.content())
        else:
            file_object.write(html_content)
Example #8
async def parse(content):
    return parse_feed(content)
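Wrapping parse_feed in a coroutine, as above, does not make it non-blocking: the parser still runs synchronously on the event loop. If parse_feed is feedparser.parse, a sketch of offloading it to a worker thread (Python 3.9+):

import asyncio
import feedparser

async def parse(content):
    # run the blocking parser in a thread so the event loop stays responsive
    return await asyncio.to_thread(feedparser.parse, content)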