def format(self, obj):
        """
        Normalize a raw event dict in place and return it.

        For now all of these options are standard to twitter events.

        Steps, in order:

        1. Set ``status`` from the ``event_status`` option
           (default ``'pending'``).
        2. Run ``obj['url']`` through ``url.prepare`` without expanding
           or canonicalizing it.
        3. Drop the event if ``self._is_bad_domain`` flags the url.
        4. If the url is an article, merge the extracted article data
           into ``obj`` and discard ``type``, ``site_name`` and
           ``favicon``.
        5. Derive ``source_id`` from ``id`` (removed from ``obj``),
           falling back to the url and then to a generated uuid.
        6. Apply the optional ``set_event_title``,
           ``set_event_description``, ``set_event_tag_ids`` and
           ``set_event_content_items`` options.
        7. Drop the event if ``must_link`` is set and it has no links.

        :param obj: the event dict; must contain a ``'url'`` key.
        :returns: the mutated ``obj``, or ``None`` when the event is
            filtered out.
        """
        options = self.options

        # set the status.
        obj['status'] = options.get('event_status', 'pending')

        # prepare url (these are formatted as redirects).
        obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

        # ignore bad domains / org's own domains.
        if self._is_bad_domain(obj['url']):
            return None

        # extract and merge article data.
        if url.is_article(obj['url']):
            data = article.extract(obj['url'], type=None)
            if data:
                obj.update(data)
                for key in ('type', 'site_name', 'favicon'):
                    obj.pop(key, None)

        # set source id: prefer the explicit id, then the url, and only
        # generate a uuid when neither is available.
        if 'id' in obj:
            _id = obj.pop('id')
        elif 'url' in obj:
            _id = obj['url']
        else:
            _id = gen_uuid()

        # ids are not guaranteed to be strings (e.g. a numeric id); only
        # strip a "prefix:" when the membership test is actually supported.
        try:
            has_prefix = ":" in _id
        except TypeError:
            has_prefix = False
        if has_prefix:
            _id = _id.split(':')[-1]
        obj['source_id'] = _id

        # TODO: Make formatting more elegant.
        # NOTE: _fmt(obj) is re-evaluated for each template so that the
        # description template sees a title set just above.
        title_tmpl = options.get('set_event_title', None)
        if title_tmpl:
            obj['title'] = title_tmpl.format(**self._fmt(obj))

        description_tmpl = options.get('set_event_description', None)
        if description_tmpl:
            obj['description'] = description_tmpl.format(**self._fmt(obj))

        tag_ids = options.get('set_event_tag_ids', None)
        if tag_ids:
            obj['tag_ids'] = tag_ids

        # hack because the app cant handle this field being a list.
        content_items = options.get('set_event_content_items', None)
        if content_items:
            item_ids = obj.setdefault('content_item_ids', [])
            for c in content_items:
                if isinstance(c, dict):
                    # dict items contribute their (truthy) 'id' only.
                    if c.get('id', None):
                        item_ids.append(c.get('id'))
                elif isinstance(c, int):
                    item_ids.append(c)

        # filter links.
        if options.get('must_link', False) and not obj.get('links'):
            return None
        return obj
Beispiel #2
0
def extract_articles(feed_url, domains=None):
    """
    Parse entries from an rss feed, extract article urls, and
    run article extraction on each.

    This is a generator: results are yielded in whatever order the
    pool's ``imap_unordered`` produces them.

    :param feed_url: url of the feed, passed to ``FeedExtractor``.
    :param domains: optional list of domains passed to
        ``FeedExtractor``; defaults to an empty list.
    """
    # a fresh list per call avoids sharing one mutable default.
    if domains is None:
        domains = []

    entries = FeedExtractor(feed_url, domains).run()
    urls = [e['url'] for e in entries if url.is_article(e.get('url'))]

    # nothing to extract; also avoids building a zero-sized pool.
    # NOTE(review): a size of 0 is rejected by multiprocessing.Pool and
    # can never run a task in gevent's Pool -- confirm which is imported.
    if not urls:
        return

    # one worker per url actually processed, not per feed entry.
    p = Pool(len(urls))
    for a in p.imap_unordered(article.extract, urls):
        yield a
Beispiel #3
0
def extract_articles(feed_url, domains=None):
    """
    Parse entries from an rss feed, extract article urls, and
    run article extraction on each.

    This is a generator: results are yielded in whatever order the
    pool's ``imap_unordered`` produces them.

    :param feed_url: url of the feed, passed to ``FeedExtractor``.
    :param domains: optional list of domains passed to
        ``FeedExtractor``; defaults to an empty list.
    """
    # a fresh list per call avoids sharing one mutable default.
    if domains is None:
        domains = []

    entries = FeedExtractor(feed_url, domains).run()
    urls = [e['url'] for e in entries if url.is_article(e.get('url'))]

    # nothing to extract; also avoids building a zero-sized pool.
    # NOTE(review): a size of 0 is rejected by multiprocessing.Pool and
    # can never run a task in gevent's Pool -- confirm which is imported.
    if not urls:
        return

    # one worker per url actually processed, not per feed entry.
    p = Pool(len(urls))
    for a in p.imap_unordered(article.extract, urls):
        yield a
Beispiel #4
0
 def is_article_url(self, u, **kw):
     """
     Overridable hook for deciding whether a url leads to an article.

     A falsy url is never an article. When a ``pattern`` keyword is
     given (a regex string or a compiled regex), the url is an article
     if the pattern is found anywhere in it. Without a pattern the
     decision is delegated to ``is_article`` when newslynx has been
     imported; otherwise a ``PageOneError`` is raised.
     """
     # empty / missing urls can never be articles.
     if not u:
         return False

     pattern = kw.get('pattern', None)
     if pattern:
         # accept both raw pattern strings and pre-compiled regexes.
         if not isinstance(pattern, RE_TYPE):
             pattern = re.compile(pattern)
         return pattern.search(u) is not None

     # no pattern supplied: newslynx is the only remaining option.
     if not NEWSLYNX_IMPORTED:
         raise PageOneError(
             'If newslynx is not imported, A "pattern" is required to identify '
             'which link urls point to articles.')
     return is_article(u)
Beispiel #5
0
 def is_article_url(self, u, **kw):
     """
     Overridable hook for deciding whether a url leads to an article.

     A falsy url is never an article. When a ``pattern`` keyword is
     given (a regex string or a compiled regex), the url is an article
     if the pattern is found anywhere in it. Without a pattern the
     decision is delegated to ``is_article`` when newslynx has been
     imported; otherwise a ``PageOneError`` is raised.
     """
     # empty / missing urls can never be articles.
     if not u:
         return False

     pattern = kw.get('pattern', None)
     if pattern:
         # accept both raw pattern strings and pre-compiled regexes.
         if not isinstance(pattern, RE_TYPE):
             pattern = re.compile(pattern)
         return pattern.search(u) is not None

     # no pattern supplied: newslynx is the only remaining option.
     if not NEWSLYNX_IMPORTED:
         raise PageOneError(
             'If newslynx is not imported, A "pattern" is required to identify '
             'which link urls point to articles.')
     return is_article(u)