def format(self, obj):
    """
    Format a raw event dict in place and return it, or return None
    to signal that the event should be skipped.

    For now all of these options are standard to twitter events.

    Steps, in order (later steps may read fields set by earlier ones):

    1. set ``status`` from the ``event_status`` option
       (default ``'pending'``).
    2. pass ``obj['url']`` through ``url.prepare`` with expansion and
       canonicalization turned off.
    3. skip the event if ``self._is_bad_domain`` flags the url.
    4. if the url is an article url, merge the extracted article data
       into ``obj`` and drop the ``type`` / ``site_name`` / ``favicon``
       keys.
    5. derive ``source_id`` from ``id``, falling back to the url and
       then to a generated uuid.
    6. apply the ``set_event_title`` / ``set_event_description`` /
       ``set_event_tag_ids`` / ``set_event_content_items`` options.
    7. skip the event if ``must_link`` is set and it has no links.

    :param obj: event dict; must contain a ``'url'`` key.
    :returns: the same (mutated) dict, or None when the event is
        filtered out.
    """
    options = self.options

    # set the status.
    obj['status'] = options.get('event_status', 'pending')

    # prepare url (these are formatted as redirects).
    obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

    # ignore bad domains / org's own domains.
    if self._is_bad_domain(obj['url']):
        return None

    # extract and merge article data.
    if url.is_article(obj['url']):
        data = article.extract(obj['url'], type=None)
        if data:
            obj.update(data)
            obj.pop('type', None)
            obj.pop('site_name', None)
            obj.pop('favicon', None)

    # set source id: prefer the event's own id, then its url, and only
    # generate a uuid when neither is available.
    _id = obj.pop('id', None)
    if _id is None:
        _id = obj.get('url')
    if _id is None:
        _id = gen_uuid()
    # only text ids can carry a "prefix:" to strip; numeric ids are
    # kept as they are instead of raising on the membership test.
    if hasattr(_id, 'split') and ':' in _id:
        _id = _id.split(':')[-1]
    obj['source_id'] = _id

    # TODO: Make formatting more elegant.
    # NOTE: self._fmt(obj) is re-evaluated for each template so the
    # description template sees the freshly set title.
    title_tmpl = options.get('set_event_title', None)
    if title_tmpl:
        obj['title'] = title_tmpl.format(**self._fmt(obj))

    description_tmpl = options.get('set_event_description', None)
    if description_tmpl:
        obj['description'] = description_tmpl.format(**self._fmt(obj))

    tag_ids = options.get('set_event_tag_ids', None)
    if tag_ids:
        obj['tag_ids'] = tag_ids

    # hack because the app can't handle this field being a list.
    # content items may arrive as dicts carrying an 'id' or as bare
    # integer ids; anything else is ignored.
    content_items = options.get('set_event_content_items', None)
    if content_items:
        content_item_ids = obj.setdefault('content_item_ids', [])
        for c in content_items:
            if isinstance(c, dict):
                if c.get('id', None):
                    content_item_ids.append(c.get('id'))
            elif isinstance(c, int):
                content_item_ids.append(c)

    # filter links.
    if options.get('must_link', False) and not obj.get('links'):
        return None

    return obj
def extract_articles(feed_url, domains=None):
    """
    Parse entries from an rss feed, extract article urls, and run
    article extraction on each.

    This is a generator: nothing is fetched until it is iterated.
    Results are yielded in completion order, not feed order, because
    they come from ``imap_unordered``.

    :param feed_url: url of the feed handed to ``FeedExtractor``.
    :param domains: optional list of domains handed to
        ``FeedExtractor``; defaults to an empty list.
    :returns: yields whatever ``article.extract`` returns for each
        article url found in the feed.
    """
    # use a fresh list per call rather than a shared mutable default.
    if domains is None:
        domains = []

    entries = FeedExtractor(feed_url, domains).run()
    urls = [e['url'] for e in entries if url.is_article(e.get('url'))]

    # nothing to extract; also avoids building a zero-sized pool.
    if not urls:
        return

    # one worker per url that is actually submitted.
    p = Pool(len(urls))
    for a in p.imap_unordered(article.extract, urls):
        yield a
def is_article_url(self, u, **kw):
    """
    Overridable hook for testing whether a url leads to an article.

    :param u: the url to test; any falsy value yields False.
    :param kw: may carry ``pattern``, either a regex string or an
        object that is already an instance of ``RE_TYPE``
        (presumably the compiled-pattern type -- confirm against its
        definition).
    :returns: True when the url is judged to be an article url.
    :raises PageOneError: when no pattern is given and
        ``NEWSLYNX_IMPORTED`` is falsy.
    """
    # an empty / missing url can never be an article.
    if not u:
        return False

    pattern = kw.get('pattern', None)

    # an explicit pattern always wins.
    if pattern:
        if isinstance(pattern, RE_TYPE):
            regex = pattern
        else:
            regex = re.compile(pattern)
        return regex.search(u) is not None

    # no pattern: the only remaining option is the imported helper.
    if not NEWSLYNX_IMPORTED:
        raise PageOneError(
            'If newslynx is not imported, A "pattern" is required to identify '
            'which link urls point to articles.')
    return is_article(u)