def parse_entry(self, entry):
    """
    Turn one RSS feed entry into a flat dictionary of fields.

    The entry's body and description are fetched through the
    ``get_body`` / ``get_description`` helpers on ``self``. If the
    body is falsy, the description is used as the body and the
    description is reported as None.

    Returns a dict with the keys: id, url, domain, body, title,
    description, tags, authors, created, img_url and links.
    """
    link = self.get_url(entry)

    content = self.get_body(entry)
    summary = self.get_description(entry)
    if not content:
        # nothing usable as a body: promote the description instead
        content, summary = summary, None

    # fields are filled in the same order they appear in the result
    parsed = {}
    parsed['id'] = entry.id
    parsed['url'] = link
    parsed['domain'] = url.get_domain(link)
    parsed['body'] = html.prepare(content, link)
    parsed['title'] = self.get_title(entry)
    parsed['description'] = html.prepare(summary, link)
    parsed['tags'] = self.get_tags(entry)
    parsed['authors'] = self.get_authors(entry)
    parsed['created'] = self.get_created(entry)
    # image and link extraction work on the unprepared body
    parsed['img_url'] = self.get_img_url(entry, content)
    parsed['links'] = self.get_links(content, link)
    return parsed
def body_via_readability(page_html, source_url):
    """
    Extract an article body from raw page html using readability.

    Builds a ``Document`` from ``page_html`` and takes its summary.
    Returns the summary passed through ``html.prepare`` together with
    ``source_url``, or None when the summary is empty.
    """
    summary = Document(page_html).summary()
    if summary:
        return html.prepare(summary, source_url)
    return None
def _prepare_str(o, field, source_url=None): """ Prepare text/html field """ if field not in o: return None if o[field] is None: return None if html.is_html(o[field]): return html.prepare(o[field], source_url) return text.prepare(o[field])
def prepare_str(o, field, source_url=None):
    """
    Normalise the text or html value found at ``o[field]``.

    A missing ``field`` or a value of None yields None. Any other
    value is checked with ``html.is_html``: html is handed to
    ``html.prepare`` with ``source_url``, anything else to
    ``text.prepare``.
    """
    raw = o[field] if field in o else None
    if raw is None:
        return None

    if html.is_html(raw):
        return html.prepare(raw, source_url)
    return text.prepare(raw)
def body_via_article_tag(soup, source_url):
    """
    Pull content out of the first "article" tag in a document.

    ``soup`` may be a ``BeautifulSoup`` instance or anything the
    ``BeautifulSoup`` constructor accepts; the latter is parsed first.

    Returns a ``(body, raw_html)`` tuple, where ``raw_html`` is the
    result of ``html.get_inner`` on the first article tag and ``body``
    is that markup after ``html.prepare``. Returns ``(None, None)``
    when no article tag is found.
    """
    # accept unparsed input as well as an existing soup
    doc = soup if isinstance(soup, BeautifulSoup) else BeautifulSoup(soup)

    matches = doc.find_all('article')
    if not matches:
        return None, None

    # only the first article tag is considered
    inner_html = html.get_inner(matches[0])
    return html.prepare(inner_html, source_url), inner_html
def body_via_embedly(source_url):
    """
    Extract content for a url through Embed.ly's API.

    Returns the response's 'content' value passed through
    ``html.prepare``, or None when the response type is 'error'.
    """
    # ask embedly to extract the page
    response = embedly_api.extract(source_url)

    # only non-error responses carry usable content
    if response['type'] != 'error':
        return html.prepare(response.get('content'), source_url)
    return None
def parse_entry(self, entry):
    """
    Turn one RSS feed entry into a flat dictionary of fields.

    Body and description come from the ``get_body`` and
    ``get_description`` helpers on ``self``. When the body is falsy
    the description takes its place and the description is reported
    as None.

    Returns a dict with the keys: url, body, title, description,
    tags, authors, created, img_url and links.
    """
    link = self.get_url(entry)

    content = self.get_body(entry)
    summary = self.get_description(entry)
    if not content:
        # fall back to the description when there is no body
        content, summary = summary, None

    # computed in the same order as the keys of the result
    prepared_body = html.prepare(content, link)
    title = self.get_title(entry)
    prepared_summary = html.prepare(summary, link)
    tags = self.get_tags(entry)
    authors = self.get_authors(entry)
    created = self.get_created(entry)
    # image and link extraction work on the unprepared body
    img_url = self.get_img_url(entry, content)
    links = self.get_links(content, link)

    return {
        'url': link,
        'body': prepared_body,
        'title': title,
        'description': prepared_summary,
        'tags': tags,
        'authors': authors,
        'created': created,
        'img_url': img_url,
        'links': links,
    }