Example #1
0
    def parse_entry(self, entry):
        """
        Convert a single RSS feed entry into a flat dictionary.

        The body and description are fetched with ``get_body`` and
        ``get_description``.  When the body is empty, the description is
        promoted to the body and the description is cleared.  Both values
        are then passed through ``html.prepare`` together with the entry url.

        :param entry: the feed entry to parse; its ``id`` attribute is read.
        :return: dict with the keys ``id``, ``url``, ``domain``, ``body``,
            ``title``, ``description``, ``tags``, ``authors``, ``created``,
            ``img_url`` and ``links``.
        """
        link = self.get_url(entry)

        content = self.get_body(entry)
        summary = self.get_description(entry)
        if not content:
            # no body available: fall back on the description
            content, summary = summary, None

        # NOTE: 'img_url' and 'links' are derived from the raw
        # (un-prepared) body, not from the prepared one.
        parsed = {}
        parsed['id'] = entry.id
        parsed['url'] = link
        parsed['domain'] = url.get_domain(link)
        parsed['body'] = html.prepare(content, link)
        parsed['title'] = self.get_title(entry)
        parsed['description'] = html.prepare(summary, link)
        parsed['tags'] = self.get_tags(entry)
        parsed['authors'] = self.get_authors(entry)
        parsed['created'] = self.get_created(entry)
        parsed['img_url'] = self.get_img_url(entry, content)
        parsed['links'] = self.get_links(content, link)
        return parsed
Example #2
0
def body_via_readability(page_html, source_url):
    """
    Extract the article body from a page with Readability.

    A ``Document`` is built from ``page_html`` and its ``summary()`` is
    taken as the body.  Returns ``None`` when the summary is empty,
    otherwise the summary passed through ``html.prepare`` with
    ``source_url``.
    """
    summary = Document(page_html).summary()
    return html.prepare(summary, source_url) if summary else None
Example #3
0
def body_via_readability(page_html, source_url):
    """
    Extract the article body from a page with Readability.

    A ``Document`` is built from ``page_html`` and its ``summary()`` is
    taken as the body.  Returns ``None`` when the summary is empty,
    otherwise the summary passed through ``html.prepare`` with
    ``source_url``.
    """
    extracted = Document(page_html).summary()
    if extracted:
        return html.prepare(extracted, source_url)
    return None
Example #4
0
def _prepare_str(o, field, source_url=None):
    """
    Prepare text/html field
    """
    if field not in o:
        return None
    if o[field] is None:
        return None
    if html.is_html(o[field]):
        return html.prepare(o[field], source_url)
    return text.prepare(o[field])
Example #5
0
def prepare_str(o, field, source_url=None):
    """
    Clean up a text or html field of ``o``.

    Returns ``None`` when ``field`` is absent from ``o`` or its value is
    ``None``.  Otherwise the value goes through ``html.prepare`` (with
    ``source_url``) when ``html.is_html`` recognises it, and through
    ``text.prepare`` in every other case.
    """
    if field not in o:
        return None

    # look the value up once and reuse it below
    raw = o[field]
    if raw is None:
        return None

    if not html.is_html(raw):
        return text.prepare(raw)
    return html.prepare(raw, source_url)
Example #6
0
def body_via_article_tag(soup, source_url):
    """
    Pull the body out of the first "article" tag of a page.

    ``soup`` may be a ``BeautifulSoup`` instance or anything the
    ``BeautifulSoup`` constructor accepts; in the latter case it is
    parsed first.

    Returns a ``(body, raw_html)`` tuple, where ``raw_html`` is the inner
    html of the first article tag and ``body`` is that html passed through
    ``html.prepare`` with ``source_url``.  Returns ``(None, None)`` when
    the page has no article tag.
    """
    tree = soup if isinstance(soup, BeautifulSoup) else BeautifulSoup(soup)

    matches = tree.find_all('article')
    if not matches:
        return None, None

    # only the first article tag is considered
    inner = html.get_inner(matches[0])
    return html.prepare(inner, source_url), inner
Example #7
0
def body_via_article_tag(soup, source_url):
    """
    Pull the body out of the first "article" tag of a page.

    ``soup`` may be a ``BeautifulSoup`` instance or anything the
    ``BeautifulSoup`` constructor accepts; in the latter case it is
    parsed first.

    Returns a ``(body, raw_html)`` tuple, where ``raw_html`` is the inner
    html of the first article tag and ``body`` is that html passed through
    ``html.prepare`` with ``source_url``.  Returns ``(None, None)`` when
    the page has no article tag.
    """
    parsed = soup if isinstance(soup, BeautifulSoup) else BeautifulSoup(soup)

    found = parsed.find_all('article')
    if not found:
        return None, None

    # only the first article tag is considered
    first_inner = html.get_inner(found[0])
    return html.prepare(first_inner, source_url), first_inner
Example #8
0
def body_via_embedly(source_url):
    """
    Extract a page's content through Embed.ly's API.

    Calls ``embedly_api.extract`` for ``source_url``.  Returns ``None``
    when the response's ``'type'`` is ``'error'``; otherwise the response's
    ``'content'`` entry passed through ``html.prepare`` with ``source_url``.
    """
    response = embedly_api.extract(source_url)

    # anything other than an error response carries the content to clean up
    if response['type'] != 'error':
        return html.prepare(response.get('content'), source_url)
    return None
Example #9
0
def body_via_embedly(source_url):
    """
    Extract a page's content through Embed.ly's API.

    Calls ``embedly_api.extract`` for ``source_url``.  Returns ``None``
    when the response's ``'type'`` is ``'error'``; otherwise the response's
    ``'content'`` entry passed through ``html.prepare`` with ``source_url``.
    """
    extraction = embedly_api.extract(source_url)

    failed = extraction['type'] == 'error'
    if failed:
        return None

    content = extraction.get('content')
    return html.prepare(content, source_url)
Example #10
0
    def parse_entry(self, entry):
        """
        Convert a single RSS feed entry into a flat dictionary.

        The body and description are fetched with ``get_body`` and
        ``get_description``.  When the body is empty, the description is
        promoted to the body and the description is cleared.  Both values
        are then passed through ``html.prepare`` together with the entry url.

        :param entry: the feed entry to parse.
        :return: dict with the keys ``url``, ``body``, ``title``,
            ``description``, ``tags``, ``authors``, ``created``,
            ``img_url`` and ``links``.
        """
        link = self.get_url(entry)

        raw_body = self.get_body(entry)
        raw_description = self.get_description(entry)
        if not raw_body:
            # no body available: fall back on the description
            raw_body, raw_description = raw_description, None

        # compute each field in the order it appears in the result
        clean_body = html.prepare(raw_body, link)
        title = self.get_title(entry)
        clean_description = html.prepare(raw_description, link)
        tags = self.get_tags(entry)
        authors = self.get_authors(entry)
        created = self.get_created(entry)
        # image and links are derived from the raw (un-prepared) body
        img_url = self.get_img_url(entry, raw_body)
        links = self.get_links(raw_body, link)

        return {
            'url': link,
            'body': clean_body,
            'title': title,
            'description': clean_description,
            'tags': tags,
            'authors': authors,
            'created': created,
            'img_url': img_url,
            'links': links,
        }