Example No. 1
def from_html(htmlstring, source=None):
    """
    Extract all img urls from an html string
    """
    if not htmlstring:
        return []
    soup = make_soup(htmlstring)
    out_imgs = []

    for tag, attr in IMG_TAGS:

        for el in soup.find_all(tag):

            img_url = el.attrs.get(attr)
            if not img_url:
                continue

            # protocol-relative embed urls need a scheme.
            if img_url.startswith("//"):
                img_url = "http:{}".format(img_url)

            # only take images with known formats
            fmt = url.is_image(img_url)
            if not fmt:
                continue

            # absolutify images if we know their source.
            if img_url.startswith("/") or not img_url.startswith("http"):
                if source:
                    img_url = urljoin(source, img_url)
                else:
                    continue

            out_imgs.append(img_url)
    return uniq(out_imgs)
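
For context, a minimal standalone sketch of the same idea using only BeautifulSoup and the standard library. `make_soup`, `IMG_TAGS`, `url.is_image`, and `uniq` from the example above are replaced with inline stand-ins, so the helper names and the extension whitelist here are assumptions rather than the project's actual implementation (Python 3):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')  # assumed whitelist

def extract_img_urls(htmlstring, source=None):
    """Collect absolute image urls from an html string."""
    if not htmlstring:
        return []
    soup = BeautifulSoup(htmlstring, 'html.parser')
    out = []
    for el in soup.find_all('img'):
        src = el.attrs.get('src')
        if not src:
            continue
        # protocol-relative embed urls need a scheme.
        if src.startswith('//'):
            src = 'http:' + src
        # only keep urls with a known image extension.
        if not src.lower().split('?')[0].endswith(IMAGE_EXTENSIONS):
            continue
        # absolutify relative urls if we know their source.
        if not src.startswith('http'):
            if not source:
                continue
            src = urljoin(source, src)
        # order-preserving de-dup.
        if src not in out:
            out.append(src)
    return out

print(extract_img_urls('<img src="/a.png">', source='http://example.com/post'))
# -> ['http://example.com/a.png']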
Example No. 2
def make_abs(htmlstring, source_url):
    """
    Make "src" and "href" attributes absolute.
    """
    soup = make_soup(htmlstring)

    # links
    for a in soup.find_all('a'):
        href = a.attrs.get('href')
        if not href:
            continue
        # drop fragment-only links; absolutify relative ones.
        if href.startswith('#'):
            a.attrs.pop('href')
        elif href.startswith('/') or not href.startswith('http'):
            if source_url:
                a['href'] = urljoin(source_url, href)

    # images
    for img in soup.find_all('img'):
        src = img.attrs.get('src')
        if not src:
            continue
        # protocol-relative embed urls need a scheme.
        if src.startswith('//'):
            src = "http:{}".format(src)
            img['src'] = src
        if src.startswith('/') or not src.startswith('http'):
            if source_url:
                img['src'] = urljoin(source_url, src)
    return soup
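
A standalone sketch of the same rewrite, with `make_soup` replaced by a direct BeautifulSoup call; the function name and the exact handling of fragment-only links are assumptions about intent, not the project's implementation (Python 3):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def absolutify(htmlstring, source_url):
    """Rewrite relative href/src attributes against source_url."""
    soup = BeautifulSoup(htmlstring, 'html.parser')
    for tag, attr in (('a', 'href'), ('img', 'src')):
        for el in soup.find_all(tag):
            val = el.attrs.get(attr)
            if not val:
                continue
            if val.startswith('#'):
                # fragment-only links point back at the same page; drop them.
                el.attrs.pop(attr)
            elif val.startswith('//'):
                # protocol-relative embeds just need a scheme.
                el[attr] = 'http:' + val
            elif not val.startswith('http'):
                el[attr] = urljoin(source_url, val)
    return str(soup)

print(absolutify('<a href="/about">about</a>', 'http://example.com/post'))
# -> <a href="http://example.com/about">about</a>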
Example No. 3
def extract(soup, tags=PESSIMISTIC_TAGS, attrs=TAG_ATTRS, vals=TAG_VALS):
    """
    Extract author attrs from meta tags.
    Only works for English articles.
    """

    # soupify
    if not isinstance(soup, BeautifulSoup):
        soup = make_soup(soup)

    # Search popular author tags for authors

    matches = []
    _authors = []
    for tag in tags:
        for attr in attrs:
            for val in vals:
                found = soup.find_all(tag, {attr: val})
                matches.extend(found)

    for match in matches:
        content = u""

        m = match.attrs.get("content", None)
        if m:
            content = m

        else:  # any other tag: fall back to its text content.
            content = match.text or u""
        if len(content) > 0:
            _authors.extend(parse(content))

    return _format(_authors)
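
To make the triple loop concrete, a minimal standalone sketch of the same meta-tag scan. The constant values below are illustrative assumptions standing in for PESSIMISTIC_TAGS / TAG_ATTRS / TAG_VALS, and the `parse` / `_format` post-processing is reduced to simple stripping and de-duplication (Python 3):

from bs4 import BeautifulSoup

# illustrative stand-ins for the project's constants.
TAGS = ('meta',)
ATTRS = ('name', 'property', 'itemprop')
VALS = ('author', 'article:author', 'byl')

def extract_authors(htmlstring):
    """Pull candidate author strings from meta tags."""
    soup = BeautifulSoup(htmlstring, 'html.parser')
    authors = []
    for tag in TAGS:
        for attr in ATTRS:
            for val in VALS:
                for match in soup.find_all(tag, {attr: val}):
                    # meta tags carry the value in "content"; otherwise
                    # fall back to the element's text.
                    content = (match.attrs.get('content') or match.text or '').strip()
                    if content and content not in authors:
                        authors.append(content)
    return authors

print(extract_authors('<meta name="author" content="Jane Doe">'))
# -> ['Jane Doe']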
Example No. 4
def _bypass_bitly_warning(url):
    """
    Sometimes bitly blocks unshorten attempts; this bypasses that.
    """
    html_string = network.get(url)
    soup = make_soup(html_string)
    a = soup.find('a', {'id': 'clickthrough'})
    if a:
        return a.attrs.get('href')
    return url
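
A standalone sketch of the same trick with `network.get` swapped for requests; the 'clickthrough' anchor id comes from the example above, and whether bitly still serves such a warning page is not something this snippet can guarantee:

import requests
from bs4 import BeautifulSoup

def bypass_bitly_warning(url):
    """Follow the 'clickthrough' link on a bitly warning page, if present."""
    try:
        html_string = requests.get(url, timeout=10).text
    except requests.RequestException:
        return url
    soup = BeautifulSoup(html_string, 'html.parser')
    a = soup.find('a', {'id': 'clickthrough'})
    if a and a.attrs.get('href'):
        return a.attrs['href']
    return url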
Example No. 5
def body_via_article_tag(soup, source_url):
    """
    Extract content from an "article" tag.
    """
    if not isinstance(soup, BeautifulSoup):
        soup = make_soup(soup)
    articles = soup.find_all('article')
    if articles:
        raw_html = html.get_inner(articles[0])
        body = html.prepare(raw_html, source_url)
        return body, raw_html
    return None, None
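
`html.get_inner` and `html.prepare` are project helpers; a minimal standalone sketch of the first step, grabbing the inner html of the first <article> tag, could rely on BeautifulSoup's decode_contents():

from bs4 import BeautifulSoup

def inner_article_html(htmlstring):
    """Return the inner html of the first <article> tag, or None."""
    soup = BeautifulSoup(htmlstring, 'html.parser')
    article = soup.find('article')
    if article is None:
        return None
    # decode_contents() renders the tag's children without the tag itself.
    return article.decode_contents()

print(inner_article_html('<article><p>hi</p></article>'))  # -> <p>hi</p>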
Example No. 6
def from_html(htmlstring, **kw):
    """
    Extract urls from htmlstring, optionally reconciling
    relative urls + embeds + redirects and excluding image urls.
    """
    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)

    if not htmlstring:
        return []
    final_urls = []
    if source:
        source_domain = get_domain(source)
    soup = make_soup(htmlstring)
    for tag in URL_TAGS:

        for el in soup.find_all(tag):

            for attr in URL_ATTRS:
                href = el.attrs.get(attr, None)

                if not href:
                    continue
                url = reconcile_embed(href)

                if source:
                    url = redirect_back(url, source_domain)
                    if not is_abs(url):
                        url = urljoin(source, url)

                if not is_valid(url):
                    continue
                if exclude_images:
                    if not is_image(url):
                        final_urls.append(url)
                else:
                    final_urls.append(url)
    return uniq(final_urls)
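
Since the project's URL_TAGS, URL_ATTRS, reconcile_embed, redirect_back, is_valid, is_image, and uniq helpers aren't shown, here is a minimal standalone sketch of the same scan, with assumed tag/attr lists, a scheme check in place of is_valid, and an order-preserving de-dup (Python 3):

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

# assumed stand-ins for the project's URL_TAGS / URL_ATTRS constants.
URL_TAGS = ('a', 'iframe', 'embed', 'img')
URL_ATTRS = ('href', 'src')
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

def extract_urls(htmlstring, source=None, excl_img=True):
    """Collect unique, absolute urls from an html string."""
    if not htmlstring:
        return []
    soup = BeautifulSoup(htmlstring, 'html.parser')
    out = []
    for tag in URL_TAGS:
        for el in soup.find_all(tag):
            for attr in URL_ATTRS:
                href = el.attrs.get(attr)
                if not href:
                    continue
                # protocol-relative embeds.
                if href.startswith('//'):
                    href = 'http:' + href
                # absolutify against the source page.
                if source and not urlparse(href).netloc:
                    href = urljoin(source, href)
                # keep only http(s) urls.
                if urlparse(href).scheme not in ('http', 'https'):
                    continue
                # optionally drop image urls.
                if excl_img and href.lower().endswith(IMAGE_EXTENSIONS):
                    continue
                # order-preserving de-dup.
                if href not in out:
                    out.append(href)
    return out

print(extract_urls('<a href="/about">x</a><img src="/logo.png">',
                   source='http://example.com/post'))
# -> ['http://example.com/about']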
Example No. 7
def extract(source_url, **kw):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the article body.
    """
    type = kw.get('type', 'article')

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = make_soup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'authors': author.extract(soup),
        'type': type,
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['type'] = 'video'
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if data['type'] == 'article':
        if settings.EMBEDLY_ENABLED:
            data['body'] = body_via_embedly(canonical_url)
        if not data['body']:
            data['body'] = body_via_readability(page_html, canonical_url)

        # extract body from article tag
        body, raw_html = body_via_article_tag(soup, canonical_url)

        # merge body
        if not data['body']:
            data['body'] = body

        # get creators from raw article html
        if not len(data['authors']) and raw_html:
            data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

            # remove site name from authors
            if data.get('site_name'):
                data['authors'] = [
                    a.replace(data['site_name'].upper(), "").strip()
                    for a in data['authors']
                ]

        # get links from raw_html + content
        links = [u for u in url.from_any(data['body']) if source_url not in u]
        for u in url.from_any(raw_html, source=source_url):
            if u not in links and (u != source_url or not u.startswith(source_url)):
                links.append(u)

        # split out internal / external links / article links
        data['links'] = links

    return data
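
A hedged usage sketch of the pipeline; the import path below is a guess at how such a module might be exposed and is not confirmed by the example above:

# hypothetical import path; point it at wherever extract() actually lives.
from article import extract

data = extract('http://example.com/some-story')
if data:
    print(data['url'], data['title'])
    print(data['authors'])
    for link in data.get('links', []):
        print(link)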
Example No. 8
def prepare(url, source=None, canonicalize=True, expand=True, keep_params=KEEP_PARAMS):
    """
    Operations that unshorten a url, reconcile embeds,
    resolve redirects, strip parameters (with optional
    ones to keep), and then attempt to canonicalize the url
    by checking the page source's metadata.

    All urls that enter `merlynne` are first treated with this function.
    """
    if not url or url == "":
        return None

    # encode.
    url = url.encode('utf-8', errors='ignore')

    # reconcile embeds:
    url = reconcile_embed(url)

    # reconcile redirects
    url = redirect_back(url, source)

    # check for non absolute urls.
    if source:
        source_domain = get_domain(source)

        # if the domain is in the source, attempt to absolutify it
        if source_domain in url:

            # check for non-absolute urls
            if not is_abs(url):
                url = urljoin(source, url)

    # check for missing scheme
    if not get_scheme(url):
        url = "http://" + url

    # check short urls
    if expand:
        if is_shortened(url):
            url = unshorten(url, attempts=1)

    # canonicalize
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = make_soup(page_html)
            _url = meta.canonical_url(soup)
            if _url:
                url = _url

    # if it got converted to None, return
    if not url:
        return None

    # remove arguments w/ optional parameters to keep.
    url = remove_args(url, keep_params)

    # remove index.html
    url = re_index_html.sub('', url)

    # always remove trailing slash
    if url.endswith('/'):
        url = url[:-1]
    return url
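
Most of prepare() leans on project helpers, but the final cleanup steps are plain urllib/regex work. A minimal standalone sketch of that tail end follows; the KEEP_PARAMS whitelist and the index.html regex are assumptions about what the project's constants might look like (Python 3):

import re
from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

KEEP_PARAMS = ('id', 'p', 'v')  # assumed whitelist of query params to keep
re_index_html = re.compile(r'index\.html$')  # assumed pattern

def clean_url(url, keep_params=KEEP_PARAMS):
    """Strip unwanted query params, a trailing index.html, and trailing slashes."""
    scheme, netloc, path, query, _ = urlsplit(url)
    # drop all query params except the whitelist.
    kept = [(k, v) for k, v in parse_qsl(query) if k in keep_params]
    # remove a trailing index.html and any trailing slash.
    path = re_index_html.sub('', path).rstrip('/')
    return urlunsplit((scheme, netloc, path, urlencode(kept), ''))

print(clean_url('http://example.com/a/index.html?utm_source=x&p=2'))
# -> http://example.com/a?p=2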