def extract(source_url, **kw):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use article tag.
    8. If authors aren't detected from meta tags, detect them in article body.

    :param source_url: URL of the page to extract from.
    :param kw: optional keyword args; 'type' (default 'article') sets the
        asset type recorded on the result.
    :return: dict of extracted fields ('url', 'domain', 'title', 'body',
        'authors', 'links', ...), or None if the page could not be fetched.
    """
    # 'type' is the public kwarg name, but avoid shadowing the builtin locally.
    asset_type = kw.get('type', 'article')

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = make_soup(page_html)

    # get canonical url; fall back to a prepared (but non-canonicalized)
    # version of the source url when the page declares none.
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'authors': author.extract(soup),
        'type': asset_type,
        'body': None
    }

    # embed videos: short-circuit — no article-body extraction for videos.
    if url.is_video(canonical_url):
        data['type'] = 'video'
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body (embedly first when enabled, then readability)
    if data['type'] == 'article':
        if settings.EMBEDLY_ENABLED:
            data['body'] = body_via_embedly(canonical_url)
        if not data['body']:
            data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag as the last fallback
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html when meta tags yielded none
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(
            raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # get links from content, excluding links back to the source page itself
    links = [u for u in url.from_any(data['body']) if source_url not in u]

    # add links from raw_html. BUGFIX: the original condition used
    # `u != source_url or not u.startswith(source_url)`, which reduces to
    # `u != source_url` and leaves the startswith check unreachable; `and`
    # matches the intent (mirrors the substring filter above).
    for u in url.from_any(raw_html, source=source_url):
        if u not in links and u != source_url \
                and not u.startswith(source_url):
            links.append(u)

    # split out internal / external links / article links
    data['links'] = links

    return data
def test_from_any_html(self):
    # NOTE(review): a second definition with this exact name appears later
    # in the file and shadows this one at class-creation time — presumably
    # an accidental duplicate; confirm and remove one.
    html = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
    extracted = url.from_any(html)
    assert 'http://www.nytimes.com' in extracted
def test_from_any_html(self):
    # url.from_any should pull the href out of an <a> tag embedded in noise.
    snippet = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
    found = url.from_any(snippet)
    assert 'http://www.nytimes.com' in found