Python from_any Beispiele

Programmiersprache: Python

Namespace / Paketname: newslynx.lib.url

Methode / Funktion: from_any

Beispiele auf hotexamples.com: 3

Python from_any – 3 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Funktion newslynx.lib.url.from_any, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

Datei: article.py Projekt: abelsonlive/newslynx-core

def extract(source_url, **kw):
    """
    Extract metadata, body and outbound links for the page at `source_url`.

    Method is as follows:
    1. Get html from url.
    2. Look up the canonical URL in the page's meta tags.
    3. If none is found, prepare the source url instead.
    4. Extract meta tags.
    5. If the url is a video, embed it and return early.
    6. If embedly is active, use it for content extraction.
    7. If embedly doesn't return content or is not active, use readability.
    8. If readability doesn't return content, use the article tag.
    9. If authors aren't detected from meta tags, detect them in the
       raw article html.
    10. Collect links from the body and from the raw article html.

    :param source_url: url of the page to fetch.
    :param kw: optional keyword arguments; only ``type`` is read
        (defaults to ``'article'``).
    :return: ``None`` if no html could be fetched, otherwise a dict with the
        keys ``url``, ``domain``, ``title``, ``description``, ``img_url``,
        ``created``, ``favicon``, ``site_name``, ``authors``, ``type`` and
        ``body``. A ``links`` key is added only when the type is
        ``'article'`` and the url is not a video.
    """
    # named `content_type` so the builtin `type` is not shadowed
    content_type = kw.get('type', 'article')

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = make_soup(page_html)

    # get canonical url, falling back to a prepared version of the source
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'authors': author.extract(soup),
        'type': content_type,
        'body': None
    }

    # embed videos; no body/link extraction happens for them
    if url.is_video(canonical_url):
        data['type'] = 'video'
        data['body'] = embed.video(canonical_url)
        return data

    # only articles get body / author / link extraction
    if data['type'] != 'article':
        return data

    # extract article body: embedly first (when enabled), then readability
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)
    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag; raw_html is reused below for
    # author and link detection even when the body came from elsewhere
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            site_name = data['site_name'].upper()
            data['authors'] = [
                a.replace(site_name, "").strip()
                for a in data['authors']
            ]

    # get links from raw_html + content, skipping self-references
    links = [u for u in url.from_any(data['body']) if source_url not in u]
    for u in url.from_any(raw_html, source=source_url):
        # The previous test `u != source_url or not u.startswith(source_url)`
        # reduced to `u != source_url`, so links such as
        # `source_url + '#comments'` slipped through. `startswith` alone
        # covers both the exact match and the prefixed case.
        if u not in links and not u.startswith(source_url):
            links.append(u)

    # split out internal / external links / article links
    data['links'] = links

    return data

Beispiel #2

Datei anzeigen

Datei: test_url.py Projekt: abelsonlive/newslynx-core

 def test_from_any_html(self):
     """An href embedded in surrounding junk text is found by url.from_any."""
     html_snippet = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
     extracted = url.from_any(html_snippet)
     assert 'http://www.nytimes.com' in extracted

Beispiel #3

Datei anzeigen

Datei: test_url.py Projekt: newslynx/newslynx-core

 def test_from_any_html(self):
     """url.from_any should pick the anchor's href out of noisy markup."""
     expected_link = 'http://www.nytimes.com'
     noisy_markup = 'fds;lfjdlskafjsldak <a href="http://www.nytimes.com"></a> asdlifjkasdlkfj '
     assert expected_link in url.from_any(noisy_markup)