Example #1
def prepare(url,
            source=None,
            canonicalize=True,
            expand=True,
            keep_params=('id', 'p', 'v')):
    """
    Operations that unshorten a url, reconcile embeds,
    resolves redirects, strip parameters (with optional
    ones to keep), and then attempts to canonicalize the url
    by checking the page source's metadata.

    All urls that enter `merlynne` are first treated with this function.
    """
    # reconcile embeds:
    url = reconcile_embed(url)

    # check for redirects / non-absolute urls
    if source:
        source_domain = get_domain(source)

        # reconcile redirects
        url = redirect_back(url, source_domain)

        # if the source domain appears in the url, attempt to absolutify it
        if source_domain in url:

            # check for non-absolute urls
            if not is_abs(url):
                url = urljoin(source, url)

    # check short urls
    if expand:
        if is_shortened(url):
            url = unshorten(url, attempts=1)

    # canonicalize
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = BeautifulSoup(page_html, 'html.parser')
            canonical = meta.canonical_url(soup)
            if canonical:
                return canonical

    # if it got converted to None, return
    if not url:
        return None

    # remove query arguments, keeping any whitelisted parameters.
    url = remove_args(url, keep_params)

    # remove index.html
    url = re_index_html.sub('', url)

    # always remove trailing slash
    if url.endswith('/'):
        url = url[:-1]
    return url
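
The `remove_args` helper that `prepare` relies on is not shown in these examples. Here is a minimal sketch of how it might strip query arguments while honoring `keep_params`, using only the standard library (the exact signature and the decision to preserve the fragment are assumptions):

from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

def remove_args(url, keep_params=()):
    # split the url into scheme, netloc, path, query and fragment
    scheme, netloc, path, query, fragment = urlsplit(url)
    # keep only the whitelisted query parameters, preserving their order
    kept = [(k, v) for k, v in parse_qsl(query) if k in keep_params]
    # rebuild the url without the dropped parameters
    return urlunsplit((scheme, netloc, path, urlencode(kept), fragment))

For example, remove_args('http://example.com/a?id=1&utm_source=x', ('id',)) returns 'http://example.com/a?id=1'.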
Example #2
def _bypass_bitly_warning(url):
    """
    Sometimes bitly blocks unshorten attempts; this bypasses that.
    """
    html_string = network.get(url)
    if not html_string:
        # the fetch failed; fall back to the original url
        return url
    soup = BeautifulSoup(html_string, 'html.parser')
    a = soup.find('a', {'id': 'clickthrough'})
    if a:
        return a.attrs.get('href')
    return url
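
`unshorten` itself does not appear in these examples. A minimal sketch, assuming `requests` and the bypass above, that resolves redirects and retries up to `attempts` times (the retry loop and the bitly domain check are assumptions):

import requests

def unshorten(url, attempts=1):
    for _ in range(attempts):
        try:
            # follow redirects to the final location; HEAD keeps the request cheap
            resp = requests.head(url, allow_redirects=True, timeout=10)
            resolved = resp.url
        except requests.RequestException:
            break
        # bitly sometimes interposes a warning page instead of redirecting
        if 'bit.ly' in resolved or 'bitly.com' in resolved:
            resolved = _bypass_bitly_warning(resolved)
        if resolved == url:
            break
        url = resolved
    return url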
Example #3
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesnt return content or is not active, use readability
    7. If readability doesnt return content, use article tag.
    8. If authors aren't detcted from meta tags, detect them in article body.
    """

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html, 'html.parser')

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)
    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            data['authors'] = [
                a.replace(data['site_name'].upper(), "").strip()
                for a in data['authors']
            ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
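
`body_via_article_tag` is referenced but not defined here. A minimal sketch with BeautifulSoup, where the (body, raw_html) return shape is inferred from the call site and everything else is an assumption:

def body_via_article_tag(soup, url):
    # look for an html5 <article> element in the parsed page
    article = soup.find('article')
    if article is None:
        return None, None
    # hand back both the plain text and the raw markup
    return article.get_text(separator=' ', strip=True), str(article)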
Example #4
def fetch(self, params):
    # hit the endpoint and bail early if the request failed
    text = network.get(self.endpoint, **params)
    if not text:
        return None
    # the endpoint returns JSONP, so unwrap it before returning
    return self._parse_jsonp(text)
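
`_parse_jsonp` is not shown. A minimal sketch that strips a JSONP wrapper such as callback({...}); and decodes the payload (the wrapper format is an assumption):

import json
import re

def _parse_jsonp(self, text):
    # capture the payload inside callback_name( ... );
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.DOTALL)
    if not match:
        return None
    return json.loads(match.group(1))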
Example #5
def prepare(url, source=None, canonicalize=True, expand=True, keep_params=KEEP_PARAMS):
    """
    Operations that unshorten a url, reconcile embeds,
    resolves redirects, strip parameters (with optional
    ones to keep), and then attempts to canonicalize the url
    by checking the page source's metadata.

    All urls that enter `merlynne` are first treated with this function.
    """
    if not url:
        return None

    # encode.
    url = url.encode('utf-8', errors='ignore')

    # reconcile embeds:
    url = reconcile_embed(url)

    # reconcile redirects
    url = redirect_back(url, source)

    # check for non-absolute urls.
    if source:
        source_domain = get_domain(source)

        # if the source domain appears in the url, attempt to absolutify it
        if source_domain in url:

            # check for non-absolute urls
            if not is_abs(url):
                url = urljoin(source, url)

    # check for missing scheme
    if not get_scheme(url):
        url = "http://" + url

    # check short urls
    if expand:
        if is_shortened(url):
            url = unshorten(url, attempts=1)

    # canonicalize
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = make_soup(page_html)
            _url = meta.canonical_url(soup)
            if _url:
                url = _url

    # if it got converted to None, return
    if not url:
        return None

    # remove query arguments, keeping any whitelisted parameters.
    url = remove_args(url, keep_params)

    # remove index.html
    url = re_index_html.sub('', url)

    # always remove trailing slash
    if url.endswith('/'):
        url = url[:-1]
    return url
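
`get_scheme` and `is_abs` are small url predicates that these examples assume. A minimal sketch using the Python 3 standard library names (the originals likely live in the same `url` module and may target Python 2's urlparse):

from urllib.parse import urlsplit

def get_scheme(url):
    # return the scheme ('http', 'https', ...) or '' if there is none
    return urlsplit(url).scheme

def is_abs(url):
    # an absolute url carries a network location, e.g. 'example.com'
    return bool(urlsplit(url).netloc)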
Example #6
def extract(source_url):
    """
    Article extraction. Method is as follows:
    1. Get html from url.
    2. Canonicalize URL.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesnt return content or is not active, use readability
    7. If readability doesnt return content, use article tag.
    8. If authors aren't detcted from meta tags, detect them in article body.
    """

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = BeautifulSoup(page_html, 'html.parser')

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(source_url,
                                    source=source_url,
                                    canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'page_type': meta.page_type(soup, canonical_url),
        'authors': author.extract(soup),
        'body': None
    }

    # extract body from embedly + readability
    if settings.EMBEDLY_ENABLED:
        data['body'] = body_via_embedly(canonical_url)

    if not data['body']:
        data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not data['authors'] and raw_html:
        data['authors'] = author.extract(raw_html, tags=author.OPTIMISTIC_TAGS)

        # remove site name from authors
        if data.get('site_name'):
            data['authors'] = [
                a.replace(data['site_name'].upper(), "").strip()
                for a in data['authors']
            ]

    # # get links from raw_html + content
    # links = [u for u in url.from_any(data['body']) if source_url not in u]
    # for u in url.from_any(raw_html, source=source_url):
    #     if u not in links and (u != source_url or not u.startswith(source_url)):
    #         links.append(u)

    # # split out internal / external links / article links
    # data['links'] = url.categorize_links(links, domain)

    return data
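
`body_via_readability` is not defined in these examples. A minimal sketch assuming the readability-lxml package; the original helper's cleaning and url handling are unknown:

from readability import Document

def body_via_readability(page_html, url):
    # let readability isolate the main article markup
    summary = Document(page_html).summary()
    return summary or None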