Example no. 1
0
def sitemap_tree_for_homepage(homepage_url: str) -> AbstractSitemap:
    """Using a homepage URL, fetch the tree of sitemaps and its stories.

    :param homepage_url: HTTP(s) URL of a site's homepage (a non-homepage URL
        is trimmed down to the site root first).
    :return: Root of the fetched sitemap tree.
    :raises McSitemapsException: if the URL is not HTTP(s), or cannot be
        normalized, parsed, or reduced to a homepage URL.
    """

    if not is_http_url(homepage_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    try:
        url = normalize_url(homepage_url)
    except Exception as ex:
        # Chain the cause (PEP 3134) so the original traceback isn't lost.
        raise McSitemapsException("Unable to normalize URL {}: {}".format(homepage_url, ex)) from ex

    try:
        uri = furl(url)
    except Exception as ex:
        raise McSitemapsException("Unable to parse URL {}: {}".format(url, ex)) from ex

    if not is_homepage_url(homepage_url):
        try:
            # Strip path / query / fragment to reduce the URL to the site root.
            uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
            log.warning("Assuming that the homepage of {} is {}".format(homepage_url, uri.url))
        except Exception as ex:
            raise McSitemapsException(
                "Unable to determine homepage URL for URL {}: {}".format(homepage_url, ex)) from ex

    # Sitemap discovery starts from the site's robots.txt.
    uri.path = '/robots.txt'
    robots_txt_url = str(uri.url)

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, recursion_level=0)
    sitemap_tree = robots_txt_fetcher.sitemap()
    return sitemap_tree
Example no. 2
0
    def guid_if_valid(self) -> Optional[str]:
        """
        Return item GUID (unique identifier) if it appears to actually be unique.

        Some GUIDs are not in fact unique (e.g. they are empty, or are set to the
        site's homepage URL). Return the GUID if it looks valid, None otherwise.
        """

        guid = self.guid()

        # An empty / missing GUID, or one that is just a homepage URL, cannot
        # serve as a unique identifier.
        if not guid or is_homepage_url(guid):
            return None

        return guid
Example no. 3
0
def test_is_homepage_url():
    """Check is_homepage_url() against known homepage and non-homepage URLs."""

    # Bad input
    # noinspection PyTypeChecker
    assert not mc_url.is_homepage_url(None)
    assert not mc_url.is_homepage_url('')

    # No scheme
    assert not mc_url.is_homepage_url('abc')

    # URLs that should be recognized as homepages, including trailing "///"
    # (technically a server is not required to normalize it to "/", but most
    # do anyway) and "smart" section homepages such as "/en/" or "/news/".
    homepage_urls = [
        'http://www.wired.com',
        'http://www.wired.com/',
        'http://m.wired.com/#abc',
        'http://www.wired.com///',
        'http://m.wired.com///',
        'http://www.latimes.com/entertainment/',
        'http://www.scidev.net/global/',
        'http://abcnews.go.com/US',
        'http://www.example.com/news/',
        'http://www.france24.com/en/',
        'http://www.france24.com/en/?altcast_code=0adb03a8a4',
        'http://www.google.com/trends/explore',
        'http://www.google.com/trends/explore#q=Ebola',
        'http://www.nytimes.com/pages/todayspaper/',
        'http://www.politico.com/playbook/',
    ]
    for homepage_url in homepage_urls:
        assert mc_url.is_homepage_url(homepage_url)

    # URLs that point at individual articles, shortened links or media files;
    # none of these should count as a homepage.
    non_homepage_urls = [
        # Article path
        'http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/',
        # DELFI article (article identifier as query parameter)
        'http://www.delfi.lt/news/daily/world/prancuzijoje-tukstanciai-pareigunu-sukuoja-apylinkes-blokuojami-'
        'keliai.d?id=66850094',
        # Bash.org quote (empty path, article identifier as query parameter)
        'http://bash.org/?244321',
        # YouTube shortened URL (path consists of letters with varying cases)
        'http://youtu.be/oKyFAMiZMbU',
        # Bit.ly shortened URL (path has a number)
        'https://bit.ly/1uSjCJp',
        # Bit.ly shortened URL (path has no number, but the host is in the URL shorteners list)
        'https://bit.ly/defghi',
        # Link to JPG
        'https://i.imgur.com/gbu5YNM.jpg',
    ]
    for non_homepage_url in non_homepage_urls:
        assert not mc_url.is_homepage_url(non_homepage_url)
Example no. 4
0
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links

    :param db: Database handler used to look up topic URL variants.
    :param url: URL to collect variants for.
    :return: De-duplicated list of variants; just [url] if it is not a valid HTTP URL.
    :raises McAllURLVariantsException: if the URL is None.

    NOTE: fetches the URL over the network (follows HTTP/HTML redirects)."""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        # Not fetchable: the URL itself is the only "variant" we can return.
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    # Keyed dict (not a list) so each kind of variant appears at most once and
    # can be filtered by key below.
    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(
                url_link_rel_canonical) > 0:
            log.debug(
                ('Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                 '(original URL: %(url)s): %(url_link_rel_canonical)s') % {
                     "url_after_redirects": url_after_redirects,
                     "url": url,
                     "url_link_rel_canonical": url_link_rel_canonical,
                 })

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {
            key: urls[key]
            for key in urls.keys() if not is_homepage_url(urls[key])
        }

    # The same URL may appear under several keys; de-duplicate via a set.
    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            x for x in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=x)
        ]

    return distinct_urls
Example no. 5
0
def test_is_homepage_url():
    """Verify homepage detection on bad input, homepage URLs and article URLs."""

    # Bad input
    # noinspection PyTypeChecker
    assert not mc_url.is_homepage_url(None)
    assert not mc_url.is_homepage_url('')

    # No scheme
    assert not mc_url.is_homepage_url('abc')

    # True positives, including "///" paths (technically a server is not
    # required to normalize "///" into "/", but most of them do anyway) and
    # smarter homepage identification ("/en/", "/news/", ...).
    good_urls = (
        'http://www.wired.com',
        'http://www.wired.com/',
        'http://m.wired.com/#abc',
        'http://www.wired.com///',
        'http://m.wired.com///',
        'http://www.latimes.com/entertainment/',
        'http://www.scidev.net/global/',
        'http://abcnews.go.com/US',
        'http://www.example.com/news/',
        'http://www.france24.com/en/',
        'http://www.france24.com/en/?altcast_code=0adb03a8a4',
        'http://www.google.com/trends/explore',
        'http://www.google.com/trends/explore#q=Ebola',
        'http://www.nytimes.com/pages/todayspaper/',
        'http://www.politico.com/playbook/',
    )
    assert all(mc_url.is_homepage_url(good_url) for good_url in good_urls)

    # False negatives: article pages, shortened URLs and media files.
    bad_urls = (
        # Article path
        'http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/',
        # DELFI article (article identifier as query parameter)
        'http://www.delfi.lt/news/daily/world/prancuzijoje-tukstanciai-pareigunu-sukuoja-apylinkes-blokuojami-'
        'keliai.d?id=66850094',
        # Bash.org quote (empty path, article identifier as query parameter)
        'http://bash.org/?244321',
        # YouTube shortened URL (path consists of letters with varying cases)
        'http://youtu.be/oKyFAMiZMbU',
        # Bit.ly shortened URL (path has a number)
        'https://bit.ly/1uSjCJp',
        # Bit.ly shortened URL (path has no number, but the host is in the URL shorteners list)
        'https://bit.ly/defghi',
        # Link to JPG
        'https://i.imgur.com/gbu5YNM.jpg',
    )
    assert not any(mc_url.is_homepage_url(bad_url) for bad_url in bad_urls)
Example no. 6
0
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links

    :param db: Database handler used to look up topic URL variants.
    :param url: URL to collect variants for.
    :return: De-duplicated list of variants; just [url] if it is not a valid HTTP URL.
    :raises McAllURLVariantsException: if the URL is None.

    NOTE: fetches the URL over the network (follows HTTP/HTML redirects)."""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        # Not fetchable: the URL itself is the only "variant" we can return.
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    # Keyed dict (not a list) so each kind of variant appears at most once and
    # can be filtered by key below.
    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    # The same URL may appear under several keys; de-duplicate via a set.
    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls