Beispiel #1
0
def _country_tld_from_url(url: str) -> Optional[str]:
    """
    Extract country TLD from URL; if the URL looks weird, don't sweat about it.

    The URL is first passed through fix_common_url_mistakes() and canonical_url(); any failure along the way is
    logged and reported as None instead of being raised.

    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
    :return: Country TLD of URL without the prefix period, e.g. "uk", or None if there's no TLD.
    """
    if not url:
        return None

    url = fix_common_url_mistakes(url)

    try:
        url = canonical_url(url)
    except Exception as ex:
        log.error(f"Unable to get canonical URL from URL {url}: {ex}")
        return None

    try:
        parsed_url = urlparse(url)
    except Exception as ex:
        log.warning(f"Unable to parse URL {url}: {ex}")
        return None

    # urlparse() reports the hostname as None when the URL has no network location, so guard before splitting
    hostname = parsed_url.hostname
    if not hostname:
        log.warning(f"No hostname found in URL {url}")
        return None

    # A fully-qualified hostname may end with a period ("example.com."); strip it so that the last label is the TLD
    # and not an empty string
    hostname_parts = hostname.rstrip('.').split('.')

    if len(hostname_parts) < 2:
        log.warning(f"No TLD found in URL {url}")
        return None

    return hostname_parts[-1].lower()
Beispiel #2
0
def test_canonical_url():
    """canonical_url() must reject unusable input and return the expected canonical string for a valid URL."""
    expected_error = mc_url.McCanonicalURLException

    # Bad input: unset URL, then empty URL
    for bad_input in (None, ''):
        with pytest.raises(expected_error):
            # noinspection PyTypeChecker
            mc_url.canonical_url(bad_input)

    # Invalid URL (the host part is a percent-encoded sentence)
    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
                 'doubts%20about%20safety%20of%20live%20entertainment')
    with pytest.raises(expected_error):
        mc_url.canonical_url(funky_url)

    # Deliberately compared as plain strings instead of via urls_are_equal()
    canonical = mc_url.canonical_url('HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244')
    assert canonical == 'http://cyber.law.harvard.edu/node/9244'
Beispiel #3
0
def test_canonical_url():
    """Exercise canonical_url() with unset, empty and malformed URLs, then with one valid URL."""
    rejected_inputs = (
        # Bad input
        None,
        '',
        # Invalid URL
        ('http://Las%20Vegas%20mass%20shooting%20raises%20new%20'
         'doubts%20about%20safety%20of%20live%20entertainment'),
    )

    for rejected_input in rejected_inputs:
        with pytest.raises(mc_url.McCanonicalURLException):
            # noinspection PyTypeChecker
            mc_url.canonical_url(rejected_input)

    # String comparison on purpose; urls_are_equal() is not used here
    input_url = 'HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244'
    expected_url = 'http://cyber.law.harvard.edu/node/9244'
    assert mc_url.canonical_url(input_url) == expected_url
Beispiel #4
0
def _get_url_stats(url: str, config: Optional[FacebookConfig] = None) -> FacebookURLStats:
    """
    Get Facebook statistics for an URL.

    Return URL stats on success, throw an exception on failure; this function never returns None.

    :param url: URL to fetch the stats for.
    :param config: (optional) Facebook configuration object; a default FacebookConfig() is created if not set.
    :return: FacebookURLStats object with at least one of the share / comment / reaction counts set.
    :raises McFacebookInvalidURLException: URL is unset, not HTTP(s), can't be canonicalized, matches one of the
        patterns known not to work, was rejected by the API for URL-specific reasons, or has no statistics.
    :raises McFacebookInvalidConfigurationException: Facebook API is not enabled in the configuration.
    :raises McFacebookErrorAPIResponseException: API call failed with an unknown exception, or API returned an
        error that isn't recognized as URL-specific.
    :raises McFacebookUnexpectedAPIResponseException: API response lacks 'id' or 'engagement', or is for a different
        URL than the one requested.
    :raises McFacebookException: any known exception raised by _api_request() is logged and passed through as-is.
    """
    url = decode_object_from_bytes_if_needed(url)

    if not url:
        # Treat unset URLs as a soft failure
        raise McFacebookInvalidURLException(url=url, error_message="URL is not set.")

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        log.error(f"URL is not HTTP(s): {url}")
        raise McFacebookInvalidURLException(url=url, error_message="URL is not HTTP(s).")

    try:
        url = canonical_url(url)
    except Exception as ex:
        # Chain the original exception so that the root cause stays visible in the traceback
        raise McFacebookInvalidURLException(url=url, error_message=f"Unable to canonicalize URL: {ex}") from ex

    for pattern in __URL_PATTERNS_WHICH_WONT_WORK:
        if re.search(pattern, url):
            raise McFacebookInvalidURLException(
                url=url,
                error_message="URL matches one of the patterns for URLs that won't work against Facebook API.",
            )

    if not config:
        config = FacebookConfig()

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    # Make API request (https://developers.facebook.com/docs/graph-api/reference/v5.0/url)
    try:
        data = _api_request(
            node='',
            params={
                'id': url,
                'fields': 'engagement',
            },
            config=config,
        )
    except McFacebookException as ex:
        # Pass the known exception back to the caller for them to deal with
        log.error(f"Unable to fetch stats for URL '{url}': {ex}")
        raise

    except Exception as ex:
        # If an unknown exception was raised while making an API call, consider it a fatal error
        raise McFacebookErrorAPIResponseException(
            f"Unknown error happened while fetching stats for URL '{url}': {ex}"
        ) from ex

    if 'error' in data:
        log.error(f"Facebook API responded with error while fetching stats for URL '{url}': {data}")

        error = data['error']
        error_type = error.get('type', 'unknown type')
        error_message = error.get('message', 'unknown message')

        if error_type == 'GraphMethodException' and 'Unsupported get request' in error_message:
            # Non-fatal permissions error for this specific URL
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and error_message == 'An unknown error has occurred.':
            # some urls consistently return this error.  true permissions errors don't return 'unknown error' message.
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        elif error_type == 'OAuthException' and 'facebook.com' in error_message:
            # facebook urls require permissions we don't have
            raise McFacebookInvalidURLException(url=url, error_message=error_message)
        else:
            # Everything else is considered a fatal error by us as we don't know what exactly happened
            raise McFacebookErrorAPIResponseException(
                f"Error response while fetching stats for URL '{url}': {error_type} {error_message}"
            )

    response_url = data.get('id', None)
    if response_url is None:
        # Facebook API is expected to always return URL that we got the stats for
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'id' key",
        )

    response_url = str(response_url)

    # Facebook API returns a numeric ID for a URL that's a Facebook page
    if not response_url.isdigit():

        # Verify that we got stats for the right URL
        # FIXME for whatever reason 'url' does get un-canonicalized at this point
        if response_url != url and canonical_url(response_url) != canonical_url(url):
            raise McFacebookUnexpectedAPIResponseException(
                response=data,
                error_message=f"Response URL ({response_url}) is not the same as request URL ({url})",
            )

    engagement = data.get('engagement', None)
    if engagement is None:
        # We expect 'engagement' to be at least set to an empty dict
        raise McFacebookUnexpectedAPIResponseException(
            response=data,
            error_message="Response doesn't have 'engagement' key",
        )

    # While 'engagement' is expected to always be set, all URL stats are not required to be present because Facebook
    # might not have ever seen this URL before
    stats = FacebookURLStats(
        share_count=engagement.get('share_count', None),
        comment_count=engagement.get('comment_count', None),
        reaction_count=engagement.get('reaction_count', None),
    )

    # If none of the stats are set, report it as a soft (per-URL) failure instead of returning an empty result
    if stats.share_count is None and stats.comment_count is None and stats.reaction_count is None:
        raise McFacebookInvalidURLException(url=url, error_message="No statistics were returned for URL.")

    log.debug(f"Facebook statistics for URL '{url}': {stats}")

    return stats