Esempio n. 1
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise ex

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Esempio n. 2
0
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Esempio n. 3
0
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(
                url_link_rel_canonical) > 0:
            log.debug(
                ('Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                 '(original URL: %(url)s): %(url_link_rel_canonical)s') % {
                     "url_after_redirects": url_after_redirects,
                     "url": url,
                     "url_link_rel_canonical": url_link_rel_canonical,
                 })

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {
            key: urls[key]
            for key in urls.keys() if not is_homepage_url(urls[key])
        }

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            x for x in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=x)
        ]

    return distinct_urls
Esempio n. 4
0
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls