Example #1
def sitemap_tree_for_homepage(homepage_url: str) -> AbstractSitemap:
    """Using a homepage URL, fetch the tree of sitemaps and its stories."""

    if not is_http_url(homepage_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    try:
        url = normalize_url(homepage_url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(homepage_url, ex))

    try:
        uri = furl(url)
    except Exception as ex:
        raise McSitemapsException("Unable to parse URL {}: {}".format(url, ex))

    if not is_homepage_url(homepage_url):
        try:
            uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
            log.warning("Assuming that the homepage of {} is {}".format(homepage_url, uri.url))
        except Exception as ex:
            raise McSitemapsException("Unable to determine homepage URL for URL {}: {}".format(homepage_url, ex))

    uri.path = '/robots.txt'
    robots_txt_url = str(uri.url)

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, recursion_level=0)
    sitemap_tree = robots_txt_fetcher.sitemap()
    return sitemap_tree
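
For reference, a minimal usage sketch of this entry point follows; the all_pages() call is an assumption about the AbstractSitemap interface rather than something confirmed by the code above:

# Hedged usage sketch: all_pages() is assumed, not shown in the code above.
tree = sitemap_tree_for_homepage('https://www.example.com/')
for sitemap_page in tree.all_pages():
    print(sitemap_page.url)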
Example #2
    def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):

        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise McSitemapsException("Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url))

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))

        try:
            url = normalize_url(url)
        except Exception as ex:
            raise McSitemapsException("Unable to normalize URL {}: {}".format(url, ex))

        if not ua:
            ua = sitemap_useragent()

        self._url = url
        self._ua = ua
        self._recursion_level = recursion_level
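
A short usage sketch, based on Example #1: the fetcher is seeded at recursion level 0 and sitemap() starts the (possibly recursive) fetch. The URL here is illustrative:

# Sketch following Example #1; nested sitemap indexes increase the recursion
# level internally until __MAX_RECURSION_LEVEL raises McSitemapsException.
fetcher = SitemapFetcher(url='https://www.example.com/robots.txt', recursion_level=0)
sitemap_tree = fetcher.sitemap()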
Example #3
def test_normalize_url():
    # Bad URLs
    with pytest.raises(mc_url.McNormalizeURLException):
        # noinspection PyTypeChecker
        mc_url.normalize_url(None)
    with pytest.raises(mc_url.McNormalizeURLException):
        mc_url.normalize_url('gopher://gopher.floodgap.com/0/v2/vstat')

    # Basic
    # (No urls_are_equal() because we want to compare them as strings here)
    assert mc_url.normalize_url('HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244') == 'http://cyber.law.harvard.edu/node/9244'
    assert mc_url.normalize_url(
        'HTTP://WWW.GOCRICKET.COM/news/sourav-ganguly/Sourav-Ganguly-exclusive-MS-Dhoni-must-reinvent-himself'
        '-to-survive/articleshow_sg/40421328.cms?utm_source=facebook.com&utm_medium=referral'
    ) == 'http://www.gocricket.com/news/sourav-ganguly/Sourav-Ganguly-exclusive-MS-Dhoni-must-reinvent-himself-to-' \
         'survive/articleshow_sg/40421328.cms'

    # Multiple fragments
    assert mc_url.normalize_url(
        'HTTP://CYBER.LAW.HARVARD.EDU/node/9244#foo#bar'
    ) == 'http://cyber.law.harvard.edu/node/9244'

    # URL in query
    assert mc_url.normalize_url('http://bash.org/?244321') == 'http://bash.org/?244321'

    # Broken URL
    assert mc_url.normalize_url('http://http://www.al-monitor.com/pulse') == 'http://www.al-monitor.com/pulse'

    # Empty parameter
    assert mc_url.normalize_url(
        'http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html?=_r%3D6'
    ) == 'http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html'

    # Remove whitespace
    assert mc_url.normalize_url(
        '  http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html  '
    ) == 'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'
    assert mc_url.normalize_url(
        "\t\thttp://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html\t\t"
    ) == 'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'

    # NYTimes
    assert mc_url.normalize_url(
        'http://boss.blogs.nytimes.com/2014/08/19/why-i-do-all-of-my-recruiting-through-linkedin/'
        '?smid=fb-nytimes&WT.z_sma=BU_WID_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000'
        '&bicmet=1420088400000&_'
    ) == 'http://boss.blogs.nytimes.com/2014/08/19/why-i-do-all-of-my-recruiting-through-linkedin/'
    assert mc_url.normalize_url(
        'http://www.nytimes.com/2014/08/19/upshot/inequality-and-web-search-trends.html?smid=fb-nytimes&'
        'WT.z_sma=UP_IOA_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000&bicmet=1420088400000&_r=1&'
        'abt=0002&abg=1'
    ) == 'http://www.nytimes.com/2014/08/19/upshot/inequality-and-web-search-trends.html'
    assert mc_url.normalize_url(
        'http://www.nytimes.com/2014/08/20/upshot/data-on-transfer-of-military-gear-to-police-departments.html'
        '?smid=fb-nytimes&WT.z_sma=UP_DOT_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000&'
        'bicmet=1420088400000&_r=1&abt=0002&abg=1'
    ) == 'http://www.nytimes.com/2014/08/20/upshot/data-on-transfer-of-military-gear-to-police-departments.html'

    # Facebook
    assert mc_url.normalize_url(
        'https://www.facebook.com/BerkmanCenter?ref=br_tf') == 'https://www.facebook.com/BerkmanCenter'

    # LiveJournal
    assert mc_url.normalize_url(
        'http://zyalt.livejournal.com/1178735.html?thread=396696687#t396696687'
    ) == 'http://zyalt.livejournal.com/1178735.html'

    # "nk" parameter
    assert mc_url.normalize_url(
        'http://www.adelaidenow.com.au/news/south-australia/sa-court-told-prominent-adelaide-businessman-yasser'
        '-shahin-was-assaulted-by-police-officer-norman-hoy-in-september-2010-traffic-stop/story-fni6uo1m-'
        '1227184460050?nk=440cd48fd95a4e1f1c23bcd15df36da7'
    ) == ('http://www.adelaidenow.com.au/news/south-australia/sa-court-told-prominent-adelaide-businessman-yasser-'
          'shahin-was-assaulted-by-police-officer-norman-hoy-in-september-2010-traffic-stop/story-fni6uo1m-'
          '1227184460050')
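
To make the expected behavior above concrete, here is a simplified, hypothetical sketch of the normalization steps the test exercises (lowercasing, default-port removal, fragment stripping, tracking-parameter removal). It is not the Media Cloud implementation, which also repairs malformed URLs and applies per-site rules such as dropping the "nk" parameter:

from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

# Illustrative subset of tracking parameters; the real list is much longer.
TRACKING_PARAMS = {'utm_source', 'utm_medium', 'smid', 'ref'}

def normalize_url_sketch(url: str) -> str:
    """Simplified stand-in for normalize_url(); lossy for bare queries like '?244321'."""
    # urlsplit() already lowercases the scheme and splits off the #fragment.
    scheme, netloc, path, query, _fragment = urlsplit(url.strip())
    netloc = netloc.lower()
    # Drop the scheme's default port.
    default_port = ':80' if scheme == 'http' else ':443'
    if netloc.endswith(default_port):
        netloc = netloc[:-len(default_port)]
    # Keep only non-tracking query parameters; discard the fragment entirely.
    params = [(k, v) for k, v in parse_qsl(query) if k not in TRACKING_PARAMS]
    return urlunsplit((scheme, netloc, path, urlencode(params), ''))

assert normalize_url_sketch(
    '  HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244#foo#bar  '
) == 'http://cyber.law.harvard.edu/node/9244'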
Example #4
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls
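
A hedged usage sketch; db stands for an already-connected DatabaseHandler and the URL is illustrative:

# Original, redirected, normalized, canonical and topic-merged variants come
# back as a single de-duplicated list.
variants = all_url_variants(db=db, url='http://www.example.com/story.html?utm_source=feed')
for variant in variants:
    print(variant)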
Example #5
        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # Required
            url = html_unescape_strip(self.url)
            if not url:
                log.error("URL is unset")
                return None

            try:
                url = normalize_url(url)
            except Exception as ex:
                log.error("Unable to normalize URL {}: {}".format(url, ex))
                return None

            last_modified = html_unescape_strip(self.last_modified)
            if last_modified:
                last_modified = parse_sitemap_publication_date(last_modified)

            change_frequency = html_unescape_strip(self.change_frequency)
            if change_frequency:
                change_frequency = SitemapPageChangeFrequency(change_frequency.lower())
                assert isinstance(change_frequency, SitemapPageChangeFrequency)

            priority = html_unescape_strip(self.priority)
            if priority:
                priority = Decimal(priority)

                comp_zero = priority.compare(Decimal('0.0'))
                comp_one = priority.compare(Decimal('1.0'))
                if comp_zero in (Decimal('0'), Decimal('1')) and comp_one in (Decimal('0'), Decimal('-1')):
                    # 0 <= priority <= 1
                    pass
                else:
                    log.warning("Priority is not within 0 and 1: {}".format(priority))
                    priority = SITEMAP_PAGE_DEFAULT_PRIORITY

            else:
                priority = SITEMAP_PAGE_DEFAULT_PRIORITY

            news_title = html_unescape_strip(self.news_title)

            news_publish_date = html_unescape_strip(self.news_publish_date)
            if news_publish_date:
                news_publish_date = parse_sitemap_publication_date(date_string=news_publish_date)

            news_publication_name = html_unescape_strip(self.news_publication_name)
            news_publication_language = html_unescape_strip(self.news_publication_language)
            news_access = html_unescape_strip(self.news_access)

            news_genres = html_unescape_strip(self.news_genres)
            if news_genres:
                news_genres = [x.strip() for x in news_genres.split(',')]
            else:
                news_genres = []

            news_keywords = html_unescape_strip(self.news_keywords)
            if news_keywords:
                news_keywords = [x.strip() for x in news_keywords.split(',')]
            else:
                news_keywords = []

            news_stock_tickers = html_unescape_strip(self.news_stock_tickers)
            if news_stock_tickers:
                news_stock_tickers = [x.strip() for x in news_stock_tickers.split(',')]
            else:
                news_stock_tickers = []

            sitemap_news_story = None
            if news_title and news_publish_date:
                sitemap_news_story = SitemapNewsStory(
                    title=news_title,
                    publish_date=news_publish_date,
                    publication_name=news_publication_name,
                    publication_language=news_publication_language,
                    access=news_access,
                    genres=news_genres,
                    keywords=news_keywords,
                    stock_tickers=news_stock_tickers,
                )

            return SitemapPage(
                url=url,
                last_modified=last_modified,
                change_frequency=change_frequency,
                priority=priority,
                news_story=sitemap_news_story,
            )
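
A design note on the priority check above: Decimal supports ordinary comparison operators, so the corrected compare()-based range test is equivalent to a direct chained comparison. A minimal sketch:

from decimal import Decimal

priority = Decimal('0.8')
if not (Decimal('0') <= priority <= Decimal('1')):
    # Outside the range the sitemap protocol allows; fall back to a default
    # (SITEMAP_PAGE_DEFAULT_PRIORITY above; 0.5 is an assumed value).
    priority = Decimal('0.5')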
Example #6
def __normalize_media_url(output_dir: str, media_id: str, url: str, queue: multiprocessing.Queue) -> None:
    if is_http_url(url):
        normalized_url = normalize_url(url)
        output_file = os.path.join(output_dir, media_id)
        queue.put((output_file, normalized_url,))
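
Finally, a sketch of how this worker might be driven; the paths and IDs are illustrative, and the caller is assumed to drain the queue:

import multiprocessing

queue = multiprocessing.Queue()
__normalize_media_url(output_dir='/tmp/normalized', media_id='42',
                      url='http://www.example.com/', queue=queue)
output_file, normalized_url = queue.get()  # -> ('/tmp/normalized/42', <normalized URL>)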