Example #1
def _get_story_url_variants(story: dict) -> list:
    """Return a list of the unique set of the story url and guid and their normalize_url_lossy() versions."""
    urls = list({
        story['url'],
        normalize_url_lossy(story['url']), story['guid'],
        normalize_url_lossy(story['guid'])
    })

    return urls
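For illustration, a minimal sketch of the variant set this helper builds, using a stand-in normalizer (the real mediawords.util.url.normalize_url_lossy() does more, e.g. stripping "www." and fragments, as the tests below show):

def _stand_in_normalize_url_lossy(url: str) -> str:
    # stand-in for normalize_url_lossy(): lowercase the url and drop a leading "www."
    return url.lower().replace('://www.', '://')

story = {'url': 'http://WWW.Example.com/a', 'guid': 'https://example.com/?p=123'}
variants = list({
    story['url'],
    _stand_in_normalize_url_lossy(story['url']),
    story['guid'],
    _stand_in_normalize_url_lossy(story['guid']),
})
# three unique variants: the raw url, its normalized form, and the guid (already normalized)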
Example #2
def test_normalize_url_lossy():
    # FIXME - some resulting URLs look funny, not sure if I can change them easily though
    # (No urls_are_equal() because we want to compare them as strings here)
    assert mc_url.normalize_url_lossy(
        'HTTP://WWW.nytimes.COM/ARTICLE/12345/?ab=cd#def#ghi/'
    ) == 'http://nytimes.com/article/12345/?ab=cd'
    assert mc_url.normalize_url_lossy(
        'http://HTTP://WWW.nytimes.COM/ARTICLE/12345/?ab=cd#def#ghi/'
    ) == 'http://nytimes.com/article/12345/?ab=cd'
    assert mc_url.normalize_url_lossy('http://http://www.al-monitor.com/pulse') == 'http://al-monitor.com/pulse'
    assert mc_url.normalize_url_lossy('http://m.delfi.lt/foo') == 'http://delfi.lt/foo'
    assert mc_url.normalize_url_lossy(
        'http://blog.yesmeck.com/jquery-jsonview/') == 'http://yesmeck.com/jquery-jsonview/'
    assert mc_url.normalize_url_lossy('http://cdn.com.do/noticias/nacionales') == 'http://com.do/noticias/nacionales'
    assert mc_url.normalize_url_lossy('http://543.r2.ly') == 'http://543.r2.ly/'

    tests = [
        ['http://nytimes.com', 'http://nytimes.com/'],
        ['http://http://nytimes.com', 'http://nytimes.com/'],
        ['HTTP://nytimes.COM', 'http://nytimes.com/'],
        ['http://beta.foo.com/bar', 'http://foo.com/bar'],
        ['http://archive.org/bar', 'http://archive.org/bar'],
        ['http://m.archive.org/bar', 'http://archive.org/bar'],
        ['http://archive.foo.com/bar', 'http://foo.com/bar'],
        ['http://foo.com/bar#baz', 'http://foo.com/bar'],
        ['http://foo.com/bar/baz//foo', 'http://foo.com/bar/baz/foo'],
        ['https://archive.is/o/vWkgm/www.huffingtonpost.com/lisa-bloom/why-the-new-child-rape-ca_b_10619944.html',
         'http://huffingtonpost.com/lisa-bloom/why-the-new-child-rape-ca_b_10619944.html'],
        ['https://archive.is/o/m1k2A/https://en.wikipedia.org/wiki/Gamergate_controversy%23cite_note-right_wing-130',
         'http://en.wikipedia.org/wiki/gamergate_controversy#cite_note-right_wing-130']
    ]

    for test in tests:
        input_url, expected_output_url = test
        assert mc_url.normalize_url_lossy(input_url) == expected_output_url
Example #3
def test_normalize_url_lossy():
    # FIXME - some resulting URLs look funny, not sure if I can change them easily though
    # (No urls_are_equal() because we want to compare them as strings here)
    assert mc_url.normalize_url_lossy(
        'HTTP://WWW.nytimes.COM/ARTICLE/12345/?ab=cd#def#ghi/'
    ) == 'http://nytimes.com/article/12345/?ab=cd'
    assert mc_url.normalize_url_lossy(
        'http://HTTP://WWW.nytimes.COM/ARTICLE/12345/?ab=cd#def#ghi/'
    ) == 'http://nytimes.com/article/12345/?ab=cd'
    assert mc_url.normalize_url_lossy('http://http://www.al-monitor.com/pulse') == 'http://al-monitor.com/pulse'
    assert mc_url.normalize_url_lossy('http://m.delfi.lt/foo') == 'http://delfi.lt/foo'
    assert mc_url.normalize_url_lossy(
        'http://blog.yesmeck.com/jquery-jsonview/') == 'http://yesmeck.com/jquery-jsonview/'
    assert mc_url.normalize_url_lossy('http://cdn.com.do/noticias/nacionales') == 'http://com.do/noticias/nacionales'
    assert mc_url.normalize_url_lossy('http://543.r2.ly') == 'http://543.r2.ly/'

    tests = [
        ['http://nytimes.com', 'http://nytimes.com/'],
        ['http://http://nytimes.com', 'http://nytimes.com/'],
        ['HTTP://nytimes.COM', 'http://nytimes.com/'],
        ['http://beta.foo.com/bar', 'http://foo.com/bar'],
        ['http://archive.org/bar', 'http://archive.org/bar'],
        ['http://m.archive.org/bar', 'http://archive.org/bar'],
        ['http://archive.foo.com/bar', 'http://foo.com/bar'],
        ['http://foo.com/bar#baz', 'http://foo.com/bar'],
        ['http://foo.com/bar/baz//foo', 'http://foo.com/bar/baz/foo'],
    ]

    for test in tests:
        input_url, expected_output_url = test
        assert mc_url.normalize_url_lossy(input_url) == expected_output_url
Example #5
def _normalize_url(url: str) -> str:
    """Cap max length of url and run through normalize_url_lossy()."""
    normalized_url = normalize_url_lossy(url)
    if normalized_url is None:
        normalized_url = url

    return normalized_url[0:MAX_URL_LENGTH]
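A quick illustration of the capping step, with MAX_URL_LENGTH stubbed for the sketch (the real constant lives in the stories module; the FIXME comments further down suggest a 1024-character limit):

MAX_URL_LENGTH = 1024  # assumed value for this sketch only

too_long_url = 'http://example.com/' + 'a' * 2000
capped_url = too_long_url[0:MAX_URL_LENGTH]
assert len(capped_url) == MAX_URL_LENGTH  # anything longer is truncated before storage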
Example #6
def test_youtube_urls():
    nyu = normalize_youtube_url

    assert nyu('http://foo.bar') == 'http://foo.bar'
    assert nyu(
        'http://youtube.com/foo/bar') == 'https://www.youtube.com/foo/bar'
    assert nyu(
        'https://youtube.com/foo/bar') == 'https://www.youtube.com/foo/bar'
    assert nyu('https://www.youtube.com/watch?v=123456'
               ) == 'https://www.youtube.com/watch?v=123456'
    assert nyu('https://www.youtube.com/watch?v=123456&foo=bar&share=bat'
               ) == 'https://www.youtube.com/watch?v=123456'
    assert nyu('https://www.youtube.com/channel/123456'
               ) == 'https://www.youtube.com/channel/123456'
    assert nyu('https://www.youtube.com/channel/123456?foo=bar'
               ) == 'https://www.youtube.com/channel/123456'
    assert nyu('https://www.youtube.com/user/123456'
               ) == 'https://www.youtube.com/user/123456'
    assert nyu('https://www.youtube.com/user/123456?foo=bar'
               ) == 'https://www.youtube.com/user/123456'
    assert nyu('https://www.youtube.com/embed/123456?foo=bar&share=bat'
               ) == 'https://www.youtube.com/watch?v=123456'

    assert normalize_url_lossy('https://www.youtube.com/embed/123456?f=b'
                               ) == 'http://youtube.com/watch?v=123456'
Example #7
def _get_links_from_story(db: DatabaseHandler, story: dict) -> List[str]:
    """Extract and return linksk from the story.

    Generates a deduped list of links from _get_links_from_html(), _get_links_from_story_text(),
    and _get_youtube_embed_links() for the given story.

    Arguments:
    db - db handle
    story - story dict from db

    Returns:
    list of urls

    """
    try:
        extracted_html = _get_extracted_html(db, story)

        html_links = _get_links_from_html(extracted_html)
        text_links = _get_links_from_story_text(db, story)
        youtube_links = _get_youtube_embed_links(db, story)

        all_links = html_links + text_links + youtube_links

        link_lookup = {}
        for url in filter(
                lambda x: re.search(IGNORE_LINK_PATTERN, x, flags=re.I) is
                None, all_links):
            link_lookup[normalize_url_lossy(url)] = url

        links = list(link_lookup.values())

        return links
    except (McAmazonS3StoreException, McDBIDownloadsException):
        # we expect the fetch_content() to fail occasionally
        return []
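A minimal sketch of the normalize-then-dedupe step above in isolation, with stand-ins for IGNORE_LINK_PATTERN and normalize_url_lossy() (both are project-specific):

import re

IGNORE_PATTERN_SKETCH = r'twitter\.com'  # stand-in; the real IGNORE_LINK_PATTERN is much broader

def _stand_in_normalize(url: str) -> str:
    # stand-in normalizer: lowercase and drop a trailing slash
    return url.lower().rstrip('/')

all_links = [
    'http://Example.com/story/',
    'http://example.com/story',         # same normalized key as above, so only one survives
    'http://twitter.com/foo/status/1',  # matches the ignore pattern, filtered out
]

link_lookup = {}
for url in filter(lambda x: re.search(IGNORE_PATTERN_SKETCH, x, flags=re.I) is None, all_links):
    link_lookup[_stand_in_normalize(url)] = url

links = list(link_lookup.values())  # ['http://example.com/story']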
Example #8
def _ignore_link_pattern(url: Optional[str]) -> bool:
    """Return true if the url or redirect_url matches the ignore link pattern."""
    if url is None:
        return False

    p = IGNORE_LINK_PATTERN
    nu = normalize_url_lossy(url)

    return bool(re2.search(p, url, re2.I) or re2.search(p, nu, re2.I))
Example #9
def test_update_media_normalized_urls():
    """Test _update_media_normalized_urls()."""
    db = connect_to_db()

    [create_test_medium(db, str(i)) for i in range(5)]

    _update_media_normalized_urls(db)

    media = db.query("select * from media").hashes()
    for medium in media:
        expected_nu = normalize_url_lossy(medium['url'])
        assert medium['normalized_url'] == expected_nu
Example #10
def ignore_redirect(db: DatabaseHandler, url: str, redirect_url: Optional[str]) -> bool:
    """Return true if we should ignore redirects to the target media source.

    This is usually to avoid redirects to domain resellers for previously valid and important but now dead links."""
    if redirect_url is None or url == redirect_url:
        return False

    medium_url = generate_medium_url_and_name_from_url(redirect_url)[0]

    u = normalize_url_lossy(medium_url)

    match = db.query("select 1 from topic_ignore_redirects where url = %(a)s", {'a': u}).hash()

    return match is not None
Example #11
def _get_failed_url(db: DatabaseHandler, topics_id: int, url: str) -> Optional[dict]:
    """Return the links from the set without FETCH_STATE_REQUEST_FAILED or FETCH_STATE_CONTENT_MATCH_FAILED states.

    Arguments:
    db - db handle
    topic - topic dict from db
    urls - string urls

    Returns:

    a list of the topic_fetch_url dicts that do not have fetch fails
    """
    if isinstance(topics_id, bytes):
        topics_id = decode_object_from_bytes_if_needed(topics_id)

    topics_id = int(topics_id)
    url = decode_object_from_bytes_if_needed(url)

    urls = list({url, normalize_url_lossy(url)})

    # create temporary table first to make postgres do a topic_fetch_urls_url index scan followed
    # by a simple filter of those results
    db.query(
        """
        create temporary table _urls as
            select * from topic_fetch_urls where md5(url) = any(array(select md5(unnest(%(a)s))))
        """,
        {'a': urls})

    failed_url = db.query(
        """
        -- noinspection SqlResolve
        select *
            from _urls
            where
                topics_id = %(a)s and
                state in (%(b)s, %(c)s)
            limit 1
        """,
        {
            'a': topics_id,
            'b': FETCH_STATE_REQUEST_FAILED,
            'c': FETCH_STATE_CONTENT_MATCH_FAILED,
        }).hash()

    # noinspection SqlResolve
    db.query("drop table _urls")

    return failed_url
Example #12
    def get_domain(url):
        """Edit URL to make it appropriate for SimilarWeb API

        SimilarWeb only works on the netloc, with no www.

        Parameters
        ----------
        url : str
            URL to turn into a query for SimilarWeb

        Returns
        -------
            str: the domain of url
        """
        return normalize_url_lossy(urlparse(url).netloc)
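A hedged sketch of what this yields, with the normalizer stubbed (assumption: normalize_url_lossy() at minimum lowercases and drops a leading "www.", as the tests earlier in this section suggest):

from urllib.parse import urlparse

def _stand_in_normalize_netloc(netloc: str) -> str:
    netloc = netloc.lower()
    return netloc[4:] if netloc.startswith('www.') else netloc

domain = _stand_in_normalize_netloc(urlparse('https://www.NYTimes.com/section/world').netloc)
assert domain == 'nytimes.com'  # the bare, www-less host the SimilarWeb query expects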
Example #13
def insert_story_urls(db: DatabaseHandler, story: dict, url: str) -> None:
    """Insert the url and the normalize_url_lossy() version of the url into story_urls."""
    urls = (url, normalize_url_lossy(url))

    for url in set(urls):
        # wastefully query for existence of url because jumping straight into the on conflict do nothing
        # insert below sometimes results in a deadlock
        db.query(
            """
            insert into story_urls (stories_id, url)
                select %(a)s, %(b)s
                    where not exists ( select 1 from story_urls where stories_id = %(a)s and url = %(b)s )
                    on conflict (url, stories_id) do nothing
            """, {
                'a': story['stories_id'],
                'b': url
            })
Example #14
def insert_story_urls(db: DatabaseHandler, story: dict, url: str) -> None:
    """Insert the url and the normalize_url_lossy() version of the url into story_urls."""
    urls = (url, normalize_url_lossy(url))

    for url in set(urls):

        # FIXME some URLs are overly encoded, e.g.:
        #
        # http://dinamani.com/india/2020/feb/19/%E0%AE%85%E0%AE%AF%E0%AF%8B%E0
        # %AE%A4%E0%AF%8D%E0%AE%A4%E0%AE%BF%E0%AE%AF%E0%AE%BF%E0%AE%B2%E0%AF
        # %8D-%E0%AE%AA%E0%AE%BE%E0%AE%AA%E0%AE%BE%E0%AF%8D-%E0%AE%AE%E0%AE%9A
        # %E0%AF%82%E0%AE%A4%E0%AE%BF%E0%AE%AF%E0%AF%88-%E0%AE%9A%E0%AF%81%E0
        # %AE%B1%E0%AF%8D%E0%AE%B1%E0%AE%BF%E0%AE%AF%E0%AF%81%E0%AE%B3%E0%AF
        # %8D%E0%AE%B3-%E0%AE%AE%E0%AE%AF%E0%AE%BE%E0%AE%A9%E0%AE%A4%E0%AF%8D
        # %E0%AE%A4%E0%AF%88-%E0%AE%B5%E0%AE%BF%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0
        # %AF%81%E0%AE%B5%E0%AF%88%E0%AE%95%E0%AF%8D%E0%AE%95-%E0%AE%B5%E0%AF
        # %87%E0%AE%A3%E0%AF%8D%E0%AE%9F%E0%AF%81%E0%AE%AE%E0%AF%8D-%E0%AE%B0
        # %E0%AE%BE%E0%AE%AE%E0%AE%BE%E0%AF%8D-%E0%AE%95%E0%AF%8B%E0%AE%AF%E0
        # %AE%BF%E0%AE%B2%E0%AF%8D-%E0%AE%85%E0%AE%B1%E0%AE%95%E0%AF%8D%E0%AE
        # %95%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0%AE%B3%E0%AF%88%E0%AE%95%E0%AF%8D
        # %E0%AE%95%E0%AF%81-%E0%AE%AE%E0%AF%82%E0%AE%A4%E0%AF%8D%E0%AE%A4-
        # %E0%AE%B5%E0%AE%B4%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AF%81%E0%AE%B0%E0
        # %AF%88%E0%AE%9E%E0%AE%BE%E0%AF%8D-%E0%AE%95%E0%AE%9F%E0%AE%BF%E0%AE
        # %A4%E0%AE%AE%E0%AF%8D-3361308.html
        #
        # We might benefit from decoding the path in such URLs so that it fits
        # within 1024 characters, and perhaps more importantly, the
        # deduplication works better:
        #
        # http://dinamani.com/india/2020/feb/19/அயோத்தியில்-பாபா்-மசூதியை-சுற்றி
        # யுள்ள-மயானத்தை-விட்டுவைக்க-வேண்டும்-ராமா்-கோயில்-அறக்கட்டளைக்கு-மூத்த-வழ
        # க்குரைஞா்-கடிதம்-3361308.html
        if len(url) <= MAX_URL_LENGTH:

            # wastefully query for existence of url because jumping straight into the on conflict do nothing
            # insert below sometimes results in a deadlock
            db.query(
                """
                insert into story_urls (stories_id, url)
                    select %(a)s, %(b)s
                        where not exists ( select 1 from story_urls where stories_id = %(a)s and url = %(b)s )
                        on conflict (url, stories_id) do nothing
                """, {
                    'a': story['stories_id'],
                    'b': url
                })
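The FIXME above suggests decoding percent-encoded paths before the length check and deduplication; a sketch of what that might look like (not what the current code does), using urllib.parse.unquote:

from urllib.parse import unquote

# abbreviated form of the kind of URL quoted in the FIXME above
encoded_url = 'http://dinamani.com/india/2020/feb/19/%E0%AE%85%E0%AE%AF%E0%AF%8B%E0%AE%A4%E0%AF%8D%E0%AE%A4%E0%AE%BF-3361308.html'

decoded_url = unquote(encoded_url)  # each run of %XX escapes collapses into one Unicode character
assert len(decoded_url) < len(encoded_url)  # the decoded form is far more likely to fit within MAX_URL_LENGTH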
Example #15
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(
            db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query(
        "select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'],
                        {'normalized_url': normalized_url})

    db.commit()
Example #16
def check_if_is_domain_exact_match(url: str, domain: str) -> bool:
    """See if the domain would likely resolve to the same place as the url.

    Note that this currently considers query parameters important, so
    'nytimes.com?ref=twitter.com' will be different from 'nytimes.com'

    Parameters
    ----------
    url : str
        Raw url, perhaps from a media source

    domain : str
        Cleaned domain, perhaps from SimilarWebClient.get_domain

    Returns
    -------
    bool, whether the two urls are likely the same.
    """
    return normalize_url_lossy(url) == domain
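A small sketch of the caveat in the docstring, with a stand-in normalizer (assumption: normalize_url_lossy() keeps query strings, as the '?ab=cd' test earlier in this section shows):

def _stand_in_normalize(url: str) -> str:
    # stand-in normalizer: lowercase, drop the scheme and a leading "www.", keep the query string
    url = url.lower()
    for prefix in ('https://', 'http://'):
        if url.startswith(prefix):
            url = url[len(prefix):]
    return url[4:] if url.startswith('www.') else url

# query parameters survive normalization, so these do not exact-match:
assert _stand_in_normalize('http://nytimes.com?ref=twitter.com') != 'nytimes.com'
assert _stand_in_normalize('http://www.nytimes.com') == 'nytimes.com'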
Example #17
def _get_failed_url(db: DatabaseHandler, topics_id: int, url: str) -> Optional[dict]:
    """Return the links from the set without FETCH_STATE_REQUEST_FAILED or FETCH_STATE_CONTENT_MATCH_FAILED states.

    Arguments:
    db - db handle
    topic - topic dict from db
    urls - string urls

    Returns:

    a list of the topic_fetch_url dicts that do not have fetch fails
    """
    if isinstance(topics_id, bytes):
        topics_id = decode_object_from_bytes_if_needed(topics_id)

    topics_id = int(topics_id)
    url = decode_object_from_bytes_if_needed(url)

    urls = list({url, normalize_url_lossy(url)})

    failed_url = db.query(
        """
            WITH _urls AS (
                SELECT *
                FROM topic_fetch_urls
                WHERE md5(url) = any(array(select md5(unnest(%(urls)s))))
            )
            SELECT *
            FROM _urls
            WHERE topics_id = %(topics_id)s
              AND state IN (%(fetch_state_1)s, %(fetch_state_2)s)
            LIMIT 1
        """,
        {
            'urls': urls,
            'topics_id': topics_id,
            'fetch_state_1': FETCH_STATE_REQUEST_FAILED,
            'fetch_state_2': FETCH_STATE_CONTENT_MATCH_FAILED,
        }
    ).hash()

    return failed_url
Example #18
def insert_story_urls(db: DatabaseHandler, story: dict, url: str) -> None:
    """Insert the url and the normalize_url_lossy() version of the url into story_urls."""
    urls = (url, normalize_url_lossy(url))

    for url in set(urls):

        # FIXME some URLs are overly encoded, e.g.:
        #
        # http://dinamani.com/india/2020/feb/19/%E0%AE%85%E0%AE%AF%E0%AF%8B%E0
        # %AE%A4%E0%AF%8D%E0%AE%A4%E0%AE%BF%E0%AE%AF%E0%AE%BF%E0%AE%B2%E0%AF
        # %8D-%E0%AE%AA%E0%AE%BE%E0%AE%AA%E0%AE%BE%E0%AF%8D-%E0%AE%AE%E0%AE%9A
        # %E0%AF%82%E0%AE%A4%E0%AE%BF%E0%AE%AF%E0%AF%88-%E0%AE%9A%E0%AF%81%E0
        # %AE%B1%E0%AF%8D%E0%AE%B1%E0%AE%BF%E0%AE%AF%E0%AF%81%E0%AE%B3%E0%AF
        # %8D%E0%AE%B3-%E0%AE%AE%E0%AE%AF%E0%AE%BE%E0%AE%A9%E0%AE%A4%E0%AF%8D
        # %E0%AE%A4%E0%AF%88-%E0%AE%B5%E0%AE%BF%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0
        # %AF%81%E0%AE%B5%E0%AF%88%E0%AE%95%E0%AF%8D%E0%AE%95-%E0%AE%B5%E0%AF
        # %87%E0%AE%A3%E0%AF%8D%E0%AE%9F%E0%AF%81%E0%AE%AE%E0%AF%8D-%E0%AE%B0
        # %E0%AE%BE%E0%AE%AE%E0%AE%BE%E0%AF%8D-%E0%AE%95%E0%AF%8B%E0%AE%AF%E0
        # %AE%BF%E0%AE%B2%E0%AF%8D-%E0%AE%85%E0%AE%B1%E0%AE%95%E0%AF%8D%E0%AE
        # %95%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0%AE%B3%E0%AF%88%E0%AE%95%E0%AF%8D
        # %E0%AE%95%E0%AF%81-%E0%AE%AE%E0%AF%82%E0%AE%A4%E0%AF%8D%E0%AE%A4-
        # %E0%AE%B5%E0%AE%B4%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AF%81%E0%AE%B0%E0
        # %AF%88%E0%AE%9E%E0%AE%BE%E0%AF%8D-%E0%AE%95%E0%AE%9F%E0%AE%BF%E0%AE
        # %A4%E0%AE%AE%E0%AF%8D-3361308.html
        #
        # We might benefit from decoding the path in such URLs so that it fits
        # within 1024 characters, and perhaps more importantly, the
        # deduplication works better:
        #
        # http://dinamani.com/india/2020/feb/19/அயோத்தியில்-பாபா்-மசூதியை-சுற்றி
        # யுள்ள-மயானத்தை-விட்டுவைக்க-வேண்டும்-ராமா்-கோயில்-அறக்கட்டளைக்கு-மூத்த-வழ
        # க்குரைஞா்-கடிதம்-3361308.html
        if len(url) <= MAX_URL_LENGTH:
            db.query(
                """
                INSERT INTO story_urls (stories_id, url)
                VALUES (%(stories_id)s, %(url)s)
                ON CONFLICT (stories_id, url) DO NOTHING
                """, {
                    'stories_id': story['stories_id'],
                    'url': url
                })
Example #19
def test_ignore_redirect():
    db = connect_to_db()

    # redirect_url = None
    assert not ignore_redirect(db, 'http://foo.com', None)

    # url = redirect_url
    assert not ignore_redirect(db, 'http://foo.com', 'http://foo.com')

    # empty topic_ignore_redirects
    assert not ignore_redirect(db, 'http://foo.com', 'http://bar.com')

    # match topic_ignore_redirects
    redirect_url = 'http://foo.com/foo.bar'
    medium_url = generate_medium_url_and_name_from_url(redirect_url)[0]
    nu = normalize_url_lossy(medium_url)

    db.create('topic_ignore_redirects', {'url': nu})

    assert ignore_redirect(db, 'http://bar.com', redirect_url)

    # no match
    assert not ignore_redirect(db, 'http://bar.com', 'http://bat.com')
Example #20
def get_story_match(db: DatabaseHandler, url: str, redirect_url: Optional[str] = None) -> Optional[dict]:
    """Search for any story within the database that matches the given url.

    Searches for any story whose guid or url matches either the url or redirect_url or the
    mediawords.util.url.normalize_url_lossy() version of either.

    If multiple stories are found, use get_preferred_story() to decide which story to return.

    Only match the first mediawords.dbi.stories.stories.MAX_URL_LENGTH characters of the url / redirect_url.

    Arguments:
    db - db handle
    url - story url
    redirect_url - optional url to which the story url redirects

    Returns:
    the matched story or None

    """
    u = url[0:MAX_URL_LENGTH]

    ru = ''
    if not ignore_redirect(db, url, redirect_url):
        ru = redirect_url[0:MAX_URL_LENGTH] if redirect_url is not None else u

    nu = normalize_url_lossy(u)
    nru = normalize_url_lossy(ru)

    urls = list({u, ru, nu, nru})

    # look for matching stories, ignore those in foreign_rss_links media, only get last
    # 100 to avoid hanging job trying to handle potentially thousands of matches
    stories = db.query("""
        WITH _matching_stories_stories AS (
            SELECT DISTINCT s.*
            FROM stories AS s
                INNER JOIN media AS m ON
                    s.media_id = m.media_id AND
                    m.foreign_rss_links = false
            WHERE
                s.url = ANY(%(urls)s) OR
                s.guid = ANY(%(urls)s)
            ORDER BY s.collect_date DESC
            LIMIT %(limit)s
        ),

        _matching_stories_story_urls AS (
            SELECT DISTINCT s.*
            FROM story_urls AS su
                INNER JOIN stories AS s ON
                    su.stories_id = s.stories_id
                INNER JOIN media AS m ON
                    s.media_id = m.media_id AND
                    m.foreign_rss_links = false
            WHERE su.url = ANY(%(urls)s)
            ORDER BY s.collect_date DESC
            LIMIT %(limit)s
        )
        
        SELECT DISTINCT *
        FROM (
            SELECT *
            FROM _matching_stories_stories

            UNION

            SELECT *
            FROM _matching_stories_story_urls
        ) AS m
        ORDER BY collect_date DESC
        LIMIT %(limit)s
    """, {
        'urls': urls,
        'limit': 100,
    }).hashes()

    if len(stories) == 0:
        return None

    story = get_preferred_story(db, stories)

    return story
Example #21
def get_story_match(db: DatabaseHandler,
                    url: str,
                    redirect_url: Optional[str] = None) -> Optional[dict]:
    """Search for any story within the database that matches the given url.

    Searches for any story whose guid or url matches either the url or redirect_url or the
    mediawords.util.url.normalize_url_lossy() version of either.

    If multiple stories are found, use get_preferred_story() to decide which story to return.

    Only match the first mediawords.dbi.stories.stories.MAX_URL_LENGTH characters of the url / redirect_url.

    Arguments:
    db - db handle
    url - story url
    redirect_url - optional url to which the story url redirects

    Returns:
    the matched story or None

    """
    u = url[0:MAX_URL_LENGTH]

    ru = ''
    if not ignore_redirect(db, url, redirect_url):
        ru = redirect_url[0:MAX_URL_LENGTH] if redirect_url is not None else u

    nu = normalize_url_lossy(u)
    nru = normalize_url_lossy(ru)

    urls = list({u, ru, nu, nru})

    # for some reason some rare urls trigger a seq scan on the below query
    db.query("set enable_seqscan=off")

    # look for matching stories, ignore those in foreign_rss_links media, only get last
    # 100 to avoid hanging job trying to handle potentially thousands of matches
    stories = db.query(
        """
            with matching_stories as (
                select distinct(s.*)
                from stories s
                    join media m
                        on s.media_id = m.media_id
                where (
                        ( s.url = any( %(a)s ) )
                     or ( s.guid = any ( %(a)s ) )
                      )
                  and m.foreign_rss_links = false
            
                union
            
                select distinct(s.*)
                from stories s
                    join media m
                        on s.media_id = m.media_id
                    join story_urls su
                        on s.stories_id = su.stories_id
                where su.url = any ( %(a)s )
                  and m.foreign_rss_links = false
            )
            
            select distinct(ms.*)
            from matching_stories ms
            order by collect_date desc
            limit 100
        """, {
            'a': urls
        }).hashes()

    db.query("set enable_seqscan=on")

    if len(stories) == 0:
        return None

    story = get_preferred_story(db, stories)

    return story