def test_copy_story_to_new_medium_with_download_error():
    """Check that copy_story_to_new_medium() keeps the 'error' state of the source download."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    target_medium = create_test_medium(db, 'copy new')

    source_medium = create_test_medium(db, 'copy old')
    source_feed = create_test_feed(db=db, label='copy old', medium=source_medium)
    source_story = create_test_story(db=db, label='copy old', feed=source_feed)

    add_content_to_test_story(db, source_story, source_feed)

    # Flag the downloads of the source story as failed before the story gets copied.
    db.query(
        "update downloads set state = 'error' where stories_id = %(a)s",
        {'a': source_story['stories_id']},
    )

    add_to_topic_stories(db, source_story, topic)

    copied_story = copy_story_to_new_medium(db, topic, source_story, target_medium)
    copied_stories_id = copied_story['stories_id']

    # The copy has to exist as a real row in "stories".
    assert db.find_by_id('stories', copied_stories_id) is not None

    copied_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': copied_stories_id},
    ).hash()
    assert copied_download is not None
    assert copied_download['state'] == 'error'
コード例 #2
0
def test_merge_dup_media_stories():
    """Check that merge_dup_media_stories() moves topic stories into the medium named by dup_media_id."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for story_num in range(num_stories):
        topic_story = create_test_story(db, "merge " + str(story_num), feed=feed)
        add_to_topic_stories(db, topic_story, topic)

    # Mark the old medium as a duplicate of the new one.
    new_media_id = new_medium['media_id']
    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_media_id})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']},
    ).hashes()

    # Same number of topic stories as before, all of them in the new medium.
    assert len(got_stories) == num_stories
    assert all(got_story['media_id'] == new_medium['media_id'] for got_story in got_stories)
コード例 #3
0
def test_merge_dup_media_story():
    """Check that merge_dup_media_story() clones a story once and returns the same clone on a repeat call."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    # Point the original medium at the new one via dup_media_id.
    db.update_by_id('media', medium['media_id'], {'dup_media_id': new_medium['media_id']})

    cloned_story = merge_dup_media_story(db, topic, old_story)

    # The clone must carry over the identifying fields of the old story.
    copied_fields = 'url guid publish_date title'.split()
    for copied_field in copied_fields:
        assert cloned_story[copied_field] == old_story[copied_field]

    # The clone must be part of the topic.
    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': cloned_story['stories_id'], 'b': topic['topics_id']},
    ).hash()
    assert topic_story is not None

    # Merging the same story again must hand back the existing clone.
    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']
コード例 #4
0
def test_copy_story_to_new_medium():
    """Check that copy_story_to_new_medium() copies the story, its topic link, download and sentences."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    target_medium = create_test_medium(db, 'copy new')

    source_medium = create_test_medium(db, 'copy old')
    source_feed = create_test_feed(db=db, label='copy old', medium=source_medium)
    source_story = create_test_story(db=db, label='copy old', feed=source_feed)

    add_content_to_test_story(db, source_story, source_feed)

    add_to_topic_stories(db, source_story, topic)

    copied_story = copy_story_to_new_medium(db, topic, source_story, target_medium)

    assert db.find_by_id('stories', copied_story['stories_id']) is not None

    # Identifying fields must be equal on both stories.
    compared_fields = 'title url guid publish_date'.split()
    for compared_field in compared_fields:
        assert source_story[compared_field] == copied_story[compared_field]

    # The copy must be linked to the topic.
    topic_story_row = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, {'topics_id': topic['topics_id'], 'stories_id': copied_story['stories_id']}).hash()
    assert topic_story_row is not None

    # The copy must have a download of its own...
    copied_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': copied_story['stories_id']}).hash()
    assert copied_download is not None

    # ...with non-empty content behind it...
    content = fetch_content(db, copied_download)
    assert content is not None and len(content) > 0

    # ...and at least one sentence.
    copied_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': copied_story['stories_id']}).hashes()
    assert len(copied_sentences) > 0
コード例 #5
0
def test_get_preferred_story():
    """Test get_preferred_story().

    Creates five media with one story each, then rewrites the media / stories
    rows before every check so that a different preference rule decides the
    result each time. The checks depend on the state left by the previous
    step, so their order matters.
    """
    db = connect_to_db()

    num_media = 5
    media = []
    for i in range(num_media):
        medium = create_test_medium(db, "foo " + str(i))
        feed = create_test_feed(db=db, label="foo", medium=medium)
        story = create_test_story(db=db, label="foo", feed=feed)
        # keep the story on the medium dict so that it can be looked up by medium below
        medium['story'] = story
        media.append(medium)

    # first prefer the medium that another medium's dup_media_id points to
    preferred_medium = media[1]
    db.query("update media set dup_media_id = %(a)s where media_id = %(b)s", {
        'a': preferred_medium['media_id'],
        'b': media[0]['media_id']
    })

    stories = [m['story'] for m in media]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer any medium without a dup_media_id
    preferred_medium = media[num_media - 1]
    # reset all dup_media_id values, then make every medium except media[0] a dup of media[0]
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    db.query("update media set dup_media_id = %(a)s where media_id != %(a)s",
             {'a': media[0]['media_id']})
    # clear dup_media_id again for the medium that should win
    db.query("update media set dup_media_id = null where media_id = %(a)s",
             {'a': preferred_medium['media_id']})
    # media[0] is left out, so the preferred medium is the only candidate without a dup_media_id
    stories = [m['story'] for m in media[1:]]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer the medium whose story url matches the medium domain
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    # give every medium and every story a distinct url built from its own id
    # noinspection SqlWithoutWhere
    db.query("update media set url='http://media-'||media_id||'.com'")
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")

    preferred_medium = media[2]
    # only the stories of the preferred medium get a url equal to their medium's url
    db.query(
        "update stories set url = 'http://media-'||media_id||'.com' where media_id = %(a)s",
        {'a': preferred_medium['media_id']})
    # re-read the stories because their urls were changed directly in the database
    stories = db.query("select * from stories").hashes()
    preferred_story = db.query("select * from stories where media_id = %(a)s",
                               {
                                   'a': preferred_medium['media_id']
                               }).hash()

    assert get_preferred_story(db, stories) == preferred_story

    # next prefer lowest media_id
    # put the stories-<id> urls back so that no story url matches its medium url any more
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    stories = db.query("select * from stories").hashes()
    assert get_preferred_story(
        db, stories)['stories_id'] == media[0]['story']['stories_id']
コード例 #6
0
 def setUpClass(cls) -> None:
     """Create the database handle, test medium and test feed stored on the class."""
     # One database is enough for all of the tests
     cls._DB = db = connect_to_db()
     cls._TEST_MEDIUM = medium = create_test_medium(db=db, label='test')
     cls._TEST_FEED = create_test_feed(db=db, label='test', medium=medium)
コード例 #7
0
def test_add_story_description_unset():
    """Check that add_story() stores a story whose description is None."""

    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    story_url = 'http://test'
    story = {
        'url': story_url,
        'guid': story_url,
        'media_id': medium['media_id'],
        'title': "test",
        # NULL is allowed for stories.description, so None is a legitimate value here
        'description': None,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    # Exactly one story row...
    story_rows = db.select(table='stories', what_to_select='*').hashes()
    assert len(story_rows) == 1

    # ...and exactly one feed <-> story mapping row.
    feed_story_rows = db.select(table='feeds_stories_map', what_to_select='*').hashes()
    assert len(feed_story_rows) == 1
コード例 #8
0
def test_normalized_urls_out_of_date():
    """Test _normalized_urls_out_of_date().

    Walks the media table through several states and checks the flag after
    each one: no media, freshly created media, all normalized_url values
    filled in, one normalized_url reset to NULL, and all filled in again.
    """
    db = connect_to_db()

    # nothing is out of date before any medium has been created
    assert not _normalized_urls_out_of_date(db)

    # plain loop instead of a throw-away list: the calls are made only for their side effect
    for i in range(5):
        create_test_medium(db, str(i))

    # the freshly created media are expected to be reported as out of date
    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)

    # a single medium with a NULL normalized_url must be enough to flip the flag back
    db.query(
        "update media set normalized_url = null where media_id in ( select media_id from media limit 1 )"
    )

    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)
コード例 #9
0
    def setUp(self) -> None:
        """Create the test medium, feed, story and a download for that story."""
        super().setUp()

        self.test_medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_feed = create_test_feed(
            db=self.db(),
            label='downloads test',
            medium=self.test_medium,
        )
        self.test_story = create_test_story(
            db=self.db(),
            label='downloads est',
            feed=self.test_feed,
        )
        self.test_download = create_download_for_story(
            self.db(),
            feed=self.test_feed,
            story=self.test_story,
        )
コード例 #10
0
def test_get_dup_story_groups():
    """Test _get_dup_story_groups().

    Creates nine topic stories whose titles fall into three groups of three.
    Within a group the titles differ only by letter case, and the test expects
    each group to be reported as one set of duplicates.
    """
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    # One prefix per case variant. The third variant used to be written with a
    # mis-cased column key ('Title') and an all-lowercase value, which merely
    # repeated the second variant instead of exercising mixed-case titles.
    title_prefixes = ('TITLE ', 'title ', 'Title ')

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

        # i % 3 picks the case variant, i // 3 picks the duplicate group
        prefix = title_prefixes[i % 3]
        group_number = i // 3
        db.update_by_id('stories', story['stories_id'], {'title': prefix + str(group_number)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    # every story within a group must have the same title once case is ignored
    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
コード例 #11
0
ファイル: test_downloads.py プロジェクト: umatter/mediacloud
    def setUp(self) -> None:
        """Prepare config, a stored test download and a backup copy of the config."""
        super().setUp()

        self.config = mediawords.util.config.get_config()

        self.test_medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_feed = create_test_feed(db=self.db(), label='downloads test', medium=self.test_medium)
        self.test_story = create_test_story(db=self.db(), label='downloads est', feed=self.test_feed)
        self.test_download = create_download_for_story(self.db(), feed=self.test_feed, story=self.test_story)

        # Turn the download into a successful one that belongs to the test story.
        download_changes = (
            ('path', 'postgresql:foo'),
            ('state', 'success'),
            ('stories_id', self.test_story['stories_id']),
        )
        for column, value in download_changes:
            self.test_download[column] = value
        self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)

        mediawords.dbi.downloads.store_content(self.db(), self.test_download, self.__TEST_CONTENT)

        # Deep copy of the configuration as it is at the end of setUp().
        self.save_config = copy.deepcopy(self.config)
コード例 #12
0
def test_provide_download_ids() -> None:
    """Check the number of download IDs returned by provide_download_ids() for several hosts."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, 'foo', medium=medium)

    hosts = ('foo.bar', 'bar.bat', 'bat.baz')
    downloads_per_host = 3

    # Queue the same number of pending content downloads for every host.
    for host in hosts:
        for download_num in range(downloads_per_host):
            db.create('downloads', {
                'feeds_id': feed['feeds_id'],
                'state': 'pending',
                'priority': 1,
                'sequence': 1,
                'type': 'content',
                'url': 'http://' + host + '/' + str(download_num),
                'host': host,
            })

    download_ids = provide_download_ids(db)

    # one per host, +1 for the test feed
    expected_count = len(hosts) + 1
    assert len(download_ids) == expected_count
コード例 #13
0
def test_get_story_with_most_sentences():
    """Check _get_story_with_most_sentences() with filled and sentence-less stories."""
    db = connect_to_db()

    medium = create_test_medium(db, "foo")
    feed = create_test_feed(db=db, label="foo", medium=medium)

    num_filled_stories = 5
    stories = []
    for story_num in range(num_filled_stories):
        filled_story = create_test_story(db=db, label="foo" + str(story_num), feed=feed)
        stories.append(filled_story)

        # Story number N receives N sentences, numbered starting at 1.
        for sentence_number in range(1, story_num + 1):
            sentence_row = {
                'stories_id': filled_story['stories_id'],
                'media_id': medium['media_id'],
                'sentence': 'foo',
                'sentence_number': sentence_number,
                'publish_date': filled_story['publish_date'],
            }
            db.create('story_sentences', sentence_row)

    # Two more stories that have no sentences at all.
    empty_stories = [
        create_test_story(db=db, label="foo empty" + str(empty_num), feed=feed)
        for empty_num in range(2)
    ]
    stories.extend(empty_stories)

    # The last filled story has the most sentences.
    expected_story = stories[num_filled_stories - 1]
    assert _get_story_with_most_sentences(db, stories) == expected_story

    # With only sentence-less stories to choose from, the first one is expected back.
    first_empty_story = empty_stories[0]
    assert _get_story_with_most_sentences(db, [first_empty_story]) == first_empty_story
    assert _get_story_with_most_sentences(db, empty_stories) == first_empty_story
コード例 #14
0
    def setUp(self) -> None:
        """Create the test medium, feed and story named by the class constants."""
        super().setUp()

        self.test_medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        self.test_feed = create_test_feed(
            db=self.db(),
            label=self.TEST_FEED_NAME,
            medium=self.test_medium,
        )
        self.test_story = create_test_story(
            db=self.db(),
            label=self.TEST_STORY_NAME,
            feed=self.test_feed,
        )
コード例 #15
0
def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id().

    Three cases are covered:

    * a first topic link gets its ref_stories_id set from the fetch URL;
    * a second topic link with the same source story and the same target story
      is left untouched, without an exception reaching the caller;
    * a bogus fetch URL makes the function raise McUpdateByIDException.
    """
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)

    topic = create_test_topic(db, 'foo')

    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    # re-read the link because the update happened in the database
    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    # NOTE(review): presumably because link B would end up with the same source / target story
    # pair as link A -- confirm against the constraints on topic_links
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    # The fetch URL has to reference link B. Referencing link A (as this test used to do) only
    # repeats the first update, and link B's ref_stories_id stays None no matter what the
    # function does, so the assertion below could never fail.
    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_b['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    assert topic_link_b['ref_stories_id'] is None

    # now generate a non-unique error and make sure we get an error
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}

    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
コード例 #16
0
def test_add_missing_normalized_title_hashes():
    """Check that _add_missing_normalized_title_hashes() fills in every NULL normalized_title_hash."""
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for story_num in range(num_stories):
        topic_story = create_test_story(db, "titles " + str(story_num), feed=feed)
        add_to_topic_stories(db, topic_story, topic)

    # Statement used to run a command on the "stories" shards.
    run_on_shards_sql = "SELECT run_on_shards_or_raise('stories', %(command)s)"

    # Switch the trigger off; otherwise normalized_title_hash could not be set to NULL.
    db.query(
        run_on_shards_sql,
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        },
    )

    # Wipe the hash of every story.
    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories SET
            normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    # Switch the trigger back on.
    db.query(
        run_on_shards_sql,
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        },
    )

    # Every topic story is missing its hash now...
    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    # ...and none of them is after the call.
    assert __count_null_title_stories(db=db, topic=topic) == 0
コード例 #17
0
def test_guess_medium() -> None:
    """Test guess_medium().

    Covers lookup of existing media by URL, creation of a new medium for an
    unknown URL (including its "spidered" tag), lookup through URL variants,
    creation of yet another medium once the first match has foreign_rss_links
    set, and following dup_media_id to the medium it points to.
    """
    db = connect_to_db()

    num_media = 5
    # plain loop instead of a throw-away list: the calls are made only for their side effect
    for i in range(num_media):
        create_test_medium(db, str(i))

    # the default test media do not have unique domains
    # noinspection SqlWithoutWhere
    db.query("update media set url = 'http://media-' || media_id ||'.com'")

    # dummy guess_medium call to assign normalized_urls
    guess_medium(db, 'foo')

    media = db.query("select * from media order by media_id").hashes()

    # basic lookup of existing media
    assert guess_medium(db, media[0]['url']) == media[0]
    assert guess_medium(db, media[1]['url'] + '/foo/bar/') == media[1]
    assert guess_medium(db, media[2]['url'] + URL_SPIDERED_SUFFIX) == media[2]

    # create a new medium
    new_medium_story_url = 'http://new-medium.com/with/path'
    new_medium = guess_medium(db, new_medium_story_url)
    assert new_medium['name'] == 'new-medium.com'
    assert new_medium['url'] == 'http://new-medium.com/'

    # The new medium must carry the spidered tag. The row has to be fetched with .hash():
    # the bare query result is never None, which would make the assertion below meaningless.
    spidered_tag = get_spidered_tag(db)
    spidered_mtm = db.query(
        "select * from media_tags_map where tags_id = %(a)s and media_id = %(b)s",
        {'a': spidered_tag['tags_id'], 'b': new_medium['media_id']}).hash()
    assert spidered_mtm is not None

    # find the url with some url variants
    new_medium_url_variants = [
        'http://new-medium.com/with/another/path',
        'http://www.new-medium.com/',
        'http://new-medium.com/with/path#andanchor'
    ]

    for url in new_medium_url_variants:
        assert guess_medium(db, url)['media_id'] == new_medium['media_id']

    # set foreign_rss_links to true to make guess_medium create another new medium
    db.query("update media set foreign_rss_links = 't' where media_id = %(a)s", {'a': new_medium['media_id']})

    another_new_medium = guess_medium(db, new_medium_story_url)
    assert another_new_medium['media_id'] > new_medium['media_id']
    assert another_new_medium['url'] == new_medium_story_url
    assert another_new_medium['name'] == 'http://new-medium.com/'

    # now try finding a dup
    db.query(
        "update media set dup_media_id = %(a)s where media_id = %(b)s",
        {'a': media[0]['media_id'], 'b': media[1]['media_id']})

    assert guess_medium(db, media[1]['url'])['media_id'] == media[0]['media_id']
コード例 #18
0
def test_add_tweet_story():
    """Check the story, topic link, content and topic links produced by _add_tweet_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']
    source_stories_id = source_story['stories_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_stories_id})

    topic_link = db.create('topic_links', {
        'topics_id': topics_id,
        'url': 'u',
        'stories_id': source_stories_id,
    })

    tfu = db.create('topic_fetch_urls', {
        'topics_id': topics_id,
        'url': 'u',
        'state': 'pending',
        'topic_links_id': topic_link['topic_links_id'],
    })

    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': 'http://direct.entity'}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': 'http://retweeted.entity'}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': 'http://quoted.entity'}]}},
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    # Story fields are derived from the tweet.
    screen_name = tweet['user']['screen_name']
    assert got_story['title'] == "%s: %s" % (screen_name, tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])
    assert got_story['guid'] == story['url']

    # The topic link behind the fetch URL must reference the new story.
    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']},
    ).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # First URL of the tweet itself, of the retweeted status and of the quoted status.
    # noinspection PyTypeChecker
    linked_urls = [
        status['entities']['urls'][0]['expanded_url']
        for status in (tweet, tweet['retweeted_status'], tweet['quoted_status'])
    ]
    for linked_url in linked_urls:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': linked_url},
        ).hash()
        assert got_topic_link is not None
コード例 #19
0
    def setUp(self) -> None:
        """Create the medium, feed and story fixtures named by the class constants."""
        super().setUp()

        self.test_medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        self.test_feed = create_test_feed(db=self.db(), label=self.TEST_FEED_NAME, medium=self.test_medium)
        self.test_story = create_test_story(self.db(), label=self.TEST_STORY_NAME, feed=self.test_feed)
コード例 #20
0
    def setUp(self):
        """Create a story with an enclosure, a podcast episode and a queued transcript fetch."""
        db = self.db = connect_to_db()

        self.test_medium = create_test_medium(db=db, label='test')
        self.test_feed = create_test_feed(db=db, label='test', medium=self.test_medium)
        self.story = create_test_story(db=db, label='test', feed=self.test_feed)

        stories_id = self.story['stories_id']

        enclosure_row = {
            'stories_id': stories_id,
            # The URL is never fetched, so any value will do
            'url': 'http://example.com/',
            'mime_type': 'audio/mpeg',
            'length': 100000,
        }
        enclosure = db.insert(table='story_enclosures', insert_hash=enclosure_row)

        episode_row = {
            'stories_id': stories_id,
            'story_enclosures_id': enclosure['story_enclosures_id'],
            'gcs_uri': 'gs://whatever',
            'duration': 1,
            'codec': 'MP3',
            'sample_rate': 44100,
            'bcp47_language_code': 'en-US',
            'speech_operation_id': 'foo',
        }
        episode = db.insert(table='podcast_episodes', insert_hash=episode_row)

        # Queue a transcript fetch for the episode, due right now.
        db.query(
            """
            INSERT INTO podcast_episode_transcript_fetches (
                podcast_episodes_id,
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW()
            )
        """, {'podcast_episodes_id': episode['podcast_episodes_id']})
コード例 #21
0
ファイル: test_ap.py プロジェクト: berkmancenter/mediacloud
    def setUp(self):
        """Store a story made of the AP sentences in an AP medium, for the dup-sentence lookups."""
        super().setUp()

        ap_medium = create_test_medium(db=self.db(), label=get_ap_medium_name())
        ap_feed = create_test_feed(db=self.db(), label='feed', medium=ap_medium)
        ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

        # One AP sentence per line of story content.
        ap_sentences = self.__get_ap_sentences()
        ap_story['content'] = "\n".join(ap_sentences)

        add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
コード例 #22
0
    def setUp(self):
        """Create stories with sentences, a topic holding all of them and a snapshot of the stories."""
        super().setUp()

        db = self.db = connect_to_db()

        medium = create_test_medium(db=db, label='test')
        feed = create_test_feed(db=db, label='feed', medium=medium)

        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=db, label='story-%d' % story_num, feed=feed)

            # Sentences are numbered from 1 within every story.
            for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                sentence_row = {
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'publish_date': story['publish_date'],
                    'sentence_number': sentence_number,
                    'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
                }
                db.create(table='story_sentences', insert_hash=sentence_row)

        # Test topic containing every story
        topic = create_test_topic(db=db, label='test')
        self.topics_id = topic['topics_id']

        db.query(
            """
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, {'topics_id': self.topics_id})

        # Test snapshot of that topic
        snapshot_ids = db.query(
            """
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {'topics_id': self.topics_id}).flat()
        self.snapshots_id = snapshot_ids[0]

        # Copy every story into the snapshot.
        db.query(
            """
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, {'snapshots_id': self.snapshots_id})
コード例 #23
0
def test_merge_dup_stories():
    """Check that _merge_dup_stories() leaves only the last created story in the topic."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for story_num in range(num_stories):
        dup_story = create_test_story(db, "merge " + str(story_num), feed=feed)
        add_to_topic_stories(db, dup_story, topic)
        stories.append(dup_story)

        # Story number N gets N sentences, so the last story has the most.
        for sentence_number in range(story_num):
            # noinspection SqlInsertValues
            db.query(
                """
                INSERT INTO story_sentences (
                    stories_id,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date
                )
                    SELECT
                        stories_id,
                        %(sentence_number)s AS sentence_number,
                        'foo bar' AS sentence,
                        media_id,
                        publish_date
                    FROM stories
                    WHERE stories_id = %(stories_id)s
            """, {'stories_id': dup_story['stories_id'], 'sentence_number': sentence_number})

    _merge_dup_stories(db, topic, stories)

    # Of all the created stories, only the last one may remain in the topic.
    stories_ids = [created_story['stories_id'] for created_story in stories]
    merged_stories = db.query(
        """
        SELECT stories_id
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = ANY(%(stories_ids)s)
    """, {'topics_id': topic['topics_id'], 'stories_ids': stories_ids}).flat()

    assert merged_stories == [stories_ids[-1]]
コード例 #24
0
    def setUp(self) -> None:
        """Create a medium, a feed and a successful feed download stored under 'postgresql:foo'."""
        super().setUp()

        self.test_medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_feed = create_test_feed(db=self.db(), label='downloads test', medium=self.test_medium)
        self.test_download = create_download_for_feed(self.db(), self.test_feed)

        # Mark the download as fetched and write the change back to the database.
        for column, value in (('path', 'postgresql:foo'), ('state', 'success')):
            self.test_download[column] = value
        self.db().update_by_id(
            'downloads',
            self.test_download['downloads_id'],
            self.test_download,
        )
コード例 #25
0
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories().

    Adds ten stories of one medium to a topic, marks that medium as a duplicate of a second
    medium, runs the merge and expects the topic to still hold ten stories, every one of
    them attached to the second medium.
    """
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    source_medium = create_test_medium(db, 'merge from')
    target_medium = create_test_medium(db, 'merge to')
    source_feed = create_test_feed(db, 'merge', medium=source_medium)

    expected_count = 10
    for story_number in range(expected_count):
        new_story = create_test_story(db, "merge {}".format(story_number), feed=source_feed)
        add_to_topic_stories(db, new_story, topic)

    # Flag the source medium as a duplicate of the target medium.
    db.update_by_id('media', source_medium['media_id'], {'dup_media_id': target_medium['media_id']})

    merge_dup_media_stories(db, topic)

    # Fetch every story that is part of the topic after the merge.
    topic_stories_sql = """
        WITH found_topic_stories AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        )

        SELECT *
        FROM stories
        WHERE stories_id IN (
            SELECT stories_id
            FROM found_topic_stories
        )
        """
    merged_stories = db.query(topic_stories_sql, {'topics_id': topic['topics_id']}).hashes()

    assert len(merged_stories) == expected_count

    for merged_story in merged_stories:
        assert merged_story['media_id'] == target_medium['media_id']
コード例 #26
0
    def setUp(self) -> None:
        """Create a test medium, feed, feed download and story, and link the download to the story.

        The download row is updated with state 'success', a 'postgresql:foo' path and the
        story's stories_id.
        """
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        download = create_download_for_feed(self.db(), feed)
        story = create_test_story(db=self.db(), label='downloads est', feed=feed)

        download['path'] = 'postgresql:foo'
        download['state'] = 'success'
        download['stories_id'] = story['stories_id']
        self.db().update_by_id('downloads', download['downloads_id'], download)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_download = download
        self.test_story = story
コード例 #27
0
    def setUp(self):
        """Add the AP medium and a story with AP sentences so that dup sentences can be found."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label=get_ap_medium_name())
        ap_feed = create_test_feed(db=self.db(), label='feed', medium=medium)
        ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

        # The story content is the AP sentences, one per line.
        sentences = self.__get_ap_sentences()
        ap_story['content'] = "\n".join(sentences)

        add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
コード例 #28
0
    def test_update_extractor_version_tag(self):
        """A fresh story has no extractor tags; update_extractor_version_tag() leaves it with one."""
        medium = create_test_medium(db=self.db(), label='test medium')
        feed = create_test_feed(db=self.db(), label='test feed', medium=medium)
        story = create_test_story(db=self.db(), label='test story', feed=feed)
        stories_id = story['stories_id']

        tags_before = self.__story_extractor_tags(stories_id=stories_id)
        assert len(tags_before) == 0

        update_extractor_version_tag(db=self.db(), story=story)

        tags_after = self.__story_extractor_tags(stories_id=stories_id)
        assert len(tags_after) == 1
コード例 #29
0
ファイル: test_ap.py プロジェクト: vishalbelsare/mediacloud
def __is_syndicated(db: DatabaseHandler, content: str) -> bool:
    """Store `content` as a new test story and return is_syndicated() for it.

    The first 64 characters of `content` are used as the label of the medium, feed and
    story created for the check.
    """
    fixture_label = content[:64]

    test_medium = create_test_medium(db=db, label=fixture_label)
    test_feed = create_test_feed(db=db, label=fixture_label, medium=test_medium)
    test_story = create_test_story(db=db, label=fixture_label, feed=test_feed)

    test_story['content'] = content

    stored_story = add_content_to_test_story(db=db, story=test_story, feed=test_feed)
    story_title = stored_story['title']

    return is_syndicated(db=db, story_title=story_title, story_text=content)
コード例 #30
0
    def setUp(self) -> None:
        """Create a test medium, feed and story, plus a download belonging to that story."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        story = create_test_story(db=self.db(), label='downloads est', feed=feed)
        download = create_download_for_story(self.db(), feed=feed, story=story)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_story = story
        self.test_download = download
コード例 #31
0
def test_update_media_normalized_urls():
    """Test _update_media_normalized_urls().

    Creates five test media, runs the update and then checks every row of the media table.
    """
    db = connect_to_db()

    # Plain loop: the media are created for their side effect only, no list is needed.
    for label_number in range(5):
        create_test_medium(db, str(label_number))

    _update_media_normalized_urls(db)

    media = db.query("select * from media").hashes()
    for medium in media:
        expected_nu = normalize_url_lossy(medium['url'])
        # NOTE(review): this compares the row's own 'url' with its normalized form and never
        # reads a value written by _update_media_normalized_urls() -- confirm whether a stored
        # normalized-URL column was meant to be checked here instead.
        assert medium['url'] == expected_nu
コード例 #32
0
    def setUp(self) -> None:
        """Connect to the database, build download fixtures and store content for the story download."""
        super().setUp()

        db = connect_to_db()
        self.db = db

        medium = create_test_medium(db=db, label='downloads test')
        feed = create_test_feed(db=db, label='downloads test', medium=medium)
        feed_download = create_download_for_feed(db, feed)
        story = create_test_story(db=db, label='downloads est', feed=feed)
        story_download = create_download_for_story(db, feed=feed, story=story)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_download_feed = feed_download
        self.test_story = story
        self.test_download = story_download

        store_content(db=self.db, download=self.test_download, content=self.__TEST_CONTENT)
コード例 #33
0
ファイル: test_ap.py プロジェクト: berkmancenter/mediacloud
    def __is_syndicated(self, content: str) -> bool:
        """Store `content` as a new test story and return is_syndicated() for it.

        The first 64 characters of `content` are used as the label of the medium, feed and
        story created for the check.
        """
        fixture_label = content[:64]

        test_medium = create_test_medium(db=self.db(), label=fixture_label)
        test_feed = create_test_feed(db=self.db(), label=fixture_label, medium=test_medium)
        test_story = create_test_story(db=self.db(), label=fixture_label, feed=test_feed)

        test_story['content'] = content

        stored_story = add_content_to_test_story(db=self.db(), story=test_story, feed=test_feed)

        return is_syndicated(db=self.db(), story_title=stored_story['title'], story_text=content)
コード例 #34
0
    def setUp(self) -> None:
        """Create the named test medium, feed, feed download and story, and link the download to the story.

        The download row is updated with state 'success', a 'postgresql:foo' path and the
        story's stories_id.
        """
        super().setUp()

        medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        feed = create_test_feed(db=self.db(), label=self.TEST_FEED_NAME, medium=medium)
        download = create_download_for_feed(self.db(), feed)
        story = create_test_story(db=self.db(), label=self.TEST_STORY_NAME, feed=feed)

        download['path'] = 'postgresql:foo'
        download['state'] = 'success'
        download['stories_id'] = story['stories_id']
        self.db().update_by_id('downloads', download['downloads_id'], download)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_download = download
        self.test_story = story
コード例 #35
0
def test_get_spider_feed():
    """Test get_spider_feed().

    The returned feed must carry the spider feed name, belong to the given medium and be
    inactive; asking again for the same medium must return the same feed.
    """
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')

    first_feed = get_spider_feed(db, medium)

    assert first_feed['name'] == SPIDER_FEED_NAME
    assert first_feed['media_id'] == medium['media_id']
    assert first_feed['active'] is False

    # A second lookup must hand back the same feed row.
    second_feed = get_spider_feed(db, medium)
    assert second_feed['feeds_id'] == first_feed['feeds_id']
コード例 #36
0
    def setUp(self) -> None:
        """Create a test medium, feed and feed download, then flag the download as a success.

        The download row is updated with state 'success' and a 'postgresql:foo' path.
        """
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        download = create_download_for_feed(self.db(), feed)

        download['path'] = 'postgresql:foo'
        download['state'] = 'success'
        self.db().update_by_id('downloads', download['downloads_id'], download)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_download = download
コード例 #37
0
def test_get_normalized_title():
    """Exercise the get_normalized_title() SQL function with a range of titles."""
    db = connect_to_db()

    def normalized(raw_title):
        """Return get_normalized_title(raw_title, 1) as computed by the database."""
        (result,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': raw_title}).flat()
        return result

    # simple title
    (plain_title,) = db.query("select get_normalized_title('foo bar', 0)").flat()
    assert plain_title == 'foo bar'

    # simple title part: the long part is returned whether it is followed by ': bat baz',
    # or preceded by 'bat baz: ' or 'bat baz - '
    long_part = "foo barfoo barfoo barfoo barfoo bar"
    for candidate in (long_part + ': bat baz', 'bat baz: ' + long_part, 'bat baz - ' + long_part):
        assert normalized(candidate) == long_part

    # strip punctuation
    assert normalized('foo!@#bar&*(') == 'foobar'

    # require 32 character length: short parts stay joined by the separator marker
    assert normalized('foo bar: bat') == 'foo barSEPSEP bat'

    # don't allow medium name as title part
    medium_name = 'A' * 64
    create_test_medium(db, medium_name)
    assert normalized(medium_name + ': foo bar') == medium_name.lower() + 'SEPSEP foo bar'
コード例 #38
0
    def setUp(self) -> None:
        """Load the config, build download fixtures, store test content and keep a copy of the config.

        The story download is updated with state 'success', a 'postgresql:foo' path and the
        story's stories_id before the test content is stored for it.
        """
        super().setUp()

        self.config = mediawords.util.config.get_config()

        medium = create_test_medium(db=self.db(), label='downloads test')
        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        story = create_test_story(db=self.db(), label='downloads est', feed=feed)
        download = create_download_for_story(self.db(), feed=feed, story=story)

        # Expose the fixtures to the test methods.
        self.test_medium = medium
        self.test_feed = feed
        self.test_story = story
        self.test_download = download

        download['path'] = 'postgresql:foo'
        download['state'] = 'success'
        download['stories_id'] = story['stories_id']
        self.db().update_by_id('downloads', download['downloads_id'], download)

        mediawords.dbi.downloads.store_content(self.db(), self.test_download, self.__TEST_CONTENT)

        # Deep copy of the configuration as it is at the end of set-up.
        self.save_config = copy.deepcopy(self.config)
コード例 #39
0
    def setUp(self):
        """Populate the database with test stories, their sentences, a topic and a snapshot.

        Stores the topic's ID in `self.topics_id` and the snapshot's ID in `self.snapshots_id`.
        """
        super().setUp()

        medium = create_test_medium(db=self.db(), label='test')
        feed = create_test_feed(db=self.db(), label='feed', medium=medium)

        # Create TEST_STORY_COUNT stories, each with TEST_SENTENCE_PER_STORY_COUNT sentences
        # numbered from 1.
        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
            for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                self.db().create(table='story_sentences', insert_hash={
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'publish_date': story['publish_date'],
                    'sentence_number': sentence_number,
                    'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
                })

        # Test topic
        topic = create_test_topic(db=self.db(), label='test')
        self.topics_id = topic['topics_id']

        # Add every row of the stories table to the test topic.
        self.db().query("""
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, {'topics_id': self.topics_id})

        # Test snapshot
        self.snapshots_id = self.db().query("""
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {'topics_id': self.topics_id}).flat()[0]

        # Copy every row of the stories table into snap.stories under the new snapshot.
        self.db().query("""
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, {'snapshots_id': self.snapshots_id})