Beispiel #1
0
 def setUpClass(cls) -> None:
     """Create the database handle, test medium and test feed stored on the class."""
     # One connection is enough: all tests should be able to use the same database
     database = connect_to_db()
     cls._DB = database

     medium = create_test_medium(db=database, label='test')
     cls._TEST_MEDIUM = medium

     cls._TEST_FEED = create_test_feed(db=database, label='test', medium=medium)
def test_get_preferred_story():
    """Check each preference rule of get_preferred_story() in turn."""
    db = connect_to_db()

    # Build five media; each medium dict also carries its single story under 'story'
    num_media = 5
    media = []
    for idx in range(num_media):
        test_medium = create_test_medium(db, "foo {}".format(idx))
        test_feed = create_test_feed(db=db, label="foo", medium=test_medium)
        test_medium['story'] = create_test_story(db=db, label="foo", feed=test_feed)
        media.append(test_medium)

    # Rule 1: prefer the medium that another medium's dup_media_id points to
    expected_medium = media[1]
    dup_params = {
        'a': expected_medium['media_id'],
        'b': media[0]['media_id'],
    }
    db.query("update media set dup_media_id = %(a)s where media_id = %(b)s", dup_params)

    candidate_stories = [m['story'] for m in media]
    assert get_preferred_story(db, candidate_stories) == expected_medium['story']

    # Rule 2: next prefer any medium without a dup_media_id
    expected_medium = media[-1]
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    db.query(
        "update media set dup_media_id = %(a)s where media_id != %(a)s",
        {'a': media[0]['media_id']})
    db.query(
        "update media set dup_media_id = null where media_id = %(a)s",
        {'a': expected_medium['media_id']})
    candidate_stories = [m['story'] for m in media[1:]]
    assert get_preferred_story(db, candidate_stories) == expected_medium['story']

    # Rule 3: next prefer the medium whose story url matches the medium domain
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    # noinspection SqlWithoutWhere
    db.query("update media set url='http://media-'||media_id||'.com'")
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")

    expected_medium = media[2]
    expected_params = {'a': expected_medium['media_id']}
    db.query(
        "update stories set url = 'http://media-'||media_id||'.com' where media_id = %(a)s",
        expected_params)
    candidate_stories = db.query("select * from stories").hashes()
    expected_story = db.query(
        "select * from stories where media_id = %(a)s",
        {'a': expected_medium['media_id']}).hash()

    assert get_preferred_story(db, candidate_stories) == expected_story

    # Rule 4: next prefer lowest media_id
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    candidate_stories = db.query("select * from stories").hashes()
    chosen_story = get_preferred_story(db, candidate_stories)
    assert chosen_story['stories_id'] == media[0]['story']['stories_id']
    def setUp(self) -> None:
        """Create the test medium, feed and story that the tests operate on."""
        super().setUp()

        # Each fixture depends on the previous one: medium -> feed -> story
        medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label=self.TEST_FEED_NAME, medium=medium)
        self.test_feed = feed

        self.test_story = create_test_story(
            self.db(),
            label=self.TEST_STORY_NAME,
            feed=feed,
        )
Beispiel #4
0
def test_merge_dup_media_stories():
    """Check that merge_dup_media_stories() moves topic stories to the dup target medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    # Put ten stories from the old medium into the topic
    num_stories = 10
    for story_number in range(num_stories):
        topic_story = create_test_story(db, "merge {}".format(story_number), feed=feed)
        add_to_topic_stories(db, topic_story, topic)

    # Mark the old medium as a duplicate of the new one, then merge
    dup_update = {'dup_media_id': new_medium['media_id']}
    db.update_by_id('media', old_medium['media_id'], dup_update)

    merge_dup_media_stories(db, topic)

    topic_params = {'a': topic['topics_id']}
    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        topic_params).hashes()

    assert len(got_stories) == num_stories

    # Every story left in the topic must now belong to the new medium
    expected_media_id = new_medium['media_id']
    for merged_story in got_stories:
        assert merged_story['media_id'] == expected_media_id
def test_copy_story_to_new_medium_with_download_error():
    """Check that copy_story_to_new_medium() carries over a download in the 'error' state."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    # Flip the source story's downloads into the error state before copying
    old_story_params = {'a': old_story['stories_id']}
    db.query("update downloads set state = 'error' where stories_id = %(a)s", old_story_params)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)
    new_stories_id = new_story['stories_id']

    assert db.find_by_id('stories', new_stories_id) is not None

    # The copied story must have a download, and it must still be in the error state
    copied_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']},
    ).hash()
    assert copied_download is not None
    assert copied_download['state'] == 'error'
Beispiel #6
0
def test_add_story_description_unset():
    """Check that add_story() accepts a story whose description is None."""
    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    test_url = 'http://test'
    story = {
        'url': test_url,
        'guid': test_url,
        'media_id': medium['media_id'],
        'title': "test",

        # stories.description can be NULL so it's a valid value:
        'description': None,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    # Exactly one story row and one feed-to-story mapping row should exist
    story_rows = db.select(table='stories', what_to_select='*').hashes()
    assert len(story_rows) == 1

    mapping_rows = db.select(table='feeds_stories_map', what_to_select='*').hashes()
    assert len(mapping_rows) == 1
    def setUp(self) -> None:
        """Create a test medium, feed, story and a download for that story."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        story = create_test_story(self.db(), label='downloads est', feed=feed)
        self.test_story = story

        self.test_download = create_download_for_story(
            self.db(),
            feed=feed,
            story=story,
        )
def test_merge_dup_media_story():
    """Check that merge_dup_media_story() clones a story once and reuses the clone afterwards."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    # Point the original medium at the new one as its duplicate target
    dup_update = {'dup_media_id': new_medium['media_id']}
    db.update_by_id('media', medium['media_id'], dup_update)

    cloned_story = merge_dup_media_story(db, topic, old_story)

    # The clone must carry over the identifying fields of the original story
    for copied_field in ('url', 'guid', 'publish_date', 'title'):
        assert cloned_story[copied_field] == old_story[copied_field]

    lookup_params = {
        'a': cloned_story['stories_id'],
        'b': topic['topics_id'],
    }
    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        lookup_params).hash()
    assert topic_story is not None

    # Merging the same story a second time must return the existing clone
    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']
def test_get_dup_story_groups():
    """Test _get_dup_story_groups() on titles that differ only in letter case.

    Nine stories are added to a topic and retitled so that each title number
    0..2 appears in three case variants ("TITLE n", "title n", "Title n").
    The test expects three groups, and within each group every title must
    equal the first one after lowercasing.
    """
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    # One prefix per case variant; story i gets variant i % 3 of title number i // 3
    title_prefixes = ('TITLE ', 'title ', 'Title ')

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

        # The column name is always lowercase 'title'; only the value's case varies.
        # (Previously the third variant used the key 'Title' with a lowercase value.)
        new_title = title_prefixes[i % 3] + str(i // 3)
        db.update_by_id('stories', story['stories_id'], {'title': new_title})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
Beispiel #10
0
    def setUp(self) -> None:
        """Create download fixtures, store test content and keep a copy of the config."""
        super().setUp()

        self.config = mediawords.util.config.get_config()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        story = create_test_story(self.db(), label='downloads est', feed=feed)
        self.test_story = story

        self.test_download = create_download_for_story(self.db(), feed=feed, story=story)

        # Mark the download as successfully stored and tie it to the story
        self.test_download['path'] = 'postgresql:foo'
        self.test_download['state'] = 'success'
        self.test_download['stories_id'] = self.test_story['stories_id']
        downloads_id = self.test_download['downloads_id']
        self.db().update_by_id('downloads', downloads_id, self.test_download)

        mediawords.dbi.downloads.store_content(
            self.db(),
            self.test_download,
            self.__TEST_CONTENT,
        )

        # Deep copy of the config as it is at this point
        self.save_config = copy.deepcopy(self.config)
def test_provide_download_ids() -> None:
    """Check how many download IDs provide_download_ids() returns for three pending hosts."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, 'foo', medium=medium)

    hosts = ('foo.bar', 'bar.bat', 'bat.baz')
    downloads_per_host = 3

    # Queue several pending content downloads for every host
    for host in hosts:
        for download_number in range(downloads_per_host):
            pending_download = {
                'feeds_id': feed['feeds_id'],
                'state': 'pending',
                'priority': 1,
                'sequence': 1,
                'type': 'content',
                'url': 'http://{}/{}'.format(host, download_number),
                'host': host,
            }
            db.create('downloads', pending_download)

    download_ids = provide_download_ids(db)

    # +1 for the test feed
    expected_count = len(hosts) + 1
    assert len(download_ids) == expected_count
def test_get_story_with_most_sentences():
    """Check _get_story_with_most_sentences() with filled and sentence-less stories."""
    db = connect_to_db()

    medium = create_test_medium(db, "foo")
    feed = create_test_feed(db=db, label="foo", medium=medium)

    # Story number k gets k sentences, so the last filled story has the most
    num_filled_stories = 5
    stories = []
    for story_number in range(num_filled_stories):
        filled_story = create_test_story(db=db, label="foo" + str(story_number), feed=feed)
        stories.append(filled_story)
        for sentence_number in range(1, story_number + 1):
            sentence_row = {
                'stories_id': filled_story['stories_id'],
                'media_id': medium['media_id'],
                'sentence': 'foo',
                'sentence_number': sentence_number,
                'publish_date': filled_story['publish_date'],
            }
            db.create('story_sentences', sentence_row)

    # Two more stories without any sentences at all
    empty_stories = []
    for empty_number in range(2):
        empty_story = create_test_story(db=db, label="foo empty" + str(empty_number), feed=feed)
        empty_stories.append(empty_story)
        stories.append(empty_story)

    most_filled_story = stories[num_filled_stories - 1]
    assert _get_story_with_most_sentences(db, stories) == most_filled_story

    # With only sentence-less stories, the first one is expected back
    first_empty_story = empty_stories[0]
    assert _get_story_with_most_sentences(db, [first_empty_story]) == first_empty_story
    assert _get_story_with_most_sentences(db, empty_stories) == first_empty_story
Beispiel #13
0
def test_try_update_topic_link_ref_stories_id():
    """Check try_update_topic_link_ref_stories_id(): success, silent failure, hard error."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)

    topic = create_test_topic(db, 'foo')

    source_topic_story = {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
    }
    db.create('topic_stories', source_topic_story)

    # first update should work
    link_a_fields = {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com',
    }
    topic_link_a = db.create('topic_links', link_a_fields)

    fetch_url_a_fields = {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id'],
    }
    topic_fetch_url_a = db.create('topic_fetch_urls', fetch_url_a_fields)

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    link_b_fields = {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com',
    }
    topic_link_b = db.create('topic_links', link_b_fields)

    # NOTE(review): this fetch URL points at topic_link_a, not topic_link_b -- confirm
    # that is intended, since the assertion below inspects topic_link_b.
    fetch_url_b_fields = {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id'],
    }
    topic_fetch_url_b = db.create('topic_fetch_urls', fetch_url_b_fields)

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    assert topic_link_b['ref_stories_id'] is None

    # now trigger an error that is not a uniqueness violation and make sure it is raised
    bogus_tfu = {
        'topic_links_id': 0,
        'topics_id': 'nan',
        'stories_id': 'nan',
    }

    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
Beispiel #14
0
def test_add_missing_normalized_title_hashes():
    """Check that _add_missing_normalized_title_hashes() fills in NULL title hashes."""
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for story_number in range(num_stories):
        topic_story = create_test_story(db, "titles {}".format(story_number), feed=feed)
        add_to_topic_stories(db, topic_story, topic)

    run_on_shards_sql = "SELECT run_on_shards_or_raise('stories', %(command)s)"

    disable_trigger_command = """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """

    enable_trigger_command = """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(run_on_shards_sql, {'command': disable_trigger_command})

    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories SET
            normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    # turn the trigger back on once the hashes have been nulled out
    db.query(run_on_shards_sql, {'command': enable_trigger_command})

    null_count_before = __count_null_title_stories(db=db, topic=topic)
    assert null_count_before == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    null_count_after = __count_null_title_stories(db=db, topic=topic)
    assert null_count_after == 0
Beispiel #15
0
def test_add_tweet_story():
    """Check the story, topic link, content and topic story produced by _add_tweet_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']
    source_stories_id = source_story['stories_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_stories_id})

    topic_link = db.create(
        'topic_links',
        {'topics_id': topics_id, 'url': 'u', 'stories_id': source_story['stories_id']})

    tfu = db.create(
        'topic_fetch_urls',
        {
            'topics_id': topics_id,
            'url': 'u',
            'state': 'pending',
            'topic_links_id': topic_link['topic_links_id'],
        })

    direct_url = 'http://direct.entity'
    retweeted_url = 'http://retweeted.entity'
    quoted_url = 'http://quoted.entity'

    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': direct_url}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': retweeted_url}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': quoted_url}]}},
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    # Story fields are derived from the tweet's author, text, date and ID
    screen_name = tweet['user']['screen_name']
    assert got_story['title'] == "%s: %s" % (screen_name, tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (screen_name, tweet['id'])
    assert got_story['guid'] == story['url']

    # The originating topic link must now reference the tweet story
    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # A topic link must exist for each URL in the direct, retweeted and quoted entities
    # noinspection PyTypeChecker
    entity_urls = [
        tweet['entities']['urls'][0]['expanded_url'],
        tweet['retweeted_status']['entities']['urls'][0]['expanded_url'],
        tweet['quoted_status']['entities']['urls'][0]['expanded_url'],
    ]
    for entity_url in entity_urls:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': entity_url}).hash()
        assert got_topic_link is not None
Beispiel #16
0
    def setUp(self) -> None:
        """Create the named test medium, feed and story fixtures."""
        super().setUp()

        # Each fixture depends on the previous one: medium -> feed -> story
        medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label=self.TEST_FEED_NAME, medium=medium)
        self.test_feed = feed

        self.test_story = create_test_story(self.db(), label=self.TEST_STORY_NAME, feed=feed)
Beispiel #17
0
    def setUp(self):
        """Create a story with an enclosure, a podcast episode and a pending transcript fetch."""
        self.db = connect_to_db()

        medium = create_test_medium(db=self.db, label='test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db, label='test', medium=medium)
        self.test_feed = feed

        self.story = create_test_story(db=self.db, label='test', feed=feed)

        stories_id = self.story['stories_id']

        enclosure_row = {
            'stories_id': stories_id,
            # URL doesn't really matter as we won't be fetching it
            'url': 'http://example.com/',
            'mime_type': 'audio/mpeg',
            'length': 100000,
        }
        enclosure = self.db.insert(table='story_enclosures', insert_hash=enclosure_row)

        episode_row = {
            'stories_id': stories_id,
            'story_enclosures_id': enclosure['story_enclosures_id'],
            'gcs_uri': 'gs://whatever',
            'duration': 1,
            'codec': 'MP3',
            'sample_rate': 44100,
            'bcp47_language_code': 'en-US',
            'speech_operation_id': 'foo',
        }
        episode = self.db.insert(table='podcast_episodes', insert_hash=episode_row)

        # Register a transcript fetch for the episode, due to be queued right now
        fetch_params = {
            'podcast_episodes_id': episode['podcast_episodes_id'],
        }
        self.db.query(
            """
            INSERT INTO podcast_episode_transcript_fetches (
                podcast_episodes_id,
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW()
            )
        """, fetch_params)
def test_copy_story_to_new_medium():
    """Check the fields, topic membership, download and sentences of a copied story."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    # The identifying fields must be carried over unchanged
    for copied_field in ('title', 'url', 'guid', 'publish_date'):
        assert old_story[copied_field] == new_story[copied_field]

    # The copy must be part of the topic
    topic_story_params = {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }
    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, topic_story_params).hash()
    assert topic_story_exists is not None

    # The copy must have a download with non-empty content
    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': new_story['stories_id']}).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    # The copy must have at least one sentence
    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {'stories_id': new_story['stories_id']}).hashes()
    assert len(story_sentences) > 0
Beispiel #19
0
    def setUp(self):
        """Create an AP medium with one story of AP sentences, used to find dup sentences."""
        super().setUp()

        medium_label = get_ap_medium_name()
        ap_medium = create_test_medium(db=self.db(), label=medium_label)
        ap_feed = create_test_feed(db=self.db(), label='feed', medium=ap_medium)
        ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

        # One AP sentence per line makes up the story content
        ap_sentences = self.__get_ap_sentences()
        ap_story['content'] = "\n".join(ap_sentences)

        add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
Beispiel #20
0
def test_merge_dup_stories():
    """Check that _merge_dup_stories() leaves only the last of the ten stories in the topic."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    # Story number k receives k sentences, so the last story has the most
    num_stories = 10
    stories = []
    for story_number in range(num_stories):
        dup_story = create_test_story(db, "merge {}".format(story_number), feed=feed)
        add_to_topic_stories(db, dup_story, topic)
        stories.append(dup_story)
        for sentence_number in range(story_number):
            sentence_params = {
                'stories_id': dup_story['stories_id'],
                'sentence_number': sentence_number,
            }
            # noinspection SqlInsertValues
            db.query(
                """
                INSERT INTO story_sentences (
                    stories_id,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date
                )
                    SELECT
                        stories_id,
                        %(sentence_number)s AS sentence_number,
                        'foo bar' AS sentence,
                        media_id,
                        publish_date
                    FROM stories
                    WHERE stories_id = %(stories_id)s
            """, sentence_params)

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]
    remaining_params = {
        'topics_id': topic['topics_id'],
        'stories_ids': stories_ids,
    }
    merged_stories = db.query(
        """
        SELECT stories_id
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = ANY(%(stories_ids)s)
    """, remaining_params).flat()

    # Only the last created story is expected to remain in the topic
    assert merged_stories == [stories_ids[-1]]
Beispiel #21
0
    def setUp(self):
        """Create stories with sentences, a topic containing them and a snapshot of them."""
        super().setUp()

        self.db = connect_to_db()

        medium = create_test_medium(db=self.db, label='test')
        feed = create_test_feed(db=self.db, label='feed', medium=medium)

        # Test stories, each with a fixed number of numbered sentences
        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=self.db, label='story-%d' % story_num, feed=feed)
            for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                sentence_row = {
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'publish_date': story['publish_date'],
                    'sentence_number': sentence_number,
                    'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
                }
                self.db.create(table='story_sentences', insert_hash=sentence_row)

        # Test topic
        topic = create_test_topic(db=self.db, label='test')
        self.topics_id = topic['topics_id']

        # Add every existing story to the topic
        topic_params = {'topics_id': self.topics_id}
        self.db.query(
            """
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, topic_params)

        # Test snapshot
        snapshot_ids = self.db.query(
            """
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {'topics_id': self.topics_id}).flat()
        self.snapshots_id = snapshot_ids[0]

        # Copy every existing story into the snapshot
        snapshot_params = {'snapshots_id': self.snapshots_id}
        self.db.query(
            """
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, snapshot_params)
    def setUp(self) -> None:
        """Create a test medium and feed plus a feed download marked as successful."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        self.test_download = create_download_for_feed(self.db(), feed)

        # Mark the download as successfully stored, then persist the change
        self.test_download['path'] = 'postgresql:foo'
        self.test_download['state'] = 'success'
        downloads_id = self.test_download['downloads_id']
        self.db().update_by_id('downloads', downloads_id, self.test_download)
Beispiel #23
0
    def setUp(self):
        """Create an AP medium with one story of AP sentences, used to find dup sentences."""
        super().setUp()

        medium_label = get_ap_medium_name()
        ap_medium = create_test_medium(db=self.db(), label=medium_label)
        ap_feed = create_test_feed(db=self.db(), label='feed', medium=ap_medium)
        ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

        # One AP sentence per line makes up the story content
        ap_sentences = self.__get_ap_sentences()
        ap_story['content'] = "\n".join(ap_sentences)

        add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
    def setUp(self) -> None:
        """Create a test medium, feed, story and a download for that story."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        story = create_test_story(self.db(), label='downloads est', feed=feed)
        self.test_story = story

        self.test_download = create_download_for_story(self.db(), feed=feed, story=story)
    def test_update_extractor_version_tag(self):
        """Check that update_extractor_version_tag() gives a fresh story exactly one extractor tag."""
        medium = create_test_medium(db=self.db(), label='test medium')
        feed = create_test_feed(db=self.db(), label='test feed', medium=medium)
        story = create_test_story(db=self.db(), label='test story', feed=feed)

        # A freshly created story has no extractor tags yet
        tags_before = self.__story_extractor_tags(stories_id=story['stories_id'])
        assert len(tags_before) == 0

        update_extractor_version_tag(db=self.db(), story=story)

        # After the update exactly one tag is expected
        tags_after = self.__story_extractor_tags(stories_id=story['stories_id'])
        assert len(tags_after) == 1
Beispiel #26
0
def __is_syndicated(db: DatabaseHandler, content: str) -> bool:
    """Store `content` as a new test story and return is_syndicated() for it.

    The first 64 characters of `content` are used as the label of the
    medium, feed and story created for the check.
    """
    fixture_label = content[:64]

    test_medium = create_test_medium(db=db, label=fixture_label)
    test_feed = create_test_feed(db=db, label=fixture_label, medium=test_medium)
    test_story = create_test_story(db=db, label=fixture_label, feed=test_feed)

    test_story['content'] = content
    stored_story = add_content_to_test_story(db=db, story=test_story, feed=test_feed)

    story_title = stored_story['title']
    return is_syndicated(db=db, story_title=story_title, story_text=content)
Beispiel #27
0
    def setUp(self) -> None:
        """Create a medium, feed, feed download and story, then attach the download to the story."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        self.test_download = create_download_for_feed(self.db(), feed)
        self.test_story = create_test_story(self.db(), label='downloads est', feed=feed)

        # Mark the download as successfully stored and tie it to the story
        self.test_download['path'] = 'postgresql:foo'
        self.test_download['state'] = 'success'
        self.test_download['stories_id'] = self.test_story['stories_id']
        downloads_id = self.test_download['downloads_id']
        self.db().update_by_id('downloads', downloads_id, self.test_download)
Beispiel #28
0
    def __is_syndicated(self, content: str) -> bool:
        """Store `content` as a new test story and return is_syndicated() for it.

        The first 64 characters of `content` are used as the label of the
        medium, feed and story created for the check.
        """
        fixture_label = content[:64]

        test_medium = create_test_medium(db=self.db(), label=fixture_label)
        test_feed = create_test_feed(db=self.db(), label=fixture_label, medium=test_medium)
        test_story = create_test_story(db=self.db(), label=fixture_label, feed=test_feed)

        test_story['content'] = content
        stored_story = add_content_to_test_story(db=self.db(), story=test_story, feed=test_feed)

        return is_syndicated(
            db=self.db(),
            story_title=stored_story['title'],
            story_text=content,
        )
Beispiel #29
0
    def setUp(self) -> None:
        """Connect to the database, create download fixtures and store the test content."""
        super().setUp()

        self.db = connect_to_db()

        medium = create_test_medium(db=self.db, label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db, label='downloads test', medium=medium)
        self.test_feed = feed

        # One download for the feed itself and one for the story
        self.test_download_feed = create_download_for_feed(self.db, feed)

        story = create_test_story(self.db, label='downloads est', feed=feed)
        self.test_story = story

        story_download = create_download_for_story(self.db, feed=feed, story=story)
        self.test_download = story_download

        store_content(db=self.db, download=story_download, content=self.__TEST_CONTENT)
    def setUp(self) -> None:
        """Create named medium, feed, feed download and story, then attach the download to the story."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label=self.TEST_MEDIUM_NAME)
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label=self.TEST_FEED_NAME, medium=medium)
        self.test_feed = feed

        self.test_download = create_download_for_feed(self.db(), feed)
        self.test_story = create_test_story(self.db(), label=self.TEST_STORY_NAME, feed=feed)

        # Mark the download as successfully stored and tie it to the story
        self.test_download['path'] = 'postgresql:foo'
        self.test_download['state'] = 'success'
        self.test_download['stories_id'] = self.test_story['stories_id']
        downloads_id = self.test_download['downloads_id']
        self.db().update_by_id('downloads', downloads_id, self.test_download)
    def setUp(self) -> None:
        """Create a test medium and feed plus a feed download marked as successful."""
        super().setUp()

        medium = create_test_medium(db=self.db(), label='downloads test')
        self.test_medium = medium

        feed = create_test_feed(db=self.db(), label='downloads test', medium=medium)
        self.test_feed = feed

        self.test_download = create_download_for_feed(self.db(), feed)

        # Mark the download as successfully stored, then persist the change
        self.test_download['path'] = 'postgresql:foo'
        self.test_download['state'] = 'success'
        downloads_id = self.test_download['downloads_id']
        self.db().update_by_id('downloads', downloads_id, self.test_download)
def test_mark_as_processed():
    """Check that mark_as_processed() adds exactly one 'processed_stories' row for the story."""
    db = connect_to_db()

    def _processed_rows():
        # Fetch the full current contents of the processed_stories table.
        return db.query("SELECT * FROM processed_stories").hashes()

    medium = create_test_medium(db=db, label=TEST_MEDIUM_NAME)
    feed = create_test_feed(db=db, label=TEST_FEED_NAME, medium=medium)
    story = create_test_story(db=db, label=TEST_STORY_NAME, feed=feed)
    stories_id = story['stories_id']

    # Nothing has been marked as processed yet.
    rows_before = _processed_rows()
    assert len(rows_before) == 0

    mark_as_processed(db=db, stories_id=stories_id)

    # Exactly one row, pointing at the test story.
    rows_after = _processed_rows()
    assert len(rows_after) == 1
    assert rows_after[0]['stories_id'] == stories_id
Beispiel #33
0
    def setUp(self) -> None:
        """Connect to the database and create medium, feed, story and story download fixtures."""
        super().setUp()

        self.db = db = connect_to_db()

        self.test_medium = medium = create_test_medium(db, self.TEST_MEDIUM_NAME)
        self.test_feed = feed = create_test_feed(db, self.TEST_FEED_NAME, medium)
        self.test_story = story = create_test_story(db, label=self.TEST_STORY_NAME, feed=feed)
        self.test_download = create_download_for_story(db, feed=feed, story=story)
def test_find_and_merge_dup_stories():
    """Test that find_and_merge_dup_stories() merges stories whose titles differ only by case.

    Nine stories are added to a topic in three groups of three; within each
    group the titles are 'TITLE n', 'title n' and 'Title n'. After merging, the
    topic is expected to hold three stories with three distinct normalized
    title hashes.
    """
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    # Upper-, lower- and mixed-case spellings of the same title. The case
    # variation belongs in the title *value*; the column name is always 'title'.
    title_variants = ('TITLE', 'title', 'Title')

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

        # group: which duplicate set the story belongs to; variant: which spelling it gets.
        group, variant = divmod(i, len(title_variants))
        db.update_by_id('stories', story['stories_id'],
                        {'title': title_variants[variant] + ' ' + str(group)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        """
        SELECT COUNT(*)
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    # One surviving story per duplicate group.
    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        """
        SELECT COUNT(DISTINCT normalized_title_hash)
        FROM snap.live_stories
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    assert num_distinct_titles == 3
    def setUp(self) -> None:
        """Create a test story with an enclosure, a podcast episode and a transcript fetch row.

        The inserted rows are kept on the instance as `enclosure`, `episode` and
        `transcript_fetch`; the fetch row's primary key is also stored as
        `podcast_episode_transcript_fetches_id`.
        """
        super().setUp()

        self.db = db = connect_to_db()

        medium = create_test_medium(db=db, label='test')
        feed = create_test_feed(db=db, label='test', medium=medium)
        story = create_test_story(db=db, feed=feed, label='test')
        stories_id = story['stories_id']

        # Enclosure attached to the story.
        enclosure_row = {
            'stories_id': stories_id,
            'url': 'foo',
            'mime_type': 'foo',
            'length': 3,
        }
        self.enclosure = db.insert(table='story_enclosures', insert_hash=enclosure_row)

        # Podcast episode referencing both the story and its enclosure.
        episode_row = {
            'stories_id': stories_id,
            'story_enclosures_id': self.enclosure['story_enclosures_id'],
            'gcs_uri': 'gs://test',
            'duration': 3,
            'codec': 'FLAC',
            'sample_rate': 44100,
            'bcp47_language_code': 'en-US',
            'speech_operation_id': self.MOCK_SPEECH_OPERATION_ID,
        }
        self.episode = db.insert(table='podcast_episodes', insert_hash=episode_row)

        # Transcript fetch queued for "now"; RETURNING * gives back the full row.
        fetch_params = {'podcast_episodes_id': self.episode['podcast_episodes_id']}
        fetch_result = db.query(
            """
            INSERT INTO podcast_episode_transcript_fetches (podcast_episodes_id, add_to_queue_at)
            VALUES (%(podcast_episodes_id)s, NOW())
            RETURNING *
        """, fetch_params)
        self.transcript_fetch = fetch_result.hash()

        self.podcast_episode_transcript_fetches_id = self.transcript_fetch[
            'podcast_episode_transcript_fetches_id']
Beispiel #36
0
    def test_update_extractor_version_tag(self):
        """Check that update_extractor_version_tag() gives an untagged story exactly one extractor tag."""
        db = self.db()

        medium = create_test_medium(db=db, label='test medium')
        feed = create_test_feed(db=db, label='test feed', medium=medium)
        story = create_test_story(db=db, label='test story', feed=feed)
        stories_id = story['stories_id']

        # A freshly created story has no extractor tags.
        tags_before = self.__story_extractor_tags(stories_id=stories_id)
        assert len(tags_before) == 0

        update_extractor_version_tag(db=db, story=story)

        # After the update there is exactly one.
        tags_after = self.__story_extractor_tags(stories_id=stories_id)
        assert len(tags_after) == 1
Beispiel #37
0
    def setUp(self) -> None:
        """Prepare medium, feed, feed download and story fixtures.

        The download row is then modified (path 'postgresql:foo', state
        'success', linked to the test story) and written back to 'downloads'.
        """
        super().setUp()

        db = self.db()

        self.test_medium = create_test_medium(db, self.TEST_MEDIUM_NAME)
        self.test_feed = create_test_feed(db, self.TEST_FEED_NAME, self.test_medium)
        self.test_download = create_download_for_feed(db, self.test_feed)
        self.test_story = create_test_story(db, label=self.TEST_STORY_NAME, feed=self.test_feed)

        # Apply the field overrides one by one, in the same order as listed.
        overrides = (
            ('path', 'postgresql:foo'),
            ('state', 'success'),
            ('stories_id', self.test_story['stories_id']),
        )
        for column, value in overrides:
            self.test_download[column] = value

        db.update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
    def setUp(self) -> None:
        """Load the config, create download fixtures with stored content, and snapshot the config.

        Creates a medium, feed, story and story download; rewrites the download
        row (path 'postgresql:foo', state 'success', linked to the story); stores
        the test content for it; and finally keeps a deep copy of the config in
        `save_config`.
        """
        super().setUp()

        self.config = mediawords.util.config.get_config()

        db = self.db()

        self.test_medium = medium = create_test_medium(db, 'downloads test')
        self.test_feed = feed = create_test_feed(db, 'downloads test', medium)
        # NOTE(review): the story label reads 'downloads est' -- possibly a typo for
        # 'downloads test'; kept unchanged here.
        self.test_story = story = create_test_story(db, label='downloads est', feed=feed)
        self.test_download = download = create_download_for_story(db, feed=feed, story=story)

        download['path'] = 'postgresql:foo'
        download['state'] = 'success'
        download['stories_id'] = story['stories_id']
        db.update_by_id('downloads', download['downloads_id'], download)

        mediawords.dbi.downloads.store_content(db, download, self.__TEST_CONTENT)

        # Deep copy of the config as it is at the end of setup.
        self.save_config = copy.deepcopy(self.config)
    def setUp(self):
        """Populate the database with stories, sentences, a topic and a snapshot.

        Creates TEST_STORY_COUNT stories with TEST_SENTENCE_PER_STORY_COUNT
        sentences each, adds every story to a new test topic, and copies every
        story into a new snapshot. The topic and snapshot IDs are stored as
        `topics_id` and `snapshots_id`.
        """
        super().setUp()

        medium = create_test_medium(db=self.db(), label='test')
        feed = create_test_feed(db=self.db(), label='feed', medium=medium)

        # Test stories, each with sentences numbered from 1
        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
            for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                self.db().create(table='story_sentences', insert_hash={
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'publish_date': story['publish_date'],
                    'sentence_number': sentence_number,
                    'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
                })

        # Test topic
        topic = create_test_topic(db=self.db(), label='test')
        self.topics_id = topic['topics_id']

        # Add every row of the "stories" table to the topic
        self.db().query("""
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, {'topics_id': self.topics_id})

        # Test snapshot
        self.snapshots_id = self.db().query("""
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {'topics_id': self.topics_id}).flat()[0]

        # Copy every row of the "stories" table into the snapshot
        self.db().query("""
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, {'snapshots_id': self.snapshots_id})