コード例 #1
0
    def test_get_story_with_most_sentences(self) -> None:
        """Test _get_story_with_most_sentences().

        Creates five stories in which the story at index i has i sentences, followed by two
        stories without any sentences, and checks which story is returned for several
        candidate lists.
        """
        db = self.db()

        medium = mediawords.test.db.create_test_medium(db, "foo")
        feed = mediawords.test.db.create_test_feed(db=db, label="foo", medium=medium)

        num_filled_stories = 5
        stories = []
        for story_index in range(num_filled_stories):
            filled_story = mediawords.test.db.create_test_story(
                db=db,
                label="foo" + str(story_index),
                feed=feed,
            )
            stories.append(filled_story)

            # sentence numbers run from 1 to story_index, so the first story gets no sentences
            for sentence_number in range(1, story_index + 1):
                sentence_row = {
                    'stories_id': filled_story['stories_id'],
                    'media_id': medium['media_id'],
                    'sentence': 'foo',
                    'sentence_number': sentence_number,
                    'publish_date': filled_story['publish_date'],
                }
                db.create('story_sentences', sentence_row)

        # two extra stories that never receive sentences
        empty_stories = [
            mediawords.test.db.create_test_story(db=db, label="foo empty" + str(empty_index), feed=feed)
            for empty_index in range(2)
        ]
        stories.extend(empty_stories)

        find_best = mediawords.tm.stories._get_story_with_most_sentences

        # among all stories, the last filled story is the one with the most sentences
        best_overall = find_best(db, stories)
        assert best_overall == stories[num_filled_stories - 1]

        # when no candidate has sentences, the first candidate is expected back
        only_candidate = empty_stories[0]
        assert find_best(db, [only_candidate]) == only_candidate
        assert find_best(db, empty_stories) == empty_stories[0]
コード例 #2
0
    def test_find_and_merge_dup_stories(self) -> None:
        """Test find_and_merge_dup_stories().

        Ten stories sharing one title are added to a topic, with the story at index i receiving
        i sentences. After the merge, the last story is expected to be the only one of them
        still listed in topic_stories for that topic.
        """
        db = self.db()

        topic = mediawords.test.db.create.create_test_topic(db, 'merge')
        medium = mediawords.test.db.create.create_test_medium(db, 'merge')
        feed = mediawords.test.db.create.create_test_feed(db, 'merge', medium=medium)

        num_stories = 10
        stories = []
        for story_index in range(num_stories):
            dup_story = mediawords.test.db.create.create_test_story(db, "merge " + str(story_index), feed=feed)

            # give every story the same title before adding it to the topic
            shared_title = {'title': "long dup title foo bar baz"}
            db.update_by_id('stories', dup_story['stories_id'], shared_title)
            mediawords.tm.stories.add_to_topic_stories(db, dup_story, topic)
            stories.append(dup_story)

            # the story at index i gets i sentences, numbered from 0
            for sentence_number in range(story_index):
                sentence_params = {'a': dup_story['stories_id'], 'b': sentence_number}
                db.query(
                    """
                    insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date)
                        select stories_id, %(b)s, 'foo bar', media_id, publish_date
                            from stories where stories_id = %(a)s
                    """,
                    sentence_params)

        mediawords.tm.stories.find_and_merge_dup_stories(db, topic)

        # look up which of the created stories are still attached to the topic
        stories_ids = [dup_story['stories_id'] for dup_story in stories]
        remaining_params = {'a': topic['topics_id'], 'b': stories_ids}
        remaining_ids = db.query(
            "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
            remaining_params).flat()

        last_story_id = stories_ids[-1]
        assert remaining_ids == [last_story_id]
コード例 #3
0
    def test_get_story_with_most_sentences(self) -> None:
        """Test _get_story_with_most_sentences().

        Five stories are created so that the story at index i carries i sentences; two more
        stories are created with no sentences at all. The helper is then called with the full
        list, with a single sentence-less story, and with both sentence-less stories.
        """
        db = self.db()
        test_create = mediawords.test.db.create

        medium = test_create.create_test_medium(db, "foo")
        feed = test_create.create_test_feed(db=db, label="foo", medium=medium)

        num_filled_stories = 5
        stories = []
        for filled_index in range(num_filled_stories):
            label = "foo" + str(filled_index)
            current = test_create.create_test_story(db=db, label=label, feed=feed)
            stories.append(current)

            # range(1, filled_index + 1) is empty for the first story, so it stays without sentences
            for number in range(1, filled_index + 1):
                db.create(
                    'story_sentences',
                    {
                        'stories_id': current['stories_id'],
                        'media_id': medium['media_id'],
                        'sentence': 'foo',
                        'sentence_number': number,
                        'publish_date': current['publish_date'],
                    },
                )

        empty_stories = []
        for empty_index in range(2):
            label = "foo empty" + str(empty_index)
            sentence_less = test_create.create_test_story(db=db, label=label, feed=feed)
            # sentence-less stories are tracked separately and also appended to the full list
            stories.append(sentence_less)
            empty_stories.append(sentence_less)

        tm_stories = mediawords.tm.stories

        # the last filled story holds the most sentences of all candidates
        picked = tm_stories._get_story_with_most_sentences(db, stories)
        assert picked == stories[num_filled_stories - 1]

        # candidates without sentences: the first one in the list is expected
        first_empty = empty_stories[0]
        assert tm_stories._get_story_with_most_sentences(db, [first_empty]) == first_empty
        assert tm_stories._get_story_with_most_sentences(db, empty_stories) == empty_stories[0]
コード例 #4
0
    def test_get_story_match(self) -> None:
        """Test get_story_match().

        Covers three situations: a url that matches no story, matches found through the first
        (url) or second (redirect url) argument both verbatim and with a '#foo' fragment appended,
        and several stories sharing one url where only one of them has a sentence.
        """
        db = self.db()

        medium = mediawords.test.db.create.create_test_medium(db, 'foo')
        num_stories = 10
        stories = []
        for i in range(num_stories):
            story = db.create(
                'stories', {
                    'media_id': medium['media_id'],
                    'url': ('http://stories-%d.com/foo/bar' % i),
                    'guid': ('http://stories-%d.com/foo/bar/guid' % i),
                    'title': ('story %d' % i),
                    'publish_date': '2017-01-01'
                })
            stories.append(story)

        # None
        assert mediawords.tm.stories.get_story_match(db,
                                                     'http://foo.com') is None

        # straight and normalized versions of url and redirect_url
        assert mediawords.tm.stories.get_story_match(
            db, stories[0]['url']) == stories[0]
        assert mediawords.tm.stories.get_story_match(
            db, 'http://foo.com', stories[1]['url']) == stories[1]
        assert mediawords.tm.stories.get_story_match(
            db, stories[2]['url'] + '#foo') == stories[2]
        assert mediawords.tm.stories.get_story_match(
            db, 'http://foo.com', stories[3]['url'] + '#foo') == stories[3]

        # get_preferred_story - return only story with sentences
        story_with_sentences_id = stories[4]['stories_id']
        db.query(
            """
            insert into story_sentences ( stories_id, media_id, publish_date, sentence, sentence_number )
                select stories_id, media_id, publish_date, 'foo', 1 from stories where stories_id = %(a)s
            """, {'a': story_with_sentences_id})

        # point every story at the same url so that the match has to choose between them
        updated_stories = db.query(
            "update stories set url = 'http://stories.com/' returning *"
        ).hashes()

        # UPDATE ... RETURNING makes no promise about row order, so pick the expected story by
        # its id instead of relying on its position in the returned list
        updated_by_id = {s['stories_id']: s for s in updated_stories}
        expected_story = updated_by_id[story_with_sentences_id]

        assert mediawords.tm.stories.get_story_match(
            db, 'http://stories.com/') == expected_story
コード例 #5
0
ファイル: test_stories.py プロジェクト: GMHA/mediacloud
    def test_find_and_merge_dup_stories(self) -> None:
        """Test find_and_merge_dup_stories().

        Creates ten stories with one shared title inside a single topic, where the story at
        index i is given i sentences, runs the merge, and checks that of those stories only the
        last one is still present in topic_stories for the topic.
        """
        db = self.db()
        test_create = mediawords.test.db.create

        topic = test_create.create_test_topic(db, 'merge')
        medium = test_create.create_test_medium(db, 'merge')
        feed = test_create.create_test_feed(db, 'merge', medium=medium)

        num_stories = 10
        stories = []
        for position in range(num_stories):
            label = "merge " + str(position)
            created = test_create.create_test_story(db, label, feed=feed)

            # overwrite the generated title so that all ten stories carry the same one
            db.update_by_id('stories', created['stories_id'], {'title': "long dup title foo bar baz"})
            mediawords.tm.stories.add_to_topic_stories(db, created, topic)
            stories.append(created)

            # later stories receive more sentences: index i gets sentence numbers 0..i-1
            for number in range(position):
                db.query(
                    """
                    insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date)
                        select stories_id, %(b)s, 'foo bar', media_id, publish_date
                            from stories where stories_id = %(a)s
                    """, {'a': created['stories_id'], 'b': number})

        mediawords.tm.stories.find_and_merge_dup_stories(db, topic)

        stories_ids = [created['stories_id'] for created in stories]

        # ids of the created stories that remain in the topic after merging
        survivors_query = db.query(
            "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
            {'a': topic['topics_id'], 'b': stories_ids})
        merged_stories = survivors_query.flat()

        expected_survivors = [stories_ids[-1]]
        assert merged_stories == expected_survivors
コード例 #6
0
    def test_get_story_match(self) -> None:
        """Test get_story_match().

        Covers three situations: a url that matches no story, matches found through the first
        (url) or second (redirect url) argument both verbatim and with a '#foo' fragment appended,
        and several stories sharing one url where only one of them has a sentence.
        """
        db = self.db()

        medium = mediawords.test.db.create.create_test_medium(db, 'foo')
        num_stories = 10
        stories = []
        for i in range(num_stories):
            story = db.create('stories', {
                'media_id': medium['media_id'],
                'url': ('http://stories-%d.com/foo/bar' % i),
                'guid': ('http://stories-%d.com/foo/bar/guid' % i),
                'title': ('story %d' % i),
                'publish_date': '2017-01-01'
            })
            stories.append(story)

        # None
        assert mediawords.tm.stories.get_story_match(db, 'http://foo.com') is None

        # straight and normalized versions of url and redirect_url
        assert mediawords.tm.stories.get_story_match(db, stories[0]['url']) == stories[0]
        assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[1]['url']) == stories[1]
        assert mediawords.tm.stories.get_story_match(db, stories[2]['url'] + '#foo') == stories[2]
        assert mediawords.tm.stories.get_story_match(db, 'http://foo.com', stories[3]['url'] + '#foo') == stories[3]

        # get_preferred_story - return only story with sentences
        story_with_sentences_id = stories[4]['stories_id']
        db.query(
            """
            insert into story_sentences ( stories_id, media_id, publish_date, sentence, sentence_number )
                select stories_id, media_id, publish_date, 'foo', 1 from stories where stories_id = %(a)s
            """,
            {'a': story_with_sentences_id})

        # point every story at the same url so that the match has to choose between them
        updated_stories = db.query("update stories set url = 'http://stories.com/' returning *").hashes()

        # UPDATE ... RETURNING makes no promise about row order, so pick the expected story by
        # its id instead of relying on its position in the returned list
        updated_by_id = {s['stories_id']: s for s in updated_stories}
        expected_story = updated_by_id[story_with_sentences_id]

        assert mediawords.tm.stories.get_story_match(db, 'http://stories.com/') == expected_story