def test_get_dup_story_groups():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
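
# A minimal, self-contained sketch of the grouping the test above exercises,
# assuming _get_dup_story_groups() buckets a topic's stories by
# case-insensitive title and keeps only buckets with more than one story.
# The real implementation likely matches on normalized_title_hash rather
# than the raw title; the helper name below is illustrative.
def _sketch_group_stories_by_title(stories):
    from collections import defaultdict

    buckets = defaultdict(list)
    for story in stories:
        buckets[story['title'].lower()].append(story)
    return [group for group in buckets.values() if len(group) > 1]
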
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
def validate_remote_integration(db: DatabaseHandler, source: str, query: str,
                                day: str) -> None:
    """Run sanity test on remote APIs."""

    topic = create_test_topic(db, "test_remote_integration")

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    }
    db.create('topic_seed_queries', tsq)

    topic['platform'] = 'twitter'
    topic['pattern'] = '.*'
    topic['start_date'] = day
    topic['end_date'] = day
    topic['mode'] = 'url_sharing'
    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    got_tts = db.query("select * from topic_posts").hashes()

    # for old Crimson Hexagon monitors, many of the posts may have been deleted
    assert len(got_tts) > 20

    assert len(got_tts[0]['content']) > MIN_TEST_POST_LENGTH
    assert len(got_tts[0]['author']) > MIN_TEST_AUTHOR_LENGTH
def test_get_failed_url():
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')
    topics_id = topic['topics_id']

    tfus = [['http://story.added', FETCH_STATE_STORY_ADDED],
            ['http://story.matched', FETCH_STATE_STORY_MATCH],
            ['http://request.failed', FETCH_STATE_REQUEST_FAILED],
            ['http://content.match.failed', FETCH_STATE_CONTENT_MATCH_FAILED]]

    for tfu in tfus:
        db.create('topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': tfu[0],
            'state': tfu[1]
        })

    request_failed_tfu = _get_failed_url(db, topics_id,
                                         'http://request.failed')
    assert request_failed_tfu is not None
    assert request_failed_tfu['url'] == 'http://request.failed'

    content_failed_tfu = _get_failed_url(db, topics_id,
                                         'http://content.match.failed')
    assert content_failed_tfu is not None
    assert content_failed_tfu['url'] == 'http://content.match.failed'

    assert _get_failed_url(db, topics_id, 'http://story.added') is None
    assert _get_failed_url(db, topics_id, 'http://bogus.url') is None
    assert _get_failed_url(db, 0, 'http://request.failed') is None
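
# A hedged, in-memory sketch of the lookup asserted above: _get_failed_url()
# presumably returns a topic_fetch_urls row only when its topic and URL match
# and its state is one of the failed states. failed_states would be passed
# the FETCH_STATE_REQUEST_FAILED / FETCH_STATE_CONTENT_MATCH_FAILED constants
# imported by this test module; the helper name below is illustrative.
def _sketch_get_failed_url(tfus, topics_id, url, failed_states):
    for tfu in tfus:
        if (tfu['topics_id'] == topics_id
                and tfu['url'] == url
                and tfu['state'] in failed_states):
            return tfu
    return None
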
def test_fetch_topic_posts() -> None:
    """Run fetch_topic_post tests."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')

    topic['pattern'] = '.*'
    topic['platform'] = 'generic_post'
    topic['mode'] = 'url_sharing'
    topic['start_date'] = datetime.datetime.strptime(MOCK_START_DATE,
                                                     '%Y-%m-%d')
    topic['end_date'] = topic['start_date'] + datetime.timedelta(
        days=MOCK_DAYS - 1)

    db.update_by_id('topics', topic['topics_id'], topic)

    mock_posts = _get_mock_posts()

    mock_posts_csv = CSVStaticPostFetcher()._get_csv_string_from_dicts(
        mock_posts)

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'generic_post',
        'source': 'csv',
        'ignore_pattern': 'ignore',
        'query': mock_posts_csv
    }
    tsq = db.create('topic_seed_queries', tsq)

    db.update_by_id('topics', topic['topics_id'], {'platform': 'generic_post'})

    fetch_topic_posts(db, tsq)

    topic_post_days = db.query("SELECT * FROM topic_post_days").hashes()
    assert len(topic_post_days) == MOCK_DAYS

    start_date = topic['start_date']
    test_days = [
        start_date + datetime.timedelta(days=x) for x in range(0, MOCK_DAYS)
    ]
    for d in test_days:
        topic_post_day = db.query(
            """
            SELECT *
            FROM topic_post_days
            WHERE
                topics_id = %(topics_id)s AND
                topic_seed_queries_id = %(topic_seed_queries_id)s AND
                day = %(day)s
            """, {
                'topics_id': tsq['topics_id'],
                'topic_seed_queries_id': tsq['topic_seed_queries_id'],
                'day': d,
            }).hash()
        assert topic_post_day is not None

    _validate_topic_posts(db, topic, mock_posts)

    _validate_topic_post_urls(db, mock_posts)
def _add_timespans_to_stories(db: DatabaseHandler,
                              stories: List[Dict[str, Any]]) -> None:
    """Add timespans to stories for solr indexing."""
    stories = decode_object_from_bytes_if_needed(stories)

    topic = create_test_topic(db=db, label="solr dump test")

    snapshot = db.create(table='snapshots',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'snapshot_date': '2018-01-01',
                             'start_date': '2018-01-01',
                             'end_date': '2018-01-01',
                         })

    timespans = []
    for i in range(1, 5 + 1):
        timespan = db.create(table='timespans',
                             insert_hash={
                                 'topics_id': topic['topics_id'],
                                 'snapshots_id': snapshot['snapshots_id'],
                                 'start_date': '2018-01-01',
                                 'end_date': '2018-01-01',
                                 'story_count': 1,
                                 'story_link_count': 1,
                                 'medium_count': 1,
                                 'medium_link_count': 1,
                                 'post_count': 1,
                                 'period': 'overall',
                             })
        timespans.append(timespan)

    for story in stories:
        assert isinstance(story, dict)

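        # rotate the list so each story gets the next timespan, distributing
        # stories round-robin across the five timespans created above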
        timespan = timespans.pop()
        timespans.insert(0, timespan)

        db.query(
            """
            INSERT INTO snap.story_link_counts (
                topics_id,
                timespans_id,
                stories_id,
                media_inlink_count,
                inlink_count,
                outlink_count
            ) VALUES (
                %(topics_id)s,
                %(timespans_id)s,
                %(stories_id)s,
                1,
                1,
                1
            )
        """, {
                'topics_id': timespan['topics_id'],
                'timespans_id': timespan['timespans_id'],
                'stories_id': story['stories_id'],
            })
def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    db.query("update downloads set state = 'error' where stories_id = %(a)s", {'a': old_story['stories_id']})

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    new_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': new_story['stories_id']}).hash()
    assert new_download is not None
    assert new_download['state'] == 'error'
    def test_skip_self_links(self):
        """Test that self links are skipped within extract_links_for_topic_story"""

        story_domain = get_url_distinctive_domain(self.test_story['url'])

        topic = create_test_topic(self.db, 'links')
        self.db.create(
            'topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id']
            })

        num_links = MAX_SELF_LINKS * 2
        content = ''
        for i in range(num_links):
            plain_text = "Sample sentence to make sure the links get extracted" * 10
            url = "http://%s/%d" % (story_domain, i)
            paragraph = "<p>%s <a href='%s'>link</a></p>\n\n" % (plain_text,
                                                                 url)
            content = content + paragraph

        store_content(self.db, self.test_download, content)

        extract_links_for_topic_story(db=self.db,
                                      stories_id=self.test_story['stories_id'],
                                      topics_id=topic['topics_id'])

        topic_links = self.db.query(
            "select * from topic_links where topics_id = %(a)s", {
                'a': topic['topics_id']
            }).hashes()

        assert len(topic_links) == MAX_SELF_LINKS
def test_merge_dup_media_story():
    """Test merge_dup_media_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    old_story = create_test_story(db=db, label='merge old', feed=feed)

    new_medium = create_test_medium(db, 'merge new')

    db.update_by_id('media', medium['media_id'],
                    {'dup_media_id': new_medium['media_id']})

    cloned_story = merge_dup_media_story(db, topic, old_story)

    for field in 'url guid publish_date title'.split():
        assert cloned_story[field] == old_story[field]

    topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {
            'a': cloned_story['stories_id'],
            'b': topic['topics_id']
        }).hash()
    assert topic_story is not None

    merged_story = merge_dup_media_story(db, topic, old_story)
    assert merged_story['stories_id'] == cloned_story['stories_id']
def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id()."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)

    topic = create_test_topic(db, 'foo')

    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    assert topic_link_b['ref_stories_id'] is None

    # now generate a non-unique error and make sure an exception is raised
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}

    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
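
# A sketch of the "try, but swallow only duplicate-key failures" pattern the
# test above exercises, assuming try_update_topic_link_ref_stories_id() sets
# topic_links.ref_stories_id and silently ignores the unique-constraint
# violation from the second link, while letting other errors, such as the
# McUpdateByIDException for bogus input, propagate. Filtering exceptions by
# message text is an assumption for illustration only.
def _sketch_try_update_ref_stories_id(db, topic_fetch_url):
    try:
        db.update_by_id(
            'topic_links',
            topic_fetch_url['topic_links_id'],
            {'ref_stories_id': topic_fetch_url['stories_id']})
    except Exception as ex:
        # only swallow unique-constraint violations; re-raise everything else
        if 'unique' not in str(ex).lower():
            raise
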
def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories SET
            normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0
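
# An illustrative sketch of computing a normalized title hash, assuming the
# stories_add_normalized_title trigger stores an MD5 digest of a lower-cased,
# whitespace-collapsed title. The exact normalization rules (e.g. stripping
# bylines or punctuation) are an assumption here.
import hashlib
import re


def _sketch_normalized_title_hash(title):
    normalized = re.sub(r'\s+', ' ', title.strip().lower())
    return hashlib.md5(normalized.encode('utf-8')).hexdigest()
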
def test_add_tweet_story():
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_story['stories_id']})

    topic_link = {'topics_id': topics_id, 'url': 'u', 'stories_id': source_story['stories_id']}
    topic_link = db.create('topic_links', topic_link)

    tfu = {'topics_id': topics_id, 'url': 'u', 'state': 'pending', 'topic_links_id': topic_link['topic_links_id']}
    tfu = db.create('topic_fetch_urls', tfu)

    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': 'http://direct.entity'}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': 'http://retweeted.entity'}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': 'http://quoted.entity'}]}}
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == "%s: %s" % (tweet['user']['screen_name'], tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (tweet['user']['screen_name'], tweet['id'])
    assert got_story['guid'] == story['url']

    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # noinspection PyTypeChecker
    for url in [tweet['entities']['urls'][0]['expanded_url'],
                tweet['retweeted_status']['entities']['urls'][0]['expanded_url'],
                tweet['quoted_status']['entities']['urls'][0]['expanded_url']]:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': url}).hash()
        assert got_topic_link is not None
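
# A sketch of the field mapping asserted above, assuming _add_tweet_story()
# derives the story's title, URL and publish date straight from the tweet
# JSON; the strptime format matches Twitter's classic created_at timestamps.
import datetime


def _sketch_story_fields_from_tweet(tweet):
    publish_date = datetime.datetime.strptime(
        tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
    return {
        'title': "%s: %s" % (tweet['user']['screen_name'], tweet['text']),
        'url': 'https://twitter.com/%s/status/%s' % (
            tweet['user']['screen_name'], tweet['id']),
        'publish_date': publish_date.strftime('%Y-%m-%d %H:%M:%S'),
    }
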
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')

    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    assert db.find_by_id('stories', new_story['stories_id']) is not None

    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
    def setUp(self):
        super().setUp()

        self.db = connect_to_db()

        medium = create_test_medium(db=self.db, label='test')
        feed = create_test_feed(db=self.db, label='feed', medium=medium)

        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=self.db,
                                      label='story-%d' % story_num,
                                      feed=feed)
            for sentence_number in range(
                    1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                self.db.create(table='story_sentences',
                               insert_hash={
                                   'stories_id': story['stories_id'],
                                   'media_id': medium['media_id'],
                                   'publish_date': story['publish_date'],
                                   'sentence_number': sentence_number,
                                   'sentence': 'story {}, sentence {}'.format(
                                       story['stories_id'], sentence_number),
                               })

        # Test topic
        topic = create_test_topic(db=self.db, label='test')
        self.topics_id = topic['topics_id']

        self.db.query(
            """
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, {'topics_id': self.topics_id})

        # Test snapshot
        self.snapshots_id = self.db.query(
            """
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {
                'topics_id': self.topics_id
            }).flat()[0]

        self.db.query(
            """
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, {'snapshots_id': self.snapshots_id})
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                INSERT INTO story_sentences (
                    stories_id,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date
                )
                    SELECT
                        stories_id,
                        %(sentence_number)s AS sentence_number,
                        'foo bar' AS sentence,
                        media_id,
                        publish_date
                    FROM stories
                    WHERE stories_id = %(stories_id)s
            """, {
                    'stories_id': story['stories_id'],
                    'sentence_number': j,
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]
    merged_stories = db.query(
        """
        SELECT stories_id
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = ANY(%(stories_ids)s)
    """, {
            'topics_id': topic['topics_id'],
            'stories_ids': stories_ids,
        }).flat()

    assert merged_stories == [stories_ids[-1]]
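
# A sketch of the keep-story selection implied by the assertion above: each
# story i was given i sentences, and only the last (sentence-richest) story
# survives the merge. Assuming _merge_dup_stories() keeps the story with the
# most sentences, the choice could look like this; the real heuristic may
# weigh additional signals.
def _sketch_pick_keep_story(db, stories):
    def _sentence_count(story):
        return db.query(
            "select count(*) from story_sentences where stories_id = %(a)s",
            {'a': story['stories_id']}).flat()[0]

    return max(stories, key=_sentence_count)
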
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        """
        SELECT COUNT(*)
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        """
        SELECT COUNT(DISTINCT normalized_title_hash)
        FROM snap.live_stories
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).flat()[0]

    assert num_distinct_titles == 3
def test_try_fetch_tweets_chunk_multiple():
    def _try_fetch_tweets_chunk_threaded(topic_: dict, tfus_: list) -> None:
        """Call ftu._try_fetch_tweets_chunk with a newly created db handle for thread safety."""
        db_ = connect_to_db()

        with requests_mock.Mocker() as m:
            m.get("https://api.twitter.com/1.1/statuses/lookup.json",
                  text=mock_statuses_lookup)
            _try_fetch_tweets_chunk(db_, topic_, tfus_)

    num_threads = 20

    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    topics_id = topic['topics_id']

    num_urls_per_thread = 100

    threads = []
    for j in range(num_threads):
        tfus = []
        for i in range(num_urls_per_thread):
            url = 'https://twitter.com/foo/status/%d' % i
            tfu = db.create('topic_fetch_urls', {
                'topics_id': topics_id,
                'url': url,
                'state': 'pending'
            })
            tfus.append(tfu)

        random.shuffle(tfus)

        t = threading.Thread(target=_try_fetch_tweets_chunk_threaded,
                             args=(topic, tfus))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    [num_topic_stories] = db.query(
        "select count(*) from topic_stories where topics_id = %(a)s",
        {'a': topics_id}).flat()
    assert num_urls_per_thread == num_topic_stories
def test_try_fetch_users_chunk_multiple():
    """Test fetch_100_users using mock. Run in parallel to test for race conditions."""
    def _try_fetch_users_chunk_parallel(topic_: dict, tfus_: list) -> None:
        db_ = connect_to_db()
        with requests_mock.Mocker() as m:
            m.post("https://api.twitter.com/1.1/users/lookup.json",
                   text=mock_users_lookup)
            _try_fetch_users_chunk(db_, topic_, tfus_)

    num_jobs = 20

    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    topics_id = topic['topics_id']

    num_urls_per_job = 100

    jobs = []
    for j in range(num_jobs):
        tfus = []
        for i in range(num_urls_per_job):
            url = f'https://twitter.com/test_user_{i}'
            tfu = db.create('topic_fetch_urls', {
                'topics_id': topics_id,
                'url': url,
                'state': 'pending'
            })
            tfus.append(tfu)

        random.shuffle(tfus)

        job = multiprocessing.Process(target=_try_fetch_users_chunk_parallel,
                                      args=(topic, tfus))
        job.start()
        jobs.append(job)

    for job in jobs:
        job.join()

    [num_topic_stories] = db.query(
        "SELECT COUNT(*) FROM topic_stories WHERE topics_id = %(topics_id)s", {
            'topics_id': topics_id
        }).flat()
    assert num_urls_per_job == num_topic_stories
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    db.update_by_id('media', old_medium['media_id'],
                    {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        """
        WITH found_topic_stories AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        )

        SELECT *
        FROM stories
        WHERE stories_id IN (
            SELECT stories_id
            FROM found_topic_stories
        )
        """, {
            'topics_id': topic['topics_id']
        }).hashes()

    assert len(got_stories) == num_stories

    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
def test_call_function_on_url_chunks():
    """Test _call_function_on_url_chunks()."""
    _chunk_collector = []

    # noinspection PyUnusedLocal
    def _test_function(db_, topic_, urls_):
        _chunk_collector.append(urls_)

    # noinspection PyUnusedLocal
    def _error_function(db_, topic_, urls_):
        raise Exception('chunk exception')

    db = connect_to_db()
    topic = create_test_topic(db, 'test')

    urls = list(range(URLS_CHUNK_SIZE * 2))

    _call_function_on_url_chunks(db, topic, urls, _test_function)

    assert _chunk_collector == [
        urls[0:URLS_CHUNK_SIZE], urls[URLS_CHUNK_SIZE:]
    ]

    for i in range(URLS_CHUNK_SIZE * 2):
        db.create('topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': 'foo',
            'state': 'pending'
        })

    topic_fetch_urls = db.query("select * from topic_fetch_urls").hashes()

    _call_function_on_url_chunks(db, topic, topic_fetch_urls, _error_function)

    [error_count] = db.query(
        "select count(*) from topic_fetch_urls where state = %(a)s",
        {'a': FETCH_STATE_PYTHON_ERROR}).flat()

    assert error_count == URLS_CHUNK_SIZE * 2
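
# A self-contained sketch of _call_function_on_url_chunks() as exercised
# above: split the list into URLS_CHUNK_SIZE pieces, call the function once
# per chunk, and on an exception mark every topic_fetch_urls row in the
# failed chunk as a python error instead of aborting the whole run. The
# 'message' column and the exact bookkeeping are assumptions; URLS_CHUNK_SIZE
# and FETCH_STATE_PYTHON_ERROR are the module constants used in the test.
def _sketch_call_function_on_url_chunks(db, topic, urls, func):
    for i in range(0, len(urls), URLS_CHUNK_SIZE):
        chunk = urls[i:i + URLS_CHUNK_SIZE]
        try:
            func(db, topic, chunk)
        except Exception as ex:
            for url in chunk:
                db.update_by_id(
                    'topic_fetch_urls',
                    url['topic_fetch_urls_id'],
                    {'state': FETCH_STATE_PYTHON_ERROR, 'message': str(ex)})
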
def test_find_and_merge_dup_stories():
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})
        else:
            db.update_by_id('stories', story['stories_id'],
                            {'title': 'title ' + str(divi)})

    find_and_merge_dup_stories(db, topic)

    num_topic_stories = db.query(
        "select count(*) from topic_stories where topics_id = %(a)s", {
            'a': topic['topics_id']
        }).flat()[0]

    assert num_topic_stories == 3

    num_distinct_titles = db.query(
        "select count(distinct normalized_title_hash) from snap.live_stories where topics_id = %(a)s",
        {
            'a': topic['topics_id']
        }).flat()[0]

    assert num_distinct_titles == 3
def test_get_seeded_content():
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')
    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': 'http://0.0.0.1/foo',
            'assume_match': True,
            'state': FETCH_STATE_PENDING
        })

    assert _get_seeded_content(db, tfu) is None

    tsu_content = '<title>seeded content</title>'
    db.create('topic_seed_urls', {
        'topics_id': topic['topics_id'],
        'url': tfu['url'],
        'content': tsu_content
    })

    response = _get_seeded_content(db, tfu)

    assert response.content == tsu_content
    assert response.code == 200
    assert response.last_requested_url == tfu['url']

    fetch_topic_url(db, tfu['topic_fetch_urls_id'], domain_timeout=0)

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])

    assert tfu['state'] == FETCH_STATE_STORY_ADDED
    assert tfu['code'] == 200
    assert tfu['stories_id'] is not None

    story = db.require_by_id('stories', tfu['stories_id'])

    assert story['title'] == 'seeded content'
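
# A sketch of the seeded-content shortcut tested above, assuming
# _get_seeded_content() returns None unless a topic_seed_urls row with
# non-null content matches the fetch URL, in which case it fabricates a
# 200 response. _SketchResponse stands in for whatever response class the
# fetcher really returns.
from dataclasses import dataclass


@dataclass
class _SketchResponse:
    content: str
    code: int
    last_requested_url: str


def _sketch_get_seeded_content(db, topic_fetch_url):
    seed = db.query(
        """
        select content
            from topic_seed_urls
            where topics_id = %(a)s and url = %(b)s and content is not null
        """, {
            'a': topic_fetch_url['topics_id'],
            'b': topic_fetch_url['url'],
        }).hash()
    if seed is None:
        return None
    return _SketchResponse(
        content=seed['content'],
        code=200,
        last_requested_url=topic_fetch_url['url'])
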
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date)
                    select stories_id, %(b)s, 'foo bar', media_id, publish_date
                        from stories where stories_id = %(a)s
                """, {
                    'a': story['stories_id'],
                    'b': j
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]
    merged_stories = db.query(
        "select stories_id from topic_stories where topics_id = %(a)s and stories_id = any(%(b)s)",
        {
            'a': topic['topics_id'],
            'b': stories_ids
        }).flat()

    assert merged_stories == [stories_ids[-1]]
    def setUp(self):
        super().setUp()

        medium = create_test_medium(db=self.db(), label='test')
        feed = create_test_feed(db=self.db(), label='feed', medium=medium)

        for story_num in range(self.TEST_STORY_COUNT):
            story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
            for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
                self.db().create(table='story_sentences', insert_hash={
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'publish_date': story['publish_date'],
                    'sentence_number': sentence_number,
                    'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
                })

        # Test topic
        topic = create_test_topic(db=self.db(), label='test')
        self.topics_id = topic['topics_id']

        self.db().query("""
            INSERT INTO topic_stories (topics_id, stories_id)
            SELECT %(topics_id)s, stories_id FROM stories
        """, {'topics_id': self.topics_id})

        # Test snapshot
        self.snapshots_id = self.db().query("""
            INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
            VALUES (%(topics_id)s, NOW(), NOW(), NOW())
            RETURNING snapshots_id
        """, {'topics_id': self.topics_id}).flat()[0]

        self.db().query("""
            INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
            SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date FROM stories
        """, {'snapshots_id': self.snapshots_id})
def test_add_missing_normalized_title_hashes():
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "alter table stories disable trigger stories_add_normalized_title")
    # noinspection SqlWithoutWhere
    db.query("update stories set normalized_title_hash = null")
    db.query("alter table stories enable trigger stories_add_normalized_title")

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0
def test_merge_foreign_rss_stories():
    """Test merge_foreign_rss_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'foo')

    medium = create_test_medium(db, 'norss')
    feed = create_test_feed(db=db, label='norss', medium=medium)
    num_stories = 10
    stories = [
        create_test_story(db=db, label=str(i), feed=feed)
        for i in range(num_stories)
    ]

    rss_medium = create_test_medium(db, 'rss')
    rss_medium = db.query(
        """
        UPDATE media SET
            foreign_rss_links = 't'
        WHERE media_id = %(media_id)s
        RETURNING *
    """, {
            'media_id': rss_medium['media_id'],
        }).hash()
    rss_feed = create_test_feed(db=db, label='rss', medium=rss_medium)
    num_rss_stories = 10
    rss_stories = []
    for i in range(num_rss_stories):
        story = create_test_story(db=db, label=str(i), feed=rss_feed)
        download = db.create(
            'downloads', {
                'stories_id': story['stories_id'],
                'feeds_id': rss_feed['feeds_id'],
                'url': story['url'],
                'host': 'foo',
                'type': 'content',
                'state': 'success',
                'priority': 0,
                'sequence': 0,
                'path': 'postgresql'
            })
        store_content(db, download, story['title'])
        rss_stories.append(story)

    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO topic_stories (
            stories_id,
            topics_id
        )
            SELECT
                stories_id,
                %(topics_id)s AS topics_id
            FROM stories
    """, {
            'topics_id': int(topic['topics_id']),
        })

    assert db.query("SELECT COUNT(*) FROM topic_stories").flat(
    )[0] == num_stories + num_rss_stories

    merge_foreign_rss_stories(db, topic)

    assert db.query(
        "SELECT COUNT(*) FROM topic_stories").flat()[0] == num_stories
    assert db.query(
        "SELECT COUNT(*) FROM topic_seed_urls").flat()[0] == num_rss_stories

    got_topic_stories_ids = db.query(
        "SELECT stories_id FROM topic_stories").flat()
    expected_topic_stories_ids = [s['stories_id'] for s in stories]
    assert sorted(got_topic_stories_ids) == sorted(expected_topic_stories_ids)

    got_seed_urls = db.query(
        """
        SELECT
            topics_id,
            url,
            content
        FROM topic_seed_urls
        WHERE topics_id = %(topics_id)s
    """, {
            'topics_id': topic['topics_id'],
        }).hashes()
    expected_seed_urls = [
        {'url': s['url'], 'topics_id': topic['topics_id'], 'content': s['title']}
        for s in rss_stories
    ]

    assert sorted(got_seed_urls, key=itemgetter('url')) == \
        sorted(expected_seed_urls, key=itemgetter('url'))
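
# A sketch of the behavior asserted above, assuming merge_foreign_rss_stories()
# moves each topic story whose medium has foreign_rss_links set out of
# topic_stories and into topic_seed_urls, carrying the story's content along
# so it can be re-fetched from its canonical RSS source. Fetching the content
# and the exact seed-url columns are simplified here.
def _sketch_merge_foreign_rss_story(db, topic, story, content):
    db.create('topic_seed_urls', {
        'topics_id': topic['topics_id'],
        'url': story['url'],
        'content': content,
    })
    db.query(
        "delete from topic_stories where topics_id = %(a)s and stories_id = %(b)s",
        {'a': topic['topics_id'], 'b': story['stories_id']})
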
    def test_extract_links_for_topic_story(self) -> None:
        """Test extract_links_for_topic_story()."""

        self.test_story['description'] = 'http://foo.com http://bar.com'
        self.db.update_by_id('stories', self.test_story['stories_id'],
                             self.test_story)

        topic = create_test_topic(self.db, 'links')
        self.db.create(
            'topic_stories', {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id']
            })

        extract_links_for_topic_story(
            db=self.db,
            stories_id=self.test_story['stories_id'],
            topics_id=topic['topics_id'],
        )

        got_topic_links = self.db.query(
            """
            SELECT
                topics_id,
                stories_id,
                url
            FROM topic_links
            WHERE topics_id = %(topics_id)s
            ORDER BY url
        """, {
                'topics_id': topic['topics_id'],
            }).hashes()

        expected_topic_links = [{
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
            'url': 'http://bar.com'
        }, {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
            'url': 'http://foo.com'
        }]

        assert got_topic_links == expected_topic_links

        got_topic_story = self.db.query(
            """
            SELECT
                topics_id,
                stories_id,
                link_mined
            FROM topic_stories
            WHERE
                topics_id = %(topics_id)s AND
                stories_id = %(stories_id)s
        """, {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id'],
            }).hash()

        expected_topic_story = {
            'topics_id': topic['topics_id'],
            'stories_id': self.test_story['stories_id'],
            'link_mined': True,
        }

        assert got_topic_story == expected_topic_story

        # generate an error and make sure that it gets saved to topic_stories
        del self.test_story['url']
        extract_links_for_topic_story(
            db=self.db,
            stories_id=self.test_story['stories_id'],
            topics_id=topic['topics_id'],
            test_throw_exception=True,
        )

        got_topic_story = self.db.query(
            """
            SELECT
                topics_id,
                stories_id,
                link_mined,
                link_mine_error
            FROM topic_stories
            WHERE
                topics_id = %(topics_id)s AND
                stories_id = %(stories_id)s
        """, {
                'topics_id': topic['topics_id'],
                'stories_id': self.test_story['stories_id'],
            }).hash()

        assert "McExtractLinksForTopicStoryTestException" in got_topic_story[
            'link_mine_error']
        assert got_topic_story['link_mined']
    def test_get_topic_url_variants(self):
        media = create_test_story_stack(db=self.db(),
                                        data={
                                            'A': {
                                                'B': [1, 2, 3],
                                                'C': [4, 5, 6],
                                            },
                                            'D': {
                                                'E': [7, 8, 9],
                                            }
                                        })

        story_1 = media['A']['feeds']['B']['stories']['1']
        story_2 = media['A']['feeds']['B']['stories']['2']
        story_3 = media['A']['feeds']['B']['stories']['3']
        story_4 = media['A']['feeds']['C']['stories']['4']

        self.db().query(
            """
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
                'source_stories_id': story_2['stories_id'],
                'target_stories_id': story_1['stories_id'],
            })

        self.db().query(
            """
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
                'source_stories_id': story_3['stories_id'],
                'target_stories_id': story_2['stories_id'],
            })

        self.db().create(
            table='tag_sets',
            insert_hash={'name': 'foo'},
        )

        topic = create_test_topic(db=self.db(), label='foo')

        self.db().create(table='topic_stories',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_4['stories_id'],
                         })

        self.db().create(table='topic_stories',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_1['stories_id'],
                         })

        self.db().create(table='topic_links',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_4['stories_id'],
                             'ref_stories_id': story_1['stories_id'],
                             'url': story_1['url'],
                             'redirect_url': story_1['url'] + "/redirect_url",
                         })

        self.db().create(table='topic_stories',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_2['stories_id'],
                         })

        self.db().create(table='topic_links',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_4['stories_id'],
                             'ref_stories_id': story_2['stories_id'],
                             'url': story_2['url'],
                             'redirect_url': story_2['url'] + "/redirect_url",
                         })

        self.db().create(table='topic_stories',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_3['stories_id']
                         })

        self.db().create(table='topic_links',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'stories_id': story_4['stories_id'],
                             'ref_stories_id': story_3['stories_id'],
                             'url': story_3['url'] + '/alternate',
                         })

        test_url = story_1['url'] + self.CRUFT

        expected_urls = {
            story_1['url'],
            story_1['url'] + self.CRUFT,
            story_2['url'],
            story_1['url'] + "/redirect_url",
            story_2['url'] + "/redirect_url",
            story_3['url'],
            story_3['url'] + "/alternate",
        }

        url_variants = all_url_variants(db=self.db(), url=test_url)

        assert len(expected_urls) == len(url_variants)

        sorted_expected_urls = sorted(expected_urls)
        sorted_url_variants = sorted(url_variants)

        for i in range(len(sorted_expected_urls)):
            assert urls_are_equal(url1=sorted_expected_urls[i],
                                  url2=sorted_url_variants[i])
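
# A pure-Python sketch of the merge-chain walk that all_url_variants()
# presumably performs: follow topic_merged_stories_map transitively from a
# story to everything merged into it (story_3 -> story_2 -> story_1 above),
# collecting each reachable story. The in-memory merge_map stands in for the
# database table.
def _sketch_collect_merged_story_ids(merge_map, stories_id):
    """merge_map maps target_stories_id -> list of source_stories_id."""
    seen = {stories_id}
    frontier = [stories_id]
    while frontier:
        current = frontier.pop()
        for source in merge_map.get(current, []):
            if source not in seen:
                seen.add(source)
                frontier.append(source)
    return seen
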
    def setUp(self):
        self.db = connect_to_db()

        db = self.db

        self.connected_media = []
        for i in range(self.__NUM_CONNECTED_MEDIA):
            self.connected_media.append(
                create_test_medium(db, 'connected %d' % i))

        self.disconnected_media = []
        for i in range(self.__NUM_DISCONNECTED_MEDIA):
            self.disconnected_media.append(
                create_test_medium(db, 'disconnected %d' % i))

        self.all_media = self.connected_media + self.disconnected_media

        self.topic = create_test_topic(db, 'foo')
        self.timespan = create_test_timespan(db, self.topic)

        center_medium = self.connected_media[0]
        for medium in self.connected_media[1:]:
            db.query(
                """
                    INSERT INTO snap.medium_links (
                        topics_id,
                        timespans_id,
                        source_media_id,
                        ref_media_id,
                        link_count
                    ) VALUES (
                        %(topics_id)s,
                        %(timespans_id)s,
                        %(source_media_id)s,
                        %(ref_media_id)s,
                        1
                    )
                """, {
                    'topics_id': self.topic['topics_id'],
                    'timespans_id': self.timespan['timespans_id'],
                    'source_media_id': medium['media_id'],
                    'ref_media_id': center_medium['media_id'],
                })

        db.query("""
                INSERT INTO snap.medium_link_counts (
                    topics_id,
                    timespans_id,
                    media_id,
                    media_inlink_count,
                    outlink_count,
                    story_count,
                    inlink_count,
                    sum_media_inlink_count
                )
                    SELECT
                        topics_id,
                        timespans_id,
                        media_id,
                        media_id,
                        1,
                        1,
                        1,
                        1
                    FROM timespans AS t
                        CROSS JOIN media AS m
            """)

        tag_set = db.find_or_create(
            'tag_sets', {'name': 'retweet_partisanship_2016_count_10'})
        tag = db.find_or_create('tags', {
            'tag_sets_id': tag_set['tag_sets_id'],
            'tag': 'right'
        })
        db.find_or_create('color_sets', {
            'color': 'bb0404',
            'color_set': 'partisan_retweet',
            'id': 'right'
        })
        db.find_or_create('color_sets', {
            'color': '',
            'color_set': 'partisan_retweet',
            'id': 'right'
        })

        db.query(
            "INSERT INTO media_tags_map (media_id, tags_id) SELECT media_id, %(a)s FROM media",
            {'a': tag['tags_id']})
    def test_get_topic_url_variants(self):
        media = create_test_story_stack(
            db=self.db(),
            data={
                'A': {
                    'B': [1, 2, 3],
                    'C': [4, 5, 6],
                },
                'D': {
                    'E': [7, 8, 9],
                }
            }
        )

        story_1 = media['A']['feeds']['B']['stories']['1']
        story_2 = media['A']['feeds']['B']['stories']['2']
        story_3 = media['A']['feeds']['B']['stories']['3']
        story_4 = media['A']['feeds']['C']['stories']['4']

        self.db().query("""
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_2['stories_id'],
            'target_stories_id': story_1['stories_id'],
        })

        self.db().query("""
            INSERT INTO topic_merged_stories_map (source_stories_id, target_stories_id)
            VALUES (%(source_stories_id)s, %(target_stories_id)s)
        """, {
            'source_stories_id': story_3['stories_id'],
            'target_stories_id': story_2['stories_id'],
        })

        self.db().create(
            table='tag_sets',
            insert_hash={'name': 'foo'},
        )

        topic = create_test_topic(db=self.db(), label='foo')

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_1['stories_id'],
            }
        )

        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_1['stories_id'],
                'url': story_1['url'],
                'redirect_url': story_1['url'] + "/redirect_url",
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_2['stories_id'],
            }
        )

        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_2['stories_id'],
                'url': story_2['url'],
                'redirect_url': story_2['url'] + "/redirect_url",
            }
        )

        self.db().create(
            table='topic_stories',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_3['stories_id']
            }
        )

        self.db().create(
            table='topic_links',
            insert_hash={
                'topics_id': topic['topics_id'],
                'stories_id': story_4['stories_id'],
                'ref_stories_id': story_3['stories_id'],
                'url': story_3['url'] + '/alternate',
            }
        )

        test_url = story_1['url'] + self.CRUFT

        expected_urls = {
            story_1['url'],
            story_1['url'] + self.CRUFT,
            story_2['url'],
            story_1['url'] + "/redirect_url",
            story_2['url'] + "/redirect_url",
            story_3['url'],
            story_3['url'] + "/alternate",
        }

        url_variants = all_url_variants(db=self.db(), url=test_url)

        assert len(expected_urls) == len(url_variants)

        sorted_expected_urls = sorted(expected_urls)
        sorted_url_variants = sorted(url_variants)

        for i in range(len(sorted_expected_urls)):
            assert urls_are_equal(url1=sorted_expected_urls[i], url2=sorted_url_variants[i])
def test_add_user_story():
    """Test _add_user_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {
        'topics_id': topics_id,
        'stories_id': source_story['stories_id']
    })

    topic_link = db.create(
        'topic_links', {
            'topics_id': topics_id,
            'url': 'u',
            'stories_id': source_story['stories_id'],
        })

    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topics_id,
            'url': 'u',
            'state': 'pending',
            'topic_links_id': topic_link['topic_links_id'],
        })

    user = {
        'id': 123,
        'screen_name': 'test_screen_name',
        'name': 'test screen name',
        'description': 'test user description'
    }

    story = _add_user_story(db, topic, user, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == \
        f"{user['name']} ({user['screen_name']}) | Twitter"
    assert got_story['url'] == f"https://twitter.com/{user['screen_name']}"

    got_topic_link = db.require_by_id('topic_links',
                                      topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    assert get_content_for_first_download(db, story) == content

    got_topic_story = db.query(
        """
        SELECT *
        FROM topic_stories
        WHERE
            stories_id = %(stories_id)s AND
            topics_id = %(topics_id)s
    """, {
            'stories_id': story['stories_id'],
            'topics_id': topic['topics_id'],
        }).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    got_undateable_tag = db.query(
        """
        SELECT *
        FROM stories_tags_map AS stm
            INNER JOIN tags AS t USING (tags_id)
            INNER JOIN tag_sets USING (tag_sets_id)
        WHERE
            stories_id = %(stories_id)s AND
            tag = 'undateable' AND
            name = 'date_invalid'
    """, {
            'stories_id': got_story['stories_id']
        }).hash()

    assert got_undateable_tag
def test_add_user_story():
    """Test _add_user_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {
        'topics_id': topics_id,
        'stories_id': source_story['stories_id']
    })

    topic_link = {
        'topics_id': topics_id,
        'url': 'u',
        'stories_id': source_story['stories_id']
    }
    topic_link = db.create('topic_links', topic_link)

    tfu = {
        'topics_id': topics_id,
        'url': 'u',
        'state': 'pending',
        'topic_links_id': topic_link['topic_links_id']
    }
    tfu = db.create('topic_fetch_urls', tfu)

    user = {
        'id': 123,
        'screen_name': 'test_screen_name',
        'name': 'test screen name',
        'description': 'test user description'
    }

    story = _add_user_story(db, topic, user, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    assert got_story['title'] == "%s (%s) | Twitter" % (user['name'],
                                                        user['screen_name'])
    assert got_story['url'] == 'https://twitter.com/%s' % (user['screen_name'])

    got_topic_link = db.require_by_id('topic_links',
                                      topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    content = '%s (%s): %s' % (user['name'], user['screen_name'],
                               user['description'])
    assert get_content_for_first_download(db, story) == content

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {
            'a': story['stories_id'],
            'b': topic['topics_id']
        }).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    got_undateable_tag = db.query(
        """
        select *
            from stories_tags_map stm
                join tags t using (tags_id)
                join tag_sets using(tag_sets_id)
            where
                stories_id = %(a)s and
                tag = 'undateable' and
                name = 'date_invalid'
        """, {
            'a': got_story['stories_id']
        }).hash()

    assert got_undateable_tag