Code example #1
    def test_fetch_topic_tweets(self) -> None:
        """Run fetch_topic_tweet tests with test database."""
        db = self.db()
        topic = mediawords.test.db.create.create_test_topic(db, 'test')

        topic = db.update_by_id('topics', topic['topics_id'], {'pattern': '.*'})

        test_dates = get_test_date_range()
        topic['start_date'] = test_dates[0]
        topic['end_date'] = test_dates[1]
        topic['ch_monitor_id'] = 123456
        db.update_by_id('topics', topic['topics_id'], topic)

        ftt.fetch_topic_tweets(db, topic['topics_id'], MockTwitter, MockCrimsonHexagon)

        topic_tweet_days = db.query("select * from topic_tweet_days").hashes()
        assert len(topic_tweet_days) == LOCAL_DATE_RANGE + 1

        start_date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
        test_days = [start_date + datetime.timedelta(days=x) for x in range(0, LOCAL_DATE_RANGE)]
        for d in test_days:
            topic_tweet_day = db.query(
                "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
                {'a': topic['topics_id'], 'b': d}
            ).hash()
            assert topic_tweet_day is not None

            validate_topic_tweets(db, topic_tweet_day)

        validate_topic_tweet_urls(db, topic)
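The test above also depends on two module-level helpers that are not shown in the snippet: get_test_date_range() and LOCAL_DATE_RANGE (the MockTwitter and MockCrimsonHexagon doubles are likewise defined elsewhere in the test file). A minimal sketch of the date helpers follows; the concrete values are assumptions chosen only so that the "LOCAL_DATE_RANGE + 1" assertion on topic_tweet_days stays self-consistent, not the actual constants from the Media Cloud test module.

import datetime

# Assumed length of the fake topic's date range, in days (illustrative value only).
LOCAL_DATE_RANGE = 4


def get_test_date_range() -> tuple:
    """Return (start_date, end_date) strings spanning LOCAL_DATE_RANGE + 1 days inclusive."""
    start = datetime.date(2016, 1, 1)
    end = start + datetime.timedelta(days=LOCAL_DATE_RANGE)
    return start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')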
Code example #2
    def test_fetch_topic_tweets(self) -> None:
        """Run fetch_topic_tweet tests with test database."""
        db = self.db()
        topic = mediawords.test.db.create_test_topic(db, 'test')

        test_dates = get_test_date_range()
        topic['start_date'] = test_dates[0]
        topic['end_date'] = test_dates[1]
        topic['ch_monitor_id'] = 123456
        db.update_by_id('topics', topic['topics_id'], topic)

        mediawords.tm.fetch_topic_tweets.fetch_topic_tweets(
            db, topic['topics_id'], MockTwitter, MockCrimsonHexagon)

        topic_tweet_days = db.query("select * from topic_tweet_days").hashes()
        assert len(topic_tweet_days) == LOCAL_DATE_RANGE + 1

        start_date = datetime.datetime.strptime(topic['start_date'],
                                                '%Y-%m-%d')
        test_days = [
            start_date + datetime.timedelta(days=x)
            for x in range(0, LOCAL_DATE_RANGE)
        ]
        for d in test_days:
            topic_tweet_day = db.query(
                "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
                {
                    'a': topic['topics_id'],
                    'b': d
                }).hash()
            assert topic_tweet_day is not None

            validate_topic_tweets(db, topic_tweet_day)

        validate_topic_tweet_urls(db, topic)
Code example #3
def validate_topic_tweet_urls(db: DatabaseHandler, topic: dict) -> None:
    """Validate that topic_tweet_urls match what's in the tweet JSON data as saved in topic_tweets."""
    topic_tweets = db.query(
        """
        select *
            from topic_tweets tt
                join topic_tweet_days ttd using (topic_tweet_days_id)
            where
                ttd.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    expected_num_urls = 0
    for topic_tweet in topic_tweets:
        data = dict(decode_json(topic_tweet['data']))
        expected_num_urls += len(data['tweet']['entities']['urls'])

    # first sanity check to make sure we got some urls
    num_urls = db.query("select count(*) from topic_tweet_urls").flat()[0]
    assert num_urls == expected_num_urls

    total_json_urls = 0
    for topic_tweet in topic_tweets:

        ch_post = dict(decode_json(topic_tweet['data']))
        expected_urls = [x['expanded_url'] for x in ch_post['tweet']['entities']['urls']]
        total_json_urls += len(expected_urls)

        for expected_url in expected_urls:
            got_url = db.query("select * from topic_tweet_urls where url = %(a)s", {'a': expected_url}).hash()
            assert got_url is not None

    assert total_json_urls == num_urls
Code example #4
    def test_fetch_topic_tweets(self) -> None:
        """Run fetch_topic_tweet tests with test database."""
        db = self.db()
        topic = mediawords.test.db.create.create_test_topic(db, 'test')

        topic = db.update_by_id('topics', topic['topics_id'],
                                {'pattern': '.*'})

        test_dates = get_test_date_range()
        topic['start_date'] = test_dates[0]
        topic['end_date'] = test_dates[1]
        db.update_by_id('topics', topic['topics_id'], topic)

        tsq = {
            'topics_id': topic['topics_id'],
            'platform': 'twitter',
            'source': 'crimson_hexagon',
            'query': 123456
        }
        db.create('topic_seed_queries', tsq)

        db.update_by_id('topics', topic['topics_id'], {'platform': 'twitter'})

        mediawords.tm.fetch_topic_tweets.fetch_meta_tweets_from_ch = mock_fetch_meta_tweets_from_ch
        mediawords.tm.fetch_topic_tweets.fetch_100_tweets = mock_fetch_100_tweets
        ftt.fetch_topic_tweets(db, topic['topics_id'])

        topic_tweet_days = db.query("select * from topic_tweet_days").hashes()
        assert len(topic_tweet_days) == LOCAL_DATE_RANGE + 1

        start_date = datetime.datetime.strptime(topic['start_date'],
                                                '%Y-%m-%d')
        test_days = [
            start_date + datetime.timedelta(days=x)
            for x in range(0, LOCAL_DATE_RANGE)
        ]
        for d in test_days:
            topic_tweet_day = db.query(
                "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
                {
                    'a': topic['topics_id'],
                    'b': d
                }).hash()
            assert topic_tweet_day is not None

            validate_topic_tweets(db, topic_tweet_day)

        validate_topic_tweet_urls(db, topic)
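Code example #4 above swaps fetch_meta_tweets_from_ch and fetch_100_tweets on the mediawords.tm.fetch_topic_tweets module for test doubles before calling fetch_topic_tweets(). Those doubles are defined elsewhere in the test file; the sketch below shows one possible shape for them. The signatures and return structures are assumptions inferred only from the fields the validate_* helpers in this file read (assignedCategoryId, tweet created_at/text, and entities.urls), not the real Media Cloud implementation.

import datetime


def mock_fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Return fake Crimson Hexagon post metadata for one day (signature assumed)."""
    return [
        {
            'tweet_id': i,
            'assignedCategoryId': 42,  # field later asserted by validate_topic_tweets()
            'url': 'https://twitter.com/test/status/%d' % i,
        }
        for i in range(10)
    ]


def mock_fetch_100_tweets(tweet_ids: list) -> list:
    """Return fake tweets for the given ids, shaped like the fields the validators read."""
    return [
        {
            'id': tweet_id,
            'text': 'test tweet %d' % tweet_id,
            'created_at': '2016-01-01',
            'entities': {'urls': [{'expanded_url': 'https://test.host/tweet/%d' % tweet_id}]},
        }
        for tweet_id in tweet_ids
    ]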
Code example #5
    def test_remote_integration(self) -> None:
        """Run santity test on remote apis by calling the internal functions that integrate the CH and twitter data."""
        db = self.db()
        config = mediawords.util.config.get_config()

        topic = mediawords.test.db.create_test_topic(
            db, "test_remote_integration")
        topic['ch_monitor_id'] = config['crimson_hexagon']['test_monitor_id']
        db.update_by_id('topics', topic['topics_id'], topic)

        ttd = mediawords.tm.fetch_topic_tweets._add_topic_tweet_single_day(
            db, topic, datetime.datetime(year=2016, month=1, day=1),
            mediawords.tm.fetch_topic_tweets.CrimsonHexagon)

        max_tweets = 200
        mediawords.tm.fetch_topic_tweets._fetch_tweets_for_day(
            db,
            mediawords.tm.fetch_topic_tweets.Twitter,
            topic,
            ttd,
            max_tweets=max_tweets)

        got_tts = db.query(
            "select * from topic_tweets where topic_tweet_days_id = %(a)s", {
                'a': ttd['topic_tweet_days_id']
            }).hashes()

        # for old ch monitors, lots of the tweets may be deleted
        assert len(got_tts) > max_tweets / 10

        assert len(got_tts[0]['content']) > MIN_TEST_TWEET_LENGTH
        assert len(got_tts[0]['twitter_user']) > MIN_TEST_TWITTER_USER_LENGTH
Code example #6
def validate_topic_tweets(db: DatabaseHandler, topic_tweet_day: dict) -> None:
    """Validate that the topic tweets belonging to the given topic_tweet_day have all of the current data."""
    topic_tweets = db.query(
        "select * from topic_tweets where topic_tweet_days_id = %(a)s", {
            'a': topic_tweet_day['topic_tweet_days_id']
        }).hashes()

    # fetch_topic_tweets should have set num_ch_tweets to the total number of tweets
    assert len(topic_tweets) > 0
    assert len(topic_tweets) == topic_tweet_day['num_ch_tweets']

    for topic_tweet in topic_tweets:
        tweet_data = dict(mediawords.util.json.decode_json(
            topic_tweet['data']))

        # random field that should be coming from twitter
        assert 'assignedCategoryId' in tweet_data

        expected_date = datetime.datetime.strptime(
            tweet_data['tweet']['created_at'], '%Y-%m-%d')
        got_date = datetime.datetime.strptime(topic_tweet['publish_date'],
                                              '%Y-%m-%d 00:00:00')
        assert got_date == expected_date

        assert topic_tweet['content'] == tweet_data['tweet']['text']
Code example #7
def validate_topic_tweet_urls(db: DatabaseHandler, topic: dict) -> None:
    """Validate that topic_tweet_urls match what's in the tweet JSON data as saved in topic_tweets."""
    topic_tweets = db.query(
        """
        select *
            from topic_tweets tt
                join topic_tweet_days ttd using (topic_tweet_days_id)
            where
                ttd.topics_id = %(a)s
        """, {
            'a': topic['topics_id']
        }).hashes()

    expected_num_urls = 0
    for topic_tweet in topic_tweets:
        data = dict(mediawords.util.json.decode_json(topic_tweet['data']))
        expected_num_urls += len(data['tweet']['entities']['urls'])

    # first sanity check to make sure we got some urls
    num_urls = db.query("select count(*) from topic_tweet_urls").flat()[0]
    assert num_urls == expected_num_urls

    total_json_urls = 0
    for topic_tweet in topic_tweets:

        ch_post = dict(mediawords.util.json.decode_json(topic_tweet['data']))
        expected_urls = [
            x['expanded_url'] for x in ch_post['tweet']['entities']['urls']
        ]
        total_json_urls += len(expected_urls)

        for expected_url in expected_urls:
            got_url = db.query(
                "select * from topic_tweet_urls where url = %(a)s", {
                    'a': expected_url
                }).hash()
            assert got_url is not None

    assert total_json_urls == num_urls
Code example #8
    def _test_remote_integration(self, source, query, day) -> None:
        """Run santity test on remote apis."""
        db = self.db()

        topic = mediawords.test.db.create.create_test_topic(
            db, "test_remote_integration")

        tsq = {
            'topics_id': topic['topics_id'],
            'platform': 'twitter',
            'source': source,
            'query': query
        }
        db.create('topic_seed_queries', tsq)

        topic['platform'] = 'twitter'
        topic['pattern'] = '.*'
        topic['start_date'] = day
        topic['end_date'] = day
        db.update_by_id('topics', topic['topics_id'], topic)

        # only fetch 200 tweets to make test quicker
        max_tweets = 200
        ftt.fetch_topic_tweets(db, topic['topics_id'], max_tweets)

        # ttd_day = datetime.datetime(year=2016, month=1, day=1)

        # meta_tweets = ftt.fetch_meta_tweets(db, topic, ttd_day)
        # ttd = ftt._add_topic_tweet_single_day(db, topic, len(meta_tweets), ttd_day)

        # max_tweets = 100
        # ftt._fetch_tweets_for_day(db, ttd, meta_tweets, max_tweets=max_tweets)

        got_tts = db.query("select * from topic_tweets").hashes()

        # for old ch monitors, lots of the tweets may be deleted
        assert len(got_tts) > max_tweets / 10

        assert len(got_tts[0]['content']) > MIN_TEST_TWEET_LENGTH
        assert len(got_tts[0]['twitter_user']) > MIN_TEST_TWITTER_USER_LENGTH
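_test_remote_integration() is a private helper meant to be driven by per-source tests elsewhere in the class; those call sites are not part of the snippet. A hedged sketch of one possible driver is below; the test name, query value and day are placeholders, not the actual monitor id or date used by the Media Cloud test suite.

    def test_remote_integration_ch(self) -> None:
        """Drive the helper against the crimson_hexagon source (placeholder query and day)."""
        self._test_remote_integration('crimson_hexagon', 1234567, '2016-01-01')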
Code example #9
def validate_topic_tweets(db: DatabaseHandler, topic_tweet_day: dict) -> None:
    """Validate that the topic tweets belonging to the given topic_tweet_day have all of the current data."""
    topic_tweets = db.query(
        "select * from topic_tweets where topic_tweet_days_id = %(a)s",
        {'a': topic_tweet_day['topic_tweet_days_id']}
    ).hashes()

    # fetch_topic_tweets should have set num_ch_tweets to the total number of tweets
    assert len(topic_tweets) > 0
    assert len(topic_tweets) == topic_tweet_day['num_ch_tweets']

    for topic_tweet in topic_tweets:
        tweet_data = dict(decode_json(topic_tweet['data']))

        # random field that should be coming from twitter
        assert 'assignedCategoryId' in tweet_data

        expected_date = datetime.datetime.strptime(tweet_data['tweet']['created_at'], '%Y-%m-%d')
        got_date = datetime.datetime.strptime(topic_tweet['publish_date'], '%Y-%m-%d 00:00:00')
        assert got_date == expected_date

        assert topic_tweet['content'] == tweet_data['tweet']['text']
Code example #10
    def test_remote_integration(self) -> None:
        """Run santity test on remote apis by calling the internal functions that integrate the CH and twitter data."""
        db = self.db()

        topic = mediawords.test.db.create.create_test_topic(db, "test_remote_integration")
        topic['ch_monitor_id'] = TEST_MONITOR_ID
        db.update_by_id('topics', topic['topics_id'], topic)

        ttd_day = datetime.datetime(year=2016, month=1, day=1)
        ttd = ftt._add_topic_tweet_single_day(db, topic, ttd_day, ftt.CrimsonHexagon)

        max_tweets = 200
        ftt._fetch_tweets_for_day(db, ftt.Twitter, topic, ttd, max_tweets=max_tweets)

        got_tts = db.query(
            "select * from topic_tweets where topic_tweet_days_id = %(a)s",
            {'a': ttd['topic_tweet_days_id']}).hashes()

        # for old ch monitors, lots of the tweets may be deleted
        assert len(got_tts) > max_tweets / 10

        assert len(got_tts[0]['content']) > MIN_TEST_TWEET_LENGTH
        assert len(got_tts[0]['twitter_user']) > MIN_TEST_TWITTER_USER_LENGTH