def test_copy_story_to_new_medium_with_download_error():
    """Test copy_story_to_new_medium with an associated download error."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    target_medium = create_test_medium(db, 'copy new')
    source_medium = create_test_medium(db, 'copy old')
    source_feed = create_test_feed(db=db, label='copy old', medium=source_medium)
    source_story = create_test_story(db=db, label='copy old', feed=source_feed)

    add_content_to_test_story(db, source_story, source_feed)

    # Mark the source story's downloads as errored so the copy has to carry
    # the error state over.
    db.query(
        "update downloads set state = 'error' where stories_id = %(a)s",
        {'a': source_story['stories_id']})

    add_to_topic_stories(db, source_story, topic)

    copied_story = copy_story_to_new_medium(db, topic, source_story, target_medium)

    assert db.find_by_id('stories', copied_story['stories_id']) is not None

    copied_download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': copied_story['stories_id']}).hash()

    assert copied_download is not None
    assert copied_download['state'] == 'error'
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')

    source_medium = create_test_medium(db, 'merge from')
    target_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=source_medium)

    num_stories = 10
    for story_num in range(num_stories):
        story = create_test_story(db, "merge " + str(story_num), feed=feed)
        add_to_topic_stories(db, story, topic)

    # Point the old medium at the new one, then merge.
    db.update_by_id('media', source_medium['media_id'], {'dup_media_id': target_medium['media_id']})

    merge_dup_media_stories(db, topic)

    merged_stories = db.query(
        "select s.* from stories s join topic_stories ts using (stories_id) where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    # All stories must survive the merge and end up on the target medium.
    assert len(merged_stories) == num_stories
    for merged_story in merged_stories:
        assert merged_story['media_id'] == target_medium['media_id']
def test_merge_dup_media_story():
    """Test merge_dup_media_story()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')

    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)
    original_story = create_test_story(db=db, label='merge old', feed=feed)

    target_medium = create_test_medium(db, 'merge new')
    db.update_by_id('media', medium['media_id'], {'dup_media_id': target_medium['media_id']})

    clone = merge_dup_media_story(db, topic, original_story)

    # The clone keeps the identifying fields of the original story.
    for copied_field in 'url guid publish_date title'.split():
        assert clone[copied_field] == original_story[copied_field]

    # The clone must also be attached to the topic.
    clone_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': clone['stories_id'], 'b': topic['topics_id']}).hash()
    assert clone_topic_story is not None

    # Merging again must be idempotent and return the same clone.
    remerged_story = merge_dup_media_story(db, topic, original_story)
    assert remerged_story['stories_id'] == clone['stories_id']
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    topic = create_test_topic(db, 'copy foo')

    new_medium = create_test_medium(db, 'copy new')
    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)

    add_content_to_test_story(db, old_story, old_feed)

    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)

    # the copy must exist as a new stories row
    assert db.find_by_id('stories', new_story['stories_id']) is not None

    # identifying fields are carried over verbatim
    for field in 'title url guid publish_date'.split():
        assert old_story[field] == new_story[field]

    # the copy is attached to the topic
    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    # the copy gets its own download with fetchable content
    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    # sentences were (re)generated for the copy
    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
def test_get_preferred_story():
    """Test get_preferred_story()."""
    db = connect_to_db()

    # build one story per medium so each preference rule can be tested in turn
    num_media = 5
    media = []
    for i in range(num_media):
        medium = create_test_medium(db, "foo " + str(i))
        feed = create_test_feed(db=db, label="foo", medium=medium)
        story = create_test_story(db=db, label="foo", feed=feed)
        medium['story'] = story
        media.append(medium)

    # first prefer medium pointed to by dup_media_id of another story
    preferred_medium = media[1]
    db.query(
        "update media set dup_media_id = %(a)s where media_id = %(b)s",
        {'a': preferred_medium['media_id'], 'b': media[0]['media_id']})

    stories = [m['story'] for m in media]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer any medium without a dup_media_id
    preferred_medium = media[num_media - 1]

    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    db.query(
        "update media set dup_media_id = %(a)s where media_id != %(a)s",
        {'a': media[0]['media_id']})
    db.query(
        "update media set dup_media_id = null where media_id = %(a)s",
        {'a': preferred_medium['media_id']})
    stories = [m['story'] for m in media[1:]]
    assert get_preferred_story(db, stories) == preferred_medium['story']

    # next prefer the medium whose story url matches the medium domain
    # noinspection SqlWithoutWhere
    db.query("update media set dup_media_id = null")
    # noinspection SqlWithoutWhere
    db.query("update media set url='http://media-'||media_id||'.com'")
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    preferred_medium = media[2]
    db.query(
        "update stories set url = 'http://media-'||media_id||'.com' where media_id = %(a)s",
        {'a': preferred_medium['media_id']})

    stories = db.query("select * from stories").hashes()
    preferred_story = db.query(
        "select * from stories where media_id = %(a)s",
        {'a': preferred_medium['media_id']}).hash()

    assert get_preferred_story(db, stories) == preferred_story

    # next prefer lowest media_id (no other rule applies now)
    # noinspection SqlWithoutWhere
    db.query("update stories set url='http://stories-'||stories_id||'.com'")
    stories = db.query("select * from stories").hashes()
    assert get_preferred_story(db, stories)['stories_id'] == media[0]['story']['stories_id']
def setUpClass(cls) -> None:
    """Create a shared medium + feed fixture once for the whole test class."""
    # All tests should be able to use the same database
    cls._DB = connect_to_db()
    cls._TEST_MEDIUM = create_test_medium(db=cls._DB, label='test')
    cls._TEST_FEED = create_test_feed(db=cls._DB, label='test', medium=cls._TEST_MEDIUM)
def test_add_story_description_unset():
    """Test adding a story without a description being set."""
    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    story = {
        'url': 'http://test',
        'guid': 'http://test',
        'media_id': medium['media_id'],
        'title': "test",
        # stories.description can be NULL so it's a valid value:
        'description': None,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    # Exactly one story row and one feeds_stories_map row must exist.
    stored_stories = db.select(table='stories', what_to_select='*').hashes()
    assert len(stored_stories) == 1

    feed_story_rows = db.select(table='feeds_stories_map', what_to_select='*').hashes()
    assert len(feed_story_rows) == 1
def test_normalized_urls_out_of_date():
    """Test _normalized_urls_out_of_date()."""
    db = connect_to_db()

    # with no media rows there is nothing out of date
    assert not _normalized_urls_out_of_date(db)

    [create_test_medium(db, str(i)) for i in range(5)]

    # freshly created media have no normalized_url yet
    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)

    # a single NULL normalized_url is enough to be out of date again
    db.query(
        "update media set normalized_url = null where media_id in ( select media_id from media limit 1 )"
    )

    assert _normalized_urls_out_of_date(db)

    # noinspection SqlWithoutWhere
    db.query("update media set normalized_url = url")

    assert not _normalized_urls_out_of_date(db)
def setUp(self) -> None:
    """Create a medium -> feed -> story -> download fixture chain for download tests."""
    super().setUp()
    self.test_medium = create_test_medium(self.db(), 'downloads test')
    self.test_feed = create_test_feed(self.db(), 'downloads test', self.test_medium)
    # NOTE(review): label 'downloads est' looks like a typo for 'downloads test',
    # but it is only a fixture label so it is left unchanged
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(self.db(), feed=self.test_feed, story=self.test_story)
def test_get_dup_story_groups():
    """Test _get_dup_story_groups(): stories must group by case-insensitive title."""
    db = connect_to_db()

    topic = create_test_topic(db, 'dupstories')
    medium = create_test_medium(db, 'dupstories')
    feed = create_test_feed(db, 'dupstories', medium=medium)

    num_stories = 9
    for i in range(num_stories):
        story = create_test_story(db, "dupstories " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

        # spread the stories across 3 title groups (divi), varying the case of
        # the title within each group (modi) so grouping must ignore case
        modi = i % 3
        divi = i // 3
        if modi == 0:
            db.update_by_id('stories', story['stories_id'], {'title': 'TITLE ' + str(divi)})
        elif modi == 1:
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})
        else:
            # BUG FIX: the original used the key 'Title' (capital T), which does
            # not match the lowercase 'title' column, so a third of the stories
            # never had their title set and the grouping was not really tested
            db.update_by_id('stories', story['stories_id'], {'title': 'title ' + str(divi)})

    dup_story_groups = _get_dup_story_groups(db, topic)

    assert len(dup_story_groups) == 3

    # every story in a group shares the same title modulo case
    for dsg in dup_story_groups:
        for story in dsg:
            assert dsg[0]['title'].lower() == story['title'].lower()
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    self.config = mediawords.util.config.get_config()

    # medium -> feed -> story -> download fixture chain
    self.test_medium = create_test_medium(self.db(), 'downloads test')
    self.test_feed = create_test_feed(self.db(), 'downloads test', self.test_medium)
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(self.db(), feed=self.test_feed, story=self.test_story)

    # pretend the download succeeded and lives in the PostgreSQL store
    self.test_download['path'] = 'postgresql:foo'
    self.test_download['state'] = 'success'
    self.test_download['stories_id'] = self.test_story['stories_id']
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)

    mediawords.dbi.downloads.store_content(self.db(), self.test_download, self.__TEST_CONTENT)

    # keep a pristine copy so tests can restore the config they mutate
    self.save_config = copy.deepcopy(self.config)
def test_provide_download_ids() -> None:
    """provide_download_ids() should return one pending download per host."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, 'foo', medium=medium)

    hosts = ('foo.bar', 'bar.bat', 'bat.baz')
    downloads_per_host = 3

    # queue several pending downloads on each host
    for host in hosts:
        for serial in range(downloads_per_host):
            db.create('downloads', {
                'feeds_id': feed['feeds_id'],
                'state': 'pending',
                'priority': 1,
                'sequence': 1,
                'type': 'content',
                'url': 'http://' + host + '/' + str(serial),
                'host': host,
            })

    download_ids = provide_download_ids(db)

    # +1 for the test feed
    assert len(download_ids) == len(hosts) + 1
def test_get_story_with_most_sentences():
    """Test _get_story_with_most_sentences()."""
    db = connect_to_db()

    medium = create_test_medium(db, "foo")
    feed = create_test_feed(db=db, label="foo", medium=medium)

    # story i gets i sentences (story 0 gets none), so the last filled story
    # has the most sentences
    num_filled_stories = 5
    stories = []
    for i in range(num_filled_stories):
        story = create_test_story(db=db, label="foo" + str(i), feed=feed)
        stories.append(story)
        for n in range(1, i + 1):
            db.create(
                'story_sentences',
                {
                    'stories_id': story['stories_id'],
                    'media_id': medium['media_id'],
                    'sentence': 'foo',
                    'sentence_number': n,
                    'publish_date': story['publish_date']
                })

    # a couple of stories with no sentences at all
    empty_stories = []
    for i in range(2):
        story = create_test_story(db=db, label="foo empty" + str(i), feed=feed)
        empty_stories.append(story)
        stories.append(story)

    assert _get_story_with_most_sentences(
        db, stories) == stories[num_filled_stories - 1]

    # with no sentences anywhere, the first candidate wins
    assert _get_story_with_most_sentences(
        db, [empty_stories[0]]) == empty_stories[0]
    assert _get_story_with_most_sentences(db, empty_stories) == empty_stories[0]
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    # Shared medium -> feed -> story fixture chain used by the tests below.
    db = self.db()
    self.test_medium = create_test_medium(db, self.TEST_MEDIUM_NAME)
    self.test_feed = create_test_feed(db, self.TEST_FEED_NAME, self.test_medium)
    self.test_story = create_test_story(db, label=self.TEST_STORY_NAME, feed=self.test_feed)
def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id()."""
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)
    topic = create_test_topic(db, 'foo')

    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        # BUG FIX: the original pointed this fetch url at topic_link_a, so the
        # duplicate update on topic_link_b was never attempted and the
        # "silently fail" branch was not actually exercised
        'topic_links_id': topic_link_b['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    assert topic_link_b['ref_stories_id'] is None

    # now generate an non-unique error and make sure we get an error
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}
    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
def test_add_missing_normalized_title_hashes():
    """_add_missing_normalized_title_hashes() must backfill NULL title hashes for a topic."""
    db = connect_to_db()

    topic = create_test_topic(db, 'titles')
    medium = create_test_medium(db, 'titles')
    feed = create_test_feed(db, 'titles', medium=medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "titles " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # disable trigger so that we can actually set normalized_title_hash to null
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s DISABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    # NULL out every normalized_title_hash
    db.query("""
        WITH all_story_ids AS (
            SELECT stories_id
            FROM stories
        )
        UPDATE stories
        SET normalized_title_hash = NULL
        WHERE stories_id IN (
            SELECT stories_id
            FROM all_story_ids
        )
    """)

    # re-enable the trigger now that the fixture data is in place
    db.query(
        "SELECT run_on_shards_or_raise('stories', %(command)s)",
        {
            'command': """
                -- noinspection SqlResolveForFile @ trigger/"stories_add_normalized_title"
                BEGIN;
                LOCK TABLE pg_proc IN ACCESS EXCLUSIVE MODE;
                ALTER TABLE %s ENABLE TRIGGER stories_add_normalized_title;
                COMMIT;
            """,
        }
    )

    assert __count_null_title_stories(db=db, topic=topic) == num_stories

    _add_missing_normalized_title_hashes(db, topic)

    assert __count_null_title_stories(db=db, topic=topic) == 0
def test_guess_medium() -> None:
    """Test guess_medium()."""
    db = connect_to_db()

    num_media = 5
    [create_test_medium(db, str(i)) for i in range(num_media)]

    # the default test media do not have unique domains
    # noinspection SqlWithoutWhere
    db.query("update media set url = 'http://media-' || media_id ||'.com'")

    # dummy guess_medium call to assign normalized_urls
    guess_medium(db, 'foo')

    media = db.query("select * from media order by media_id").hashes()

    # basic lookup of existing media
    assert guess_medium(db, media[0]['url']) == media[0]
    assert guess_medium(db, media[1]['url'] + '/foo/bar/') == media[1]
    assert guess_medium(db, media[2]['url'] + URL_SPIDERED_SUFFIX) == media[2]

    # create a new medium
    new_medium_story_url = 'http://new-medium.com/with/path'
    new_medium = guess_medium(db, new_medium_story_url)
    assert new_medium['name'] == 'new-medium.com'
    assert new_medium['url'] == 'http://new-medium.com/'

    spidered_tag = get_spidered_tag(db)
    # BUG FIX: the original asserted on the raw query result object, which is
    # never None; fetch an actual row with .hash() so the assert verifies that
    # the spidered tag really got attached to the new medium
    spidered_mtm = db.query(
        "select * from media_tags_map where tags_id = %(a)s and media_id = %(b)s",
        {'a': spidered_tag['tags_id'], 'b': new_medium['media_id']}).hash()
    assert spidered_mtm is not None

    # find the url with some url variants
    new_medium_url_variants = [
        'http://new-medium.com/with/another/path',
        'http://www.new-medium.com/',
        'http://new-medium.com/with/path#andanchor'
    ]
    for url in new_medium_url_variants:
        assert guess_medium(db, url)['media_id'] == new_medium['media_id']

    # set foreign_rss_links to true to make guess_medium create another new medium
    db.query(
        "update media set foreign_rss_links = 't' where media_id = %(a)s",
        {'a': new_medium['media_id']})
    another_new_medium = guess_medium(db, new_medium_story_url)
    assert another_new_medium['media_id'] > new_medium['media_id']
    assert another_new_medium['url'] == new_medium_story_url
    assert another_new_medium['name'] == 'http://new-medium.com/'

    # now try finding a dup
    db.query(
        "update media set dup_media_id = %(a)s where media_id = %(b)s",
        {'a': media[0]['media_id'], 'b': media[1]['media_id']})

    assert guess_medium(db, media[1]['url'])['media_id'] == media[0]['media_id']
def test_add_tweet_story():
    """_add_tweet_story() must build a story + topic links from a tweet dict."""
    db = connect_to_db()

    topic = create_test_topic(db, 'test')
    medium = create_test_medium(db, 'test')
    feed = create_test_feed(db, 'test', medium)
    source_story = create_test_story(db, 'source', feed)

    topics_id = topic['topics_id']

    db.create('topic_stories', {'topics_id': topics_id, 'stories_id': source_story['stories_id']})

    topic_link = {'topics_id': topics_id, 'url': 'u', 'stories_id': source_story['stories_id']}
    topic_link = db.create('topic_links', topic_link)

    tfu = {'topics_id': topics_id, 'url': 'u', 'state': 'pending', 'topic_links_id': topic_link['topic_links_id']}
    tfu = db.create('topic_fetch_urls', tfu)

    # a minimal tweet with a direct, a retweeted and a quoted url
    tweet = {
        'id': 123,
        'text': 'add tweet story tweet text',
        'user': {'screen_name': 'tweet screen name'},
        'created_at': 'Mon Dec 13 23:21:48 +0000 2010',
        'entities': {'urls': [{'expanded_url': 'http://direct.entity'}]},
        'retweeted_status': {'entities': {'urls': [{'expanded_url': 'http://retweeted.entity'}]}},
        'quoted_status': {'entities': {'urls': [{'expanded_url': 'http://quoted.entity'}]}}
    }

    story = _add_tweet_story(db, topic, tweet, [tfu])

    got_story = db.require_by_id('stories', story['stories_id'])

    # story fields must be derived from the tweet
    assert got_story['title'] == "%s: %s" % (tweet['user']['screen_name'], tweet['text'])
    assert got_story['publish_date'][0:10] == '2010-12-13'
    assert got_story['url'] == 'https://twitter.com/%s/status/%s' % (tweet['user']['screen_name'], tweet['id'])
    assert got_story['guid'] == story['url']

    got_topic_link = db.require_by_id('topic_links', topic_link['topic_links_id'])
    assert got_topic_link['ref_stories_id'] == story['stories_id']

    # the tweet text becomes the story content
    assert get_content_for_first_download(db, story) == tweet['text']

    got_topic_story = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']}).hash()
    assert got_topic_story is not None
    assert got_topic_story['link_mined']

    # each url in the tweet (direct, retweeted, quoted) must become a topic link
    # noinspection PyTypeChecker
    for url in [tweet['entities']['urls'][0]['expanded_url'],
                tweet['retweeted_status']['entities']['urls'][0]['expanded_url'],
                tweet['quoted_status']['entities']['urls'][0]['expanded_url']]:
        got_topic_link = db.query(
            "select * from topic_links where topics_id = %(a)s and url = %(b)s",
            {'a': topic['topics_id'], 'b': url}).hash()
        assert got_topic_link is not None
def setUp(self):
    """Create a story with an enclosure, a podcast episode and a pending transcript fetch."""
    self.db = connect_to_db()

    self.test_medium = create_test_medium(db=self.db, label='test')
    self.test_feed = create_test_feed(db=self.db, label='test', medium=self.test_medium)
    self.story = create_test_story(db=self.db, label='test', feed=self.test_feed)

    stories_id = self.story['stories_id']

    enclosure = self.db.insert(
        table='story_enclosures',
        insert_hash={
            'stories_id': stories_id,
            # URL doesn't really matter as we won't be fetching it
            'url': 'http://example.com/',
            'mime_type': 'audio/mpeg',
            'length': 100000,
        })

    episode = self.db.insert(table='podcast_episodes', insert_hash={
        'stories_id': stories_id,
        'story_enclosures_id': enclosure['story_enclosures_id'],
        'gcs_uri': 'gs://whatever',
        'duration': 1,
        'codec': 'MP3',
        'sample_rate': 44100,
        'bcp47_language_code': 'en-US',
        'speech_operation_id': 'foo',
    })

    # queue the episode for transcript fetching right away
    self.db.query(
        """
        INSERT INTO podcast_episode_transcript_fetches (
            podcast_episodes_id,
            add_to_queue_at
        ) VALUES (
            %(podcast_episodes_id)s,
            NOW()
        )
        """, {
            'podcast_episodes_id': episode['podcast_episodes_id'],
        })
def setUp(self):
    """Add AP medium and some content so that we can find dup sentences."""
    super().setUp()

    ap_medium = create_test_medium(db=self.db(), label=get_ap_medium_name())
    ap_feed = create_test_feed(db=self.db(), label='feed', medium=ap_medium)
    ap_story = create_test_story(db=self.db(), label='story', feed=ap_feed)

    # Fill the AP story with the canonical AP sentences, one per line.
    ap_story['content'] = "\n".join(self.__get_ap_sentences())
    add_content_to_test_story(db=self.db(), story=ap_story, feed=ap_feed)
def setUp(self):
    """Create stories with sentences, a topic covering them and a snapshot of them."""
    super().setUp()

    self.db = connect_to_db()

    medium = create_test_medium(db=self.db, label='test')
    feed = create_test_feed(db=self.db, label='feed', medium=medium)

    # TEST_STORY_COUNT stories, each with TEST_SENTENCE_PER_STORY_COUNT sentences
    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db, label='story-%d' % story_num, feed=feed)
        for sentence_number in range(
                1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db.create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(
                    story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db, label='test')
    self.topics_id = topic['topics_id']

    # attach every story to the topic
    self.db.query(
        """
        INSERT INTO topic_stories (topics_id, stories_id)
        SELECT %(topics_id)s, stories_id
        FROM stories
        """,
        {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db.query(
        """
        INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
        VALUES (%(topics_id)s, NOW(), NOW(), NOW())
        RETURNING snapshots_id
        """, {
            'topics_id': self.topics_id
        }).flat()[0]

    # snapshot every story
    self.db.query(
        """
        INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
        SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
        FROM stories
        """,
        {'snapshots_id': self.snapshots_id})
def test_merge_dup_stories():
    """Test merge_dup_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')
    medium = create_test_medium(db, 'merge')
    feed = create_test_feed(db, 'merge', medium=medium)

    # story i gets i sentences, so the last story has the most and should
    # survive the merge
    num_stories = 10
    stories = []
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)
        stories.append(story)
        for j in range(i):
            # noinspection SqlInsertValues
            db.query(
                """
                INSERT INTO story_sentences (
                    stories_id,
                    sentence_number,
                    sentence,
                    media_id,
                    publish_date
                )
                SELECT
                    stories_id,
                    %(sentence_number)s AS sentence_number,
                    'foo bar' AS sentence,
                    media_id,
                    publish_date
                FROM stories
                WHERE stories_id = %(stories_id)s
                """, {
                    'stories_id': story['stories_id'],
                    'sentence_number': j,
                })

    _merge_dup_stories(db, topic, stories)

    stories_ids = [s['stories_id'] for s in stories]
    merged_stories = db.query(
        """
        SELECT stories_id
        FROM topic_stories
        WHERE
            topics_id = %(topics_id)s AND
            stories_id = ANY(%(stories_ids)s)
        """, {
            'topics_id': topic['topics_id'],
            'stories_ids': stories_ids,
        }).flat()

    # only the sentence-richest story remains in the topic
    assert merged_stories == [stories_ids[-1]]
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    self.test_medium = create_test_medium(self.db(), 'downloads test')
    self.test_feed = create_test_feed(self.db(), 'downloads test', self.test_medium)
    self.test_download = create_download_for_feed(self.db(), self.test_feed)

    # Pretend the download already succeeded and lives in the PostgreSQL store.
    self.test_download.update({
        'path': 'postgresql:foo',
        'state': 'success',
    })
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
def test_merge_dup_media_stories():
    """Test merge_dup_media_stories()."""
    db = connect_to_db()

    topic = create_test_topic(db, 'merge')

    old_medium = create_test_medium(db, 'merge from')
    new_medium = create_test_medium(db, 'merge to')
    feed = create_test_feed(db, 'merge', medium=old_medium)

    num_stories = 10
    for i in range(num_stories):
        story = create_test_story(db, "merge " + str(i), feed=feed)
        add_to_topic_stories(db, story, topic)

    # mark the old medium as a duplicate of the new one, then merge
    db.update_by_id('media', old_medium['media_id'], {'dup_media_id': new_medium['media_id']})

    merge_dup_media_stories(db, topic)

    got_stories = db.query(
        """
        WITH found_topic_stories AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
        )
        SELECT *
        FROM stories
        WHERE stories_id IN (
            SELECT stories_id
            FROM found_topic_stories
        )
        """, {
            'topics_id': topic['topics_id']
        }).hashes()

    # all stories survive the merge and end up on the new medium
    assert len(got_stories) == num_stories
    for got_story in got_stories:
        assert got_story['media_id'] == new_medium['media_id']
def setUp(self) -> None:
    """Create medium/feed/download/story fixtures with a successful stored download."""
    super().setUp()

    self.test_medium = create_test_medium(self.db(), 'downloads test')
    self.test_feed = create_test_feed(self.db(), 'downloads test', self.test_medium)
    self.test_download = create_download_for_feed(self.db(), self.test_feed)
    self.test_story = create_test_story(self.db(), label='downloads est', feed=self.test_feed)

    # Attach the download to the story and mark it as already fetched.
    self.test_download.update({
        'path': 'postgresql:foo',
        'state': 'success',
        'stories_id': self.test_story['stories_id'],
    })
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
def test_update_extractor_version_tag(self):
    """update_extractor_version_tag() must attach exactly one extractor version tag."""
    medium = create_test_medium(db=self.db(), label='test medium')
    feed = create_test_feed(db=self.db(), label='test feed', medium=medium)
    story = create_test_story(db=self.db(), label='test story', feed=feed)

    # no extractor tags before the call
    assert len(self.__story_extractor_tags(stories_id=story['stories_id'])) == 0

    update_extractor_version_tag(db=self.db(), story=story)

    # exactly one afterwards
    assert len(self.__story_extractor_tags(stories_id=story['stories_id'])) == 1
def __is_syndicated(db: DatabaseHandler, content: str) -> bool:
    """Create a throwaway story holding ``content`` and report whether it is detected as syndicated."""
    # The label only identifies the fixture; truncate to a sane length.
    label = content[:64]

    medium = create_test_medium(db=db, label=label)
    feed = create_test_feed(db=db, label=label, medium=medium)
    story = create_test_story(db=db, label=label, feed=feed)

    story['content'] = content
    story = add_content_to_test_story(db=db, story=story, feed=feed)

    return is_syndicated(db=db, story_title=story['title'], story_text=content)
def test_update_media_normalized_urls():
    """Test _update_media_normalized_urls()."""
    db = connect_to_db()

    [create_test_medium(db, str(i)) for i in range(5)]

    _update_media_normalized_urls(db)

    media = db.query("select * from media").hashes()
    for medium in media:
        expected_nu = normalize_url_lossy(medium['url'])
        # BUG FIX: the original asserted medium['url'] == expected_nu, i.e.
        # compared the url with a normalization computed right here — which
        # says nothing about what _update_media_normalized_urls() stored.
        # Check the normalized_url column the function actually writes.
        assert medium['normalized_url'] == expected_nu
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    self.db = connect_to_db()

    # medium -> feed -> story fixture chain plus one download per feed and story
    self.test_medium = create_test_medium(self.db, 'downloads test')
    self.test_feed = create_test_feed(self.db, 'downloads test', self.test_medium)
    self.test_download_feed = create_download_for_feed(self.db, self.test_feed)
    self.test_story = create_test_story(self.db, label='downloads est', feed=self.test_feed)
    self.test_download = create_download_for_story(self.db, feed=self.test_feed, story=self.test_story)

    # store the fixture content so tests can fetch it back
    store_content(db=self.db, download=self.test_download, content=self.__TEST_CONTENT)
def __is_syndicated(self, content: str) -> bool:
    """Create a throwaway story holding ``content`` and report whether it is detected as syndicated."""
    # the label only identifies the fixture; truncate to a sane length
    label = content[:64]

    medium = create_test_medium(db=self.db(), label=label)
    feed = create_test_feed(db=self.db(), label=label, medium=medium)
    story = create_test_story(db=self.db(), label=label, feed=feed)

    story['content'] = content
    story = add_content_to_test_story(db=self.db(), story=story, feed=feed)

    return is_syndicated(db=self.db(), story_title=story['title'], story_text=content)
def setUp(self) -> None:
    """Set config for tests."""
    super().setUp()

    # medium -> feed fixture chain plus a feed download and a story
    self.test_medium = create_test_medium(self.db(), self.TEST_MEDIUM_NAME)
    self.test_feed = create_test_feed(self.db(), self.TEST_FEED_NAME, self.test_medium)
    self.test_download = create_download_for_feed(self.db(), self.test_feed)
    self.test_story = create_test_story(self.db(), label=self.TEST_STORY_NAME, feed=self.test_feed)

    # attach the download to the story and mark it as already fetched
    self.test_download['path'] = 'postgresql:foo'
    self.test_download['state'] = 'success'
    self.test_download['stories_id'] = self.test_story['stories_id']
    self.db().update_by_id('downloads', self.test_download['downloads_id'], self.test_download)
def test_get_spider_feed():
    """Test get_spider_feed()."""
    db = connect_to_db()

    test_medium = create_test_medium(db, 'foo')

    spider_feed = get_spider_feed(db, test_medium)

    assert spider_feed['name'] == SPIDER_FEED_NAME
    assert spider_feed['media_id'] == test_medium['media_id']
    assert spider_feed['active'] is False

    # A second call must return the existing feed rather than create another one.
    assert get_spider_feed(db, test_medium)['feeds_id'] == spider_feed['feeds_id']
def test_get_normalized_title(): db = connect_to_db() # simple title (got_title,) = db.query("select get_normalized_title('foo bar', 0)").flat() assert got_title == 'foo bar' # simple title part title_part = "foo barfoo barfoo barfoo barfoo bar" title = title_part + ': bat baz' (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': title}).flat() assert got_title == title_part title_part = "foo barfoo barfoo barfoo barfoo bar" title = 'bat baz: ' + title_part (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': title}).flat() assert got_title == title_part title_part = "foo barfoo barfoo barfoo barfoo bar" title = 'bat baz - ' + title_part (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': title}).flat() assert got_title == title_part # strip punctuation (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': 'foo!@#bar&*('}).flat() assert got_title == 'foobar' # require 32 character length (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': 'foo bar: bat'}).flat() assert got_title == 'foo barSEPSEP bat' # don't allow medium name as title part medium_name = 'A' * 64 create_test_medium(db, medium_name) title = medium_name + ': foo bar' (got_title,) = db.query("select get_normalized_title(%(title)s, 1)", {'title': title}).flat() assert got_title == medium_name.lower() + 'SEPSEP foo bar'
def setUp(self):
    """Create stories with sentences, a topic covering them and a snapshot of them."""
    super().setUp()

    medium = create_test_medium(db=self.db(), label='test')
    feed = create_test_feed(db=self.db(), label='feed', medium=medium)

    # TEST_STORY_COUNT stories, each with TEST_SENTENCE_PER_STORY_COUNT sentences
    for story_num in range(self.TEST_STORY_COUNT):
        story = create_test_story(db=self.db(), label='story-%d' % story_num, feed=feed)
        for sentence_number in range(1, self.TEST_SENTENCE_PER_STORY_COUNT + 1):
            self.db().create(table='story_sentences', insert_hash={
                'stories_id': story['stories_id'],
                'media_id': medium['media_id'],
                'publish_date': story['publish_date'],
                'sentence_number': sentence_number,
                'sentence': 'story {}, sentence {}'.format(story['stories_id'], sentence_number),
            })

    # Test topic
    topic = create_test_topic(db=self.db(), label='test')
    self.topics_id = topic['topics_id']

    # attach every story to the topic
    self.db().query("""
        INSERT INTO topic_stories (topics_id, stories_id)
        SELECT %(topics_id)s, stories_id
        FROM stories
    """, {'topics_id': self.topics_id})

    # Test snapshot
    self.snapshots_id = self.db().query("""
        INSERT INTO snapshots (topics_id, snapshot_date, start_date, end_date)
        VALUES (%(topics_id)s, NOW(), NOW(), NOW())
        RETURNING snapshots_id
    """, {'topics_id': self.topics_id}).flat()[0]

    # snapshot every story
    self.db().query("""
        INSERT INTO snap.stories (snapshots_id, media_id, stories_id, url, guid, title, publish_date, collect_date)
        SELECT %(snapshots_id)s, media_id, stories_id, url, guid, title, publish_date, collect_date
        FROM stories
    """, {'snapshots_id': self.snapshots_id})