def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls.

    For each matching story, the content of its first download is copied into the new topic_seed_urls
    row (best effort: fetch failures are logged and the seed url is created with empty content), any
    topic_links that point at the story are reset for re-spidering, and the story is removed from
    topic_stories.  The per-story move is wrapped in a transaction.
    """
    topic = decode_object_from_bytes_if_needed(topic)

    # Topic stories whose medium has foreign_rss_links set, except those explicitly
    # marked as valid foreign rss stories.
    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        # The first download (lowest downloads_id) holds the story's original content
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            # Best effort: the seed url is still created, just with empty content
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        # Create the seed url and drop the story from the topic atomically
        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        # Un-spider links pointing at this story so the spider can process them again
        db.query(
            """
            update topic_links set ref_stories_id = null, link_spidered = 'f'
                where topics_id = %(b)s and ref_stories_id = %(a)s
            """,
            {'a': story['stories_id'], 'b': topic['topics_id']})

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def extract(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments = PyExtractorArguments()) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    use_cache - get and set results in extractor cache

    Returns:
    see extract_content() below
    """
    download = decode_object_from_bytes_if_needed(download)

    dl_id = download['downloads_id']
    caching_enabled = extractor_args.use_cache()

    # Try the extractor cache first, if the caller asked for it
    if caching_enabled:
        log.debug("Fetching cached extractor results for download {}...".format(dl_id))
        cached_results = _get_extractor_results_cache(db, download)
        if cached_results is not None:
            return cached_results

    log.debug("Fetching content for download {}...".format(dl_id))
    raw_content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(len(raw_content), dl_id))
    extraction = extract_content(raw_content)
    log.debug("Done extracting {} characters of content for download {}.".format(len(raw_content), dl_id))

    # Store fresh results back into the cache for the next caller
    if caching_enabled:
        log.debug("Caching extractor results for download {}...".format(dl_id))
        _set_extractor_results_cache(db, download, extraction)

    return extraction
def _get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Return the extracted html for the story.

    The extracted html of a story is not stored, so fetch the first content download
    associated with the story and run the extractor on it.  Returns an empty string
    when the story has no successful content download.
    """
    first_download = db.query(
        """
        with d as (
            select * from downloads
                where stories_id = %(a)s and type = 'content' and state = 'success'
        )

        -- goofy cte to avoid bad query plan
        select * from d order by downloads_id limit 1
        """,
        {'a': story['stories_id']}).hash()

    if not first_download:
        return ''

    page_html = fetch_content(db, first_download)

    extraction = extract_article_html_from_page_html(page_html)
    return extraction['extracted_html']
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=url, content=content)

    # Basic story fields come from the page title / meta date
    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == url
    assert story['guid'] == url

    # A medium should have been created from the url's host
    medium = db.require_by_id('media', story['media_id'])
    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    # Story should be attached to the spider feed
    feed = db.query(
        "select f.* from feeds f join feeds_stories_map fsm using ( feeds_id ) where stories_id = %(a)s",
        {'a': story['stories_id']}).hash()
    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    # Date guessing should be recorded via tags
    (date_tag, date_tag_set) = get_story_date_tag(db, story)
    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    # The raw content should be stored as a download
    download = db.query(
        "select * from downloads where stories_id = %(a)s",
        {'a': story['stories_id']}).hash()
    assert download is not None
    assert download['url'] == story['url']
    assert fetch_content(db, download) == content

    # fallback_date is used when no date can be guessed from the content
    fallback_story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
        fallback_date='2011-11-11',
    )
    assert fallback_story['publish_date'] == '2011-11-11 00:00:00'

    # Generating the same url again should match the existing story
    matched_story = generate_story(db, fallback_story['url'], 'foo')
    assert matched_story['stories_id'] == fallback_story['stories_id']

    # An invalid url should still yield a story
    assert generate_story(db=db, url='invalid url', content='foo') is not None
def test_copy_story_to_new_medium():
    """Test copy_story_to_new_medium."""
    db = connect_to_db()

    # Build a topic with a story (plus content) in an old medium
    topic = create_test_topic(db, 'copy foo')
    new_medium = create_test_medium(db, 'copy new')
    old_medium = create_test_medium(db, 'copy old')
    old_feed = create_test_feed(db=db, label='copy old', medium=old_medium)
    old_story = create_test_story(db=db, label='copy old', feed=old_feed)
    add_content_to_test_story(db, old_story, old_feed)
    add_to_topic_stories(db, old_story, topic)

    new_story = copy_story_to_new_medium(db, topic, old_story, new_medium)
    assert db.find_by_id('stories', new_story['stories_id']) is not None

    # The identifying story fields must be copied over verbatim
    for copied_field in ('title', 'url', 'guid', 'publish_date'):
        assert old_story[copied_field] == new_story[copied_field]

    # The copy must be part of the topic
    topic_story_exists = db.query("""
        SELECT *
        FROM topic_stories
        WHERE topics_id = %(topics_id)s
          AND stories_id = %(stories_id)s
    """, {
        'topics_id': topic['topics_id'],
        'stories_id': new_story['stories_id'],
    }).hash()
    assert topic_story_exists is not None

    # The copy must have a download with non-empty content
    new_download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hash()
    assert new_download is not None

    content = fetch_content(db, new_download)
    assert content is not None and len(content) > 0

    # Story sentences must have been copied as well
    story_sentences = db.query("""
        SELECT *
        FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': new_story['stories_id'],
    }).hashes()
    assert len(story_sentences) > 0
def store_and_verify_content(db: DatabaseHandler, download: dict, content: str) -> None:
    """Call store content and then poll verifying that the content has been stored.

    Only return once we have verified that the content has been stored. Raise an error after a timeout
    if the content is not found. It seems like S3 content is not available for fetching until a small
    delay after writing it. This function makes sure the content is there once the store operation is done.

    Arguments:
    db - db handle
    download - download dict from db
    content - content that was just stored for the download
    """
    store_content(db, download, content)

    tries = 0
    while True:
        try:
            # A successful fetch is the confirmation that the store completed
            fetch_content(db, download)
            break
        except Exception:
            # Give up once we have retried past the configured timeout
            if tries > STORE_CONTENT_TIMEOUT:
                # Bare raise preserves the original traceback of the fetch failure
                raise

            # Fixed typo: message previously said "story_and_verify_content"
            log.debug(f"store_and_verify_content: waiting to retry verification ({tries}) ...")

            tries += 1
            time.sleep(1)
def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
    """Parse youtube embedded video urls out of the full html of the story.

    This function looks for youtube embed links anywhere in the html of the story content, rather than
    just in the extracted html. It aims to return a superset of all youtube embed links by returning
    every iframe src= attribute that includes the string 'youtube'.

    Arguments:
    db - db handle
    story - story dict from db

    Returns:
    list of string urls
    """
    # Fetch the story's first download.  Fixed: order by downloads_id -- ordering by
    # stories_id was a no-op (the WHERE clause pins stories_id), so "first" download
    # was arbitrary; every sibling query in this codebase orders by downloads_id.
    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """, {
            'stories_id': story['stories_id'],
        }).hash()

    html = fetch_content(db, download)

    soup = BeautifulSoup(html, 'lxml')

    links = []
    for tag in soup.find_all('iframe', src=True):
        url = tag['src']

        if 'youtube' not in url:
            continue

        # Protocol-relative src ("//www.youtube.com/...") -> absolute url
        if not url.lower().startswith('http'):
            url = 'http:' + url

        url = url.strip()
        url = url.replace('youtube-embed', 'youtube')

        links.append(url)

    return links
def test_fetch_content(self) -> None:
    """Test fetch_content by manually storing using the PostgreSQL store and then trying to fetch it."""
    db = self._db

    # fetch_content() must reject a download without an id or in an errored state
    with self.assertRaises(McDBIDownloadsException):
        fetch_content(db=db, download={})

    with self.assertRaises(McDBIDownloadsException):
        fetch_content(db=db, download={'downloads_id': 1, 'state': 'error'})

    s3_config = _default_amazon_s3_downloads_config()

    class DoNotReadAllFromS3DownloadStorageConfig(DownloadStorageConfig):

        @staticmethod
        def read_all_from_s3():
            return False

        @staticmethod
        def fallback_postgresql_to_s3():
            return False

    storage_config = DoNotReadAllFromS3DownloadStorageConfig()

    store = _get_store_for_reading(
        download=self.test_download,
        amazon_s3_downloads_config=s3_config,
        download_storage_config=storage_config,
    )

    # Store once as str and once as bytes; both must round-trip to the same str
    for payload in ('foo bar', b'foo bar'):
        store.store_content(db=db, object_id=self.test_download['downloads_id'], content=payload)

        fetched = fetch_content(
            db=db,
            download=self.test_download,
            download_storage_config=storage_config,
        )
        assert fetched == 'foo bar'
def _get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Return the extracted html for the story.

    The extracted html of a story is not stored, so fetch the first successful content
    download associated with the story and run the extractor on it.  Content that does
    not look like html is passed through (truncated) or dropped instead of extracted.
    """
    first_download = db.query(
        """
        WITH d AS (
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
              AND type = 'content'
              AND state = 'success'
        )

        -- goofy cte to avoid bad query plan
        SELECT *
        FROM d
        ORDER BY downloads_id
        LIMIT 1
        """,
        {'stories_id': story['stories_id']}).hash()

    if not first_download:
        return ''

    page_html = fetch_content(db, first_download)

    # avoid extracting large binary files
    leading_chunk = page_html[0:1000]
    if '<' not in leading_chunk:
        # No markup up front: keep a truncated copy only if it mentions urls at all
        return page_html[0:1000000] if 'http' in page_html else ''

    extraction = extract_content(page_html)
    return extraction['extracted_html']
def test_full_chain(self):
    """Poll until the transcript is ready, then store it and verify the raw download."""
    handler = DefaultHandler()
    fetch_id = self.transcript_fetches[0]['podcast_episode_transcript_fetches_id']

    # Poll for up to ~2 minutes (60 attempts, 2 s apart) until the transcript shows up
    transcript = None
    for attempt in range(1, 60 + 1):
        log.info(f"Waiting for transcript to be finished (#{attempt})...")

        transcript = handler.fetch_transcript(
            db=self.db,
            podcast_episode_transcript_fetches_id=fetch_id,
        )
        if transcript:
            log.info("Transcript is here!")
            break

        time.sleep(2)

    assert transcript
    assert transcript.stories_id
    assert len(transcript.utterances) == 1
    assert len(transcript.utterances[0].alternatives) == 1
    assert 'kim kardashian' in transcript.utterances[0].alternatives[0].text.lower()

    # Storing the transcript must produce a fetchable download
    downloads_id = handler.store_transcript(db=self.db, transcript=transcript)
    download = self.db.find_by_id(table='downloads', object_id=downloads_id)

    raw_download = fetch_content(db=self.db, download=download)
    assert raw_download
    assert 'kim kardashian' in raw_download.lower()
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text,
    and so on. Return the new story.
    """
    # New story row shares everything with the old one except media_id and collect_date
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy the old story's tags onto the new story in one statement
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    # Attach the new story to the new medium's spider feed
    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # First download of the old story holds its original content
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # Could not copy content; mirror the old download's state onto the new one instead
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the select's WHERE use the NEW
        # download's id, so this copies the new download's own (not yet existing) text --
        # presumably the source should be old_download['downloads_id']; verify upstream.
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # Copy the story sentences over to the new stories_id
    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'b': old_story['stories_id']})

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text,
    and so on. Return the new story.
    """
    # New story row shares everything with the old one except media_id and collect_date
    story = db.create(
        'stories',
        {
            'url': old_story['url'],
            'media_id': new_medium['media_id'],
            'guid': old_story['guid'],
            'publish_date': old_story['publish_date'],
            'collect_date': sql_now(),
            'description': old_story['description'],
            'title': old_story['title']
        },
    )

    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # Copy tags one by one; ON CONFLICT makes each copy idempotent
    for old_story_tag in db.query(
            """
            SELECT tags_id
            FROM stories_tags_map
            WHERE stories_id = %(stories_id)s
            ORDER BY tags_id
            """,
            {'stories_id': old_story['stories_id']},
    ).hashes():
        stories_id = story['stories_id']
        tags_id = old_story_tag['tags_id']

        db.query("""
            INSERT INTO stories_tags_map (stories_id, tags_id)
            VALUES (%(stories_id)s, %(tags_id)s)
            ON CONFLICT (stories_id, tags_id) DO NOTHING
        """, {
            'stories_id': stories_id,
            'tags_id': tags_id,
        })

    # Attach the new story to the new medium's spider feed
    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # First download of the old story holds its original content
    old_download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY downloads_id
        LIMIT 1
        """,
        {
            'stories_id': old_story['stories_id'],
        }
    ).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # Could not copy content; mirror the old download's state onto the new one instead
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the select's WHERE use the NEW
        # download's id, so this copies the new download's own (not yet existing) text --
        # presumably the source should be old_download['downloads_id']; verify upstream.
        db.query(
            """
            INSERT INTO download_texts (
                downloads_id,
                download_text,
                download_text_length
            )
            SELECT
                %(downloads_id)s,
                dt.download_text,
                dt.download_text_length
            FROM download_texts AS dt
            WHERE dt.downloads_id = %(downloads_id)s
            """,
            {
                'downloads_id': download['downloads_id'],
            },
        )

    # Copy the story sentences over to the new stories_id
    # noinspection SqlInsertValues
    db.query(
        """
        INSERT INTO story_sentences (
            stories_id,
            sentence_number,
            sentence,
            media_id,
            publish_date,
            language
        )
        SELECT
            %(new_stories_id)s,
            sentence_number,
            sentence,
            media_id,
            publish_date,
            language
        FROM story_sentences
        WHERE stories_id = %(old_stories_id)s
        """,
        {
            'old_stories_id': old_story['stories_id'],
            'new_stories_id': int(story['stories_id']),
        },
    )

    return story
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls.

    For each matching story, the content of its first download is copied into the new topic_seed_urls
    row (best effort: fetch failures are logged and the seed url is created with empty content), any
    topic_links that point at the story are reset for re-spidering, and the story is removed from
    topic_stories.  The per-story move is wrapped in a transaction.
    """
    topic = decode_object_from_bytes_if_needed(topic)

    # Topic stories whose medium has foreign_rss_links set, except those explicitly
    # marked as valid foreign rss stories.
    stories = db.query("""
        WITH topic_stories_from_topic AS (
            SELECT stories_id
            FROM topic_stories
            WHERE topics_id = %(topics_id)s
              AND (NOT valid_foreign_rss_story)
        )

        SELECT stories.*
        FROM stories
            INNER JOIN media ON
                stories.media_id = media.media_id AND
                media.foreign_rss_links
        WHERE stories.stories_id IN (
            SELECT stories_id
            FROM topic_stories_from_topic
        )
    """, {
        'topics_id': topic['topics_id'],
    }).hashes()

    for story in stories:
        # The first download (lowest downloads_id) holds the story's original content
        download = db.query(
            """
            SELECT *
            FROM downloads
            WHERE stories_id = %(stories_id)s
            ORDER BY downloads_id
            LIMIT 1
            """, {
                'stories_id': story['stories_id'],
            }
        ).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            # Best effort: the seed url is still created, just with empty content
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        # Create the seed url and drop the story from the topic atomically
        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'topics_id': topic['topics_id'],
                'url': story['url'],
                'source': 'merge_foreign_rss_stories',
                'content': content,
            },
        )

        # Un-spider links pointing at this story so the spider can process them again
        db.query(
            """
            UPDATE topic_links
            SET ref_stories_id = NULL,
                link_spidered = 'f'
            WHERE topics_id = %(topics_id)s
              AND ref_stories_id = %(ref_stories_id)s
            """,
            {
                'ref_stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.query(
            """
            DELETE FROM topic_stories
            WHERE stories_id = %(stories_id)s
              AND topics_id = %(topics_id)s
            """,
            {
                'stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )
        db.commit()
async def test_workflow():
    """End-to-end test: serve an MP3, run the transcription workflow, verify the stored transcript."""
    db = connect_to_db()
    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # Hand-built HTTP response so we control the exact headers served for the MP3
    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Not localhost as this might get fetched from a remote worker
    mp3_url = hs.page_url('/test.mp3')

    # The workflow picks up the episode from story_enclosures
    db.insert(table='story_enclosures', insert_hash={
        'stories_id': stories_id,
        'url': mp3_url,
        'mime_type': 'audio/mpeg',
        'length': len(test_mp3_data),
    })

    client = workflow_client()

    # Start worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the
            # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    # The workflow must have produced exactly one successful content download for the story
    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup so that the stop_workers_faster()
    # doesn't have to wait that long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcoded_episodes()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    # Basic story fields come from the page title / meta date
    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    # A medium should have been created from the url's host
    medium = db.require_by_id('media', story['media_id'])
    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    # Story should be attached to the spider feed
    feed = db.query(
        """
        SELECT f.*
        FROM feeds_stories_map AS fsm
            INNER JOIN feeds AS f ON
                fsm.feeds_id = f.feeds_id
        WHERE fsm.stories_id = %(stories_id)s
        """, {
            'stories_id': story['stories_id'],
        }).hash()
    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    # Date guessing should be recorded via tags
    (date_tag, date_tag_set) = get_story_date_tag(db, story)
    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    # The raw content should be stored as a download
    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        """, {
            'stories_id': story['stories_id'],
        }).hash()
    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)
    assert content == story_content

    # With no guessable date and no fallback, publish_date stays unset
    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
    )

    # Fixed: compare to None with "is", not "==" (PEP 8 E711)
    assert story['publish_date'] is None

    # Generating the same url again should match the existing story
    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    # An invalid url should still yield a story
    story = generate_story(db=db, url='invalid url', content='foo')
    assert story is not None