def get_text_for_word_counts(db: DatabaseHandler, story: dict) -> str:
    """Get story title + description + body concatenated into a single string.

    This is what is used to fetch text to generate story_sentences, which eventually get imported into Solr.

    If the text of the story ends up being shorter than the description, return the title + description instead of the
    story text (some times the extractor falls down and we end up with better data just using the title + description.
    """
    story = decode_object_from_bytes_if_needed(story)

    # Prefer the full RSS text when the story claims to carry one; otherwise use the extracted body
    if story['full_text_rss']:
        story_text = __get_full_text_from_rss(story)
    else:
        story_text = _get_extracted_text(db=db, story=story)

    # Normalize None to empty strings before comparing lengths
    story_text = story_text or ''
    story_description = story.get('description', '') or ''

    # Extraction fallback: a body shorter than its own description suggests the extractor failed
    if not story_text or len(story_text) < len(story_description):
        parts = [html_strip(story['title']).strip()]
        if story_description:
            parts.append(html_strip(story_description).strip())
        story_text = "\n\n".join(parts)

    return story_text
def __get_full_text_from_rss(story: dict) -> str:
    """Return the story's HTML-stripped title and description joined by a blank line."""
    story = decode_object_from_bytes_if_needed(story)

    title = html_strip(story.get('title', ''))
    description = html_strip(story.get('description', ''))

    return title + "\n\n" + description
async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]:
    """Guess the story's language from its title + description; return it as a BCP 47 code or None."""
    log.info(f"Identifying story language for story {stories_id}...")

    db = connect_to_db_or_raise()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McPermanentError(f"Story {stories_id} was not found.")

    # Podcast episodes typically come with title and description set so try guessing from that
    sample_text = "\n".join([story['title'], html_strip(story['description'])])

    bcp_47_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

        # Convert to BCP 47 identifier
        bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
            iso_639_1_code=iso_639_1_language_code,
            url_hint=story['url'],
        )

    log.info(f"Language code for story {stories_id} is {bcp_47_language_code}")

    return bcp_47_language_code
def import_archive_file(db: DatabaseHandler, file: str) -> None:
    """Import AP story described by XML in file into database.

    :param db: Database handler.
    :param file: Path to an AP archive (ATOM + NITF) XML file.
    """
    log.debug("import ap file: %s" % file)

    with open(file) as fd:
        xml = xmltodict.parse(fd.read())

    entry = xml['sATOM']['entry']
    body = entry['content']['nitf']['body']

    story = dict()
    story['title'] = body['body.head']['hedline']['hl1']['#text']
    story['publish_date'] = entry['published']
    story['description'] = body['body.head'].get('abstract', story['title'])
    story['guid'] = entry['id'].replace('urn:publicid:ap.org:', '')
    story['url'] = entry['link']['@href'] if 'link' in entry else 'http://apnews.com/invalid/%s' % story['guid']

    # make sure body.content is the only child of body; otherwise the unparse command below will fail
    body_content = body.get('body.content', {})
    # FIX: the old expression `body_content['block'] if body_content is not None else {}` raised
    # KeyError whenever 'body.content' was missing (default {} is not None) or lacked a 'block'
    # child; use .get() and treat any falsy value (None from an empty XML element, or {}) as empty.
    content_block = body_content.get('block', {}) if body_content else {}

    content = xmltodict.unparse({'html': {'content': content_block}})
    story['text'] = html_strip(content)

    _import_ap_story(db, story)
def _call_extractor_on_html(content: str) -> dict:
    """Call extractor on the content."""
    content = decode_object_from_bytes_if_needed(content)

    article_html = extract_article_from_html(content)

    return {
        'extracted_html': article_html,
        'extracted_text': html_strip(article_html),
    }
def combine_story_title_description_text(story_title: Optional[str],
                                         story_description: Optional[str],
                                         download_texts: List[str]) -> str:
    """Get the combined story title, story description, and download text of the story in a consistent way."""
    story_title = decode_object_from_bytes_if_needed(story_title)
    story_description = decode_object_from_bytes_if_needed(story_description)
    download_texts = decode_object_from_bytes_if_needed(download_texts)

    # Title and description get HTML-stripped; download texts are appended as-is
    pieces = [
        html_strip(story_title if story_title is not None else ''),
        html_strip(story_description if story_description is not None else ''),
    ]
    pieces.extend(download_texts)

    return "\n***\n\n".join(pieces)
def _call_extractor_on_html(content: str) -> dict:
    """Call extractor on the content."""
    content = decode_object_from_bytes_if_needed(content)

    results = extract_article_html_from_page_html(content)
    article_html = results['extracted_html']

    return {
        'extracted_html': article_html,
        'extracted_text': html_strip(article_html),
        'extractor_version': results['extractor_version'],
    }
def test_html_strip() -> None:
    """Test html_strip()."""
    assert html_strip("<strong>Hellonot </strong>") == "Hellonot"
    assert html_strip("<script>delete</script><p>body</p>") == "body."
    assert html_strip("<title>delete</title><p>content</p>") == "content."
    assert html_strip("<title>delete</title><p>content</p>", include_title=True) == "delete. content."
    assert html_strip("<p>foo\xAD</p>") == "foo."
    # FIX: the input literal had been mangled to a bare "&" (adjacent strings '"&" ""'), which can
    # never equal the expected '&"'; restore the HTML entities so decoding is actually exercised.
    assert html_strip("&amp;&quot;") == '&"'

    html_path = mediawords.util.paths.mc_root_path() + '/mediacloud/test-data/html/strip.html'
    with open(html_path, 'r', encoding='utf8') as fh:
        html = fh.read()

    # just sanity test here to make sure there are no errors without having to use a fixture
    got_text = html_strip(html)
    assert (len(got_text) > 0.05 * len(html))
    assert '<' not in got_text
def fetch_and_store_episode(db: DatabaseHandler,
                            stories_id: int,
                            config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, converts the language code
       to BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:
        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding video stream from the media file, and / or
        c) discarding other audio streams from the media file;
    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    """
    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine language of the story
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    # Fall back to English when identification was not reliable / returned nothing
    if not iso_639_1_language_code:
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(f"There were no viable enclosures found for story {stories_id}")

    # Length is optional in the feed; only enforce the cap when it was declared
    if best_enclosure.length:
        if best_enclosure.length > MAX_ENCLOSURE_SIZE:
            raise McPodcastEnclosureTooBigException(f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}")

    # Fetch enclosure
    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)
    log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
    fetch_big_file(url=best_enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE)
    log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

    if os.stat(input_file_path).st_size == 0:
        # Might happen with misconfigured webservers
        raise McPodcastFileFetchFailureException(f"Fetched file {input_file_path} is empty.")

    # Transcode if needed
    input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=input_filename)
    transcoded_file_obj = transcode_media_file_if_needed(input_media_file=input_file_obj)

    # Unset the variable so that we don't accidentally use it later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    # (Re)read the properties of either the original or the transcoded file
    media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
    best_audio_stream = media_info.best_supported_audio_stream()

    # Store input file to GCS
    try:
        gcs = GCSStore(config=config)
        gcs_uri = gcs.store_object(
            local_file_path=input_file_obj.temp_full_path,
            object_id=str(stories_id),
            mime_type=best_audio_stream.audio_codec_class.mime_type(),
        )
    except Exception as ex:
        log.error(f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}")
        # Clean up, then raise further
        _cleanup_temp_dir(temp=input_file_obj)
        raise ex

    # Clean up the locally stored file as we don't need it anymore
    _cleanup_temp_dir(temp=input_file_obj)

    # Insert everything to the database; upsert so that re-fetching a story updates its episode row
    try:
        db.query("""
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, {
            'stories_id': stories_id,
            'story_enclosures_id': best_enclosure.story_enclosures_id,
            'gcs_uri': gcs_uri,
            'duration': best_audio_stream.duration,
            'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(),
            'sample_rate': best_audio_stream.sample_rate,
            'bcp47_language_code': bcp_47_language_code,
        })
    except Exception as ex_db:
        # Try to delete object on GCS first
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}"))

        raise McPodcastPostgreSQLException(f"Failed inserting episode for story {stories_id}: {ex_db}")
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content().
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    # A story with content cannot claim to be full-text RSS; clear the flag both in memory and in DB
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    # Create the raw 'content' download row for this story
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    # Record the extracted text for the download
    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Split the extracted text and insert one story_sentences row per sentence, numbered from 1
    sentences = lang.split_text_to_sentences(extracted_content)

    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            # Per-sentence language detection may fail; default to English in that case
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read back the download_text row so the caller gets the stored version
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
        'downloads_id': download['downloads_id']
    }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story