def test_extract(self) -> None:
    """Test extract()."""
    db = self.db()

    html = '<script>ignore</script><p>foo</p>'
    mediawords.dbi.downloads.store_content(db, self.test_download, html)

    result = mediawords.dbi.downloads.extract(db=db, download=self.test_download)
    assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
    assert result['extracted_text'].strip() == 'foo.'

    # Warm the extractor cache with the original content...
    mediawords.dbi.downloads.store_content(db, self.test_download, html)
    mediawords.dbi.downloads.extract(
        db=db,
        download=self.test_download,
        extractor_args=PyExtractorArguments(use_cache=True),
    )

    # ...then overwrite the content; with use_cache=True, the cached results for
    # the original content should still be returned.
    mediawords.dbi.downloads.store_content(db, self.test_download, 'bar')
    result = mediawords.dbi.downloads.extract(
        db=db,
        download=self.test_download,
        extractor_args=PyExtractorArguments(use_cache=True),
    )
    assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
    assert result['extracted_text'].strip() == 'foo.'
def extract(db: DatabaseHandler,
            download: dict,
            extractor_args: PyExtractorArguments = PyExtractorArguments()) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - extractor arguments, e.g. whether to get and set results in the extractor cache

    Returns: see extract_content() below

    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if extractor_args.use_cache():
        log.debug("Fetching cached extractor results for download {}...".format(downloads_id))
        results = _get_extractor_results_cache(db, download)
        if results is not None:
            return results

    log.debug("Fetching content for download {}...".format(downloads_id))
    content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(len(content), downloads_id))
    results = extract_content(content)
    log.debug("Done extracting {} characters of content for download {}.".format(len(content), downloads_id))

    if extractor_args.use_cache():
        log.debug("Caching extractor results for download {}...".format(downloads_id))
        _set_extractor_results_cache(db, download, results)

    return results
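# A minimal usage sketch; the _example_* helper is hypothetical and not part of
# the module. Per the assertions in test_extract() above, the result dict
# carries at least 'extracted_html' and 'extracted_text'.
def _example_extract_usage(db: DatabaseHandler, download: dict) -> str:
    # Enable the cache so repeated extractions of the same download are cheap
    result = extract(db=db, download=download, extractor_args=PyExtractorArguments(use_cache=True))
    return result['extracted_text']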
def extract_and_create_download_text(db: DatabaseHandler,
                                     download: dict,
                                     extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extraction_result = extract(db=db, download=download, extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    download_text = None
    if extractor_args.use_existing():
        log.debug("Fetching download text for download {}...".format(downloads_id))
        download_text = db.query("""
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {'downloads_id': downloads_id}).hash()

    if download_text is None:
        log.debug("Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db, download=download, extract=extraction_result)

    return download_text
def get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted HTML for the story.

    We don't store the extracted HTML of a story, so we have to get the first download associated with the story and
    run the extractor on it.

    """
    download = db.query(
        """
        WITH d AS (
            SELECT *
            FROM downloads
            WHERE stories_id = %(a)s
              AND type = 'content'
              AND state = 'success'
        )

        -- goofy CTE to avoid a bad query plan
        SELECT *
        FROM d
        ORDER BY downloads_id
        LIMIT 1
        """,
        {'a': story['stories_id']}).hash()

    extractor_results = mediawords.dbi.downloads.extract(db, download, PyExtractorArguments(use_cache=True))
    return extractor_results['extracted_html']
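# Usage sketch; the _example_* helper is hypothetical. 'story' is a
# stories-table row dict, as elsewhere in this module; get_extracted_html()
# re-runs the extractor on the story's first successful content download.
def _example_get_extracted_html(db: DatabaseHandler, stories_id: int) -> str:
    story = db.find_by_id(table='stories', object_id=stories_id)
    return get_extracted_html(db, story)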
def test_extract_and_create_download_text(self) -> None:
    download_text = mediawords.dbi.downloads.extract_and_create_download_text(
        db=self.db(),
        download=self.test_download,
        extractor_args=PyExtractorArguments(),
    )
    assert download_text
    assert download_text['download_text'] == 'foo.'
    assert download_text['downloads_id'] == self.test_download['downloads_id']
@classmethod
def run_job(cls, stories_id: int, use_cache: bool = False) -> None:
    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    if not stories_id:
        raise McExtractAndVectorException("'stories_id' is not set.")

    db = connect_to_db()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McExtractAndVectorException("Story with ID {} was not found.".format(stories_id))

    if medium_is_locked(db=db, media_id=story['media_id']):
        log.warning("Requeueing job for story {} in locked medium {}...".format(stories_id, story['media_id']))
        ExtractAndVectorJob._consecutive_requeues += 1

        # Prevent spamming these requeue events if the locked media source is the only one in the queue
        if ExtractAndVectorJob._consecutive_requeues > ExtractAndVectorJob._SLEEP_AFTER_REQUEUES:
            log.warning("Story extraction job has been requeued more than {} times, waiting before requeueing...".format(
                ExtractAndVectorJob._consecutive_requeues))
            time.sleep(1)

        ExtractAndVectorJob.add_to_queue(stories_id=stories_id)

        return

    ExtractAndVectorJob._consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache)
        extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
    except Exception as ex:
        # Roll back the failed transaction before re-raising so the connection
        # isn't left mid-transaction (assumes the standard DatabaseHandler
        # rollback() helper)
        db.rollback()
        raise McExtractAndVectorException("Extractor died while extracting story {}: {}".format(stories_id, ex))

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))
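# Usage sketch; the _example_* helper is hypothetical. Jobs are enqueued by
# story ID with the same add_to_queue() call that run_job() uses for
# requeueing; use_cache is passed through to PyExtractorArguments when the job
# actually runs.
def _example_enqueue_extraction(stories_id: int) -> None:
    ExtractAndVectorJob.add_to_queue(stories_id=stories_id)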
def _extract_story(db: DatabaseHandler, story: dict) -> None:
    """Process the story through the extractor."""
    if url_has_binary_extension(story['url']):
        return

    # Skip LiveJournal tag and profile pages, which aren't stories
    if re2.search(r'livejournal.com\/(tag|profile)', story['url'], re2.I):
        return

    extractor_args = PyExtractorArguments(use_cache=True, use_existing=True, no_dedup_sentences=False)
    mediawords.dbi.stories.stories.extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
          AND state = 'success'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
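# Usage sketch; the _example_* helper is hypothetical. extract_and_process_story()
# only opens a transaction when none is active, so callers like run_job() above
# can wrap it in their own db.begin()/db.commit() pair.
def _example_extract_story(db: DatabaseHandler, stories_id: int) -> None:
    story = db.find_by_id(table='stories', object_id=stories_id)
    extract_and_process_story(db=db, story=story, extractor_args=PyExtractorArguments(use_cache=True))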
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry.

    If there are no remaining downloads to be extracted for the story, call process_extracted_story() on the parent
    story.

    """
    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")
    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
def process_extracted_story(db: DatabaseHandler, story: dict, extractor_args: PyExtractorArguments) -> None:
    """Do post extraction story processing work by calling update_story_sentences_and_language()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    log.debug("Updating sentences and language for story {}...".format(stories_id))
    update_story_sentences_and_language(db=db, story=story, extractor_args=extractor_args)

    if not extractor_args.no_tag_extractor_version():
        log.debug("Updating extractor version tag for story {}...".format(stories_id))
        update_extractor_version_tag(db=db, story=story)

    # Extract -> CLIFF -> NYTLabels -> mark_as_processed() chain
    cliff = CLIFFAnnotator()
    if cliff.annotator_is_enabled() and cliff.story_is_annotatable(db=db, stories_id=stories_id):
        # If the CLIFF annotator is enabled, the cliff/update_story_tags job will check whether the NYTLabels
        # annotator is enabled, and if it is, will pass the story further to NYTLabels. NYTLabels, in turn, will mark
        # the story as processed.
        log.debug("Adding story {} to CLIFF annotation queue...".format(stories_id))
        CLIFFFetchAnnotationJob.add_to_queue(stories_id=stories_id)

    else:
        log.debug("Won't add {} to CLIFF annotation queue because it's not annotatable with CLIFF".format(stories_id))

        nytlabels = NYTLabelsAnnotator()
        if nytlabels.annotator_is_enabled() and nytlabels.story_is_annotatable(db=db, stories_id=stories_id):
            # If the CLIFF annotator is disabled, pass the story to the NYTLabels annotator which, if run, will mark
            # the story as processed
            log.debug("Adding story {} to NYTLabels annotation queue...".format(stories_id))
            NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id)

        else:
            log.debug("Won't add {} to NYTLabels annotation queue because it's not annotatable with NYTLabels".format(
                stories_id
            ))

            # If neither of the annotators is enabled, mark the story as processed ourselves
            log.debug("Marking the story as processed...")
            if not mark_as_processed(db=db, stories_id=stories_id):
                raise McProcessExtractedStoryException("Unable to mark story ID {} as processed".format(stories_id))
def update_story_sentences_and_language(db: DatabaseHandler,
                                        story: dict,
                                        extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating
    new ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).

    """
    story = decode_object_from_bytes_if_needed(story)

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
    if not story_text:
        story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text, story_lang=story_lang)

    if (not story.get('language', None)) or story.get('language', None) != story_lang:
        db.query("""
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
        """, {'stories_id': stories_id, 'story_lang': story_lang})
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException("Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))
        # Don't leave the transaction we opened dangling on the early return
        if use_transaction:
            db.commit()
        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()
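# Usage sketch; the _example_* helper is hypothetical. Per the docstring above,
# sentence deduplication can be skipped when reprocessing a small set of
# stories (the no_dedup_sentences flag is also used that way in _extract_story()
# above).
def _example_revector_story(db: DatabaseHandler, story: dict) -> None:
    args = PyExtractorArguments(no_dedup_sentences=True)
    update_story_sentences_and_language(db=db, story=story, extractor_args=args)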