def process_extracted_story(db: DatabaseHandler, story: dict, extractor_args: PyExtractorArguments) -> None:
    """Do post-extraction story processing work by calling update_story_sentences_and_language()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    log.debug("Updating sentences and language for story {}...".format(stories_id))
    update_story_sentences_and_language(db=db, story=story, extractor_args=extractor_args)

    # Extract -> CLIFF -> NYTLabels -> mark_as_processed() chain
    if story_is_english_and_has_sentences(db=db, stories_id=stories_id):
        # If the CLIFF annotator is enabled, the cliff/update_story_tags job will check whether the NYTLabels
        # annotator is enabled, and if it is, will pass the story further to NYTLabels. NYTLabels, in turn, will
        # mark the story as processed.
        log.debug("Adding story {} to CLIFF annotation/tagging queue...".format(stories_id))
        JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag').add_to_queue(stories_id=stories_id)

    else:
        log.debug("Won't add {} to CLIFF annotation/tagging queue because it's not annotatable with CLIFF".format(
            stories_id
        ))

        if story_is_english_and_has_sentences(db=db, stories_id=stories_id):
            # If the CLIFF annotator is disabled, pass the story to the NYTLabels annotator which, if run, will mark
            # the story as processed
            log.debug("Adding story {} to NYTLabels annotation/tagging queue...".format(stories_id))
            JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag').add_to_queue(
                stories_id=stories_id
            )

        else:
            log.debug("Won't add {} to NYTLabels annotation/tagging queue because it's not annotatable with "
                      "NYTLabels".format(stories_id))

            # If neither of the annotators is enabled, mark the story as processed ourselves
            log.debug("Marking the story as processed...")
            if not mark_as_processed(db=db, stories_id=stories_id):
                raise McProcessExtractedStoryException("Unable to mark story ID {} as processed".format(stories_id))
def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existing: bool = False) -> None:
    """Extract, vector and process a story."""
    global _consecutive_requeues

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    stories_id = int(stories_id)

    if not stories_id:
        raise McExtractAndVectorException("'stories_id' is not set.")

    db = connect_to_db()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McExtractAndVectorException("Story with ID {} was not found.".format(stories_id))

    if medium_is_locked(db=db, media_id=story['media_id']):
        log.warning("Requeueing job for story {} in locked medium {}...".format(stories_id, story['media_id']))
        _consecutive_requeues += 1

        # Prevent spamming these requeue events if the locked media source is the only one in the queue
        if _consecutive_requeues > _SLEEP_AFTER_REQUEUES:
            log.warning("Story extraction job has been requeued more than {} times, waiting before requeueing...".format(
                _consecutive_requeues
            ))
            time.sleep(1)

        JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id)

        return

    _consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing)
        extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
    except Exception as ex:
        raise McExtractAndVectorException("Extractor died while extracting story {}: {}".format(stories_id, ex))

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))
def run_cliff_tags_from_annotation(stories_id: int) -> None:
    """Fetch a story's CLIFF annotation and use it to generate/store tags."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFTagsFromAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (stories_id, str(ex))
        )

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag').add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)
def run_cliff_fetch_annotation(stories_id: int) -> None:
    """Fetch story's CLIFF annotation."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story %d with CLIFF: %s" % (stories_id, str(ex)))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)
def store_response(self, db: DatabaseHandler, download: dict, response: Response) -> None:
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']
    download_url = download['url']

    log.info(f"Handling download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} which is about to be handled: {download_url})")

    if not response.is_success():
        log.info(f"Download {downloads_id} errored: {response.decoded_content()}")
        self._store_failed_download_error_message(db=db, download=download, response=response)
        return

    supported_content_types_regex = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
    if re.search(supported_content_types_regex, response.content_type() or ''):
        content = response.decoded_content()
    else:
        content = '(unsupported content type)'

    db.query("""
        UPDATE downloads
        SET url = %(download_url)s
        WHERE downloads_id = %(downloads_id)s
          AND url != %(download_url)s
    """, {
        'downloads_id': downloads_id,
        'download_url': download_url,
    })

    story_ids_to_extract = self.store_download(db=db, download=download, content=content)

    for stories_id in story_ids_to_extract:
        log.debug(f"Adding story {stories_id} for download {downloads_id} to extraction queue...")
        JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    log.info(f"Handled download {downloads_id}...")
    log.debug(f"(URL of download {downloads_id} that was just handled: {download_url})")
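# A minimal, self-contained sketch (not part of the handler above) showing how the "supported content types" regex in
# store_response() decides whether a download's body is kept or replaced with '(unsupported content type)'. The sample
# MIME types below are made up for illustration; only the stdlib "re" module is assumed.
import re

_SUPPORTED_CONTENT_TYPES = re.compile(r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)

for sample_content_type in ('text/html; charset=UTF-8', 'application/rss+xml', 'application/json', 'image/png'):
    if _SUPPORTED_CONTENT_TYPES.search(sample_content_type):
        print(f"{sample_content_type}: body would be stored as-is")
    else:
        print(f"{sample_content_type}: body would be stored as '(unsupported content type)'")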
def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
    story_ids = super().add_stories_from_feed(db=db, download=download, content=content)

    # Add a podcast-transcribe-episode job for every newly added story
    for stories_id in story_ids:
        log.info(f"Adding a podcast episode transcribe job for story {stories_id}...")
        JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode').add_to_queue(stories_id=stories_id)

    return story_ids
def run_topics_fetch_link(topic_fetch_urls_id: int, domain_timeout: Optional[int] = None) -> None:
    """Fetch a link for a topic and either match it to an existing story or generate a story from it.

    Almost all of the interesting functionality here happens in fetch_topic_url(). The code here just deals with
    routing, including requeueing responses throttled by mediawords.util.web.user_agent.throttled.
    """
    global _consecutive_requeues

    if isinstance(topic_fetch_urls_id, bytes):
        topic_fetch_urls_id = decode_object_from_bytes_if_needed(topic_fetch_urls_id)

    if topic_fetch_urls_id is None:
        raise McFetchLinkJobException("'topic_fetch_urls_id' is None.")

    topic_fetch_urls_id = int(topic_fetch_urls_id)

    db = connect_to_db()

    # FIXME topics_id could be passed as an argument
    topics_id = db.query("""
        SELECT topics_id
        FROM topic_fetch_urls
        WHERE topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
        'topic_fetch_urls_id': topic_fetch_urls_id
    }).flat()[0]

    log.info(f"Starting fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}")

    try:
        if not fetch_topic_url_update_state(db=db,
                                            topics_id=topics_id,
                                            topic_fetch_urls_id=topic_fetch_urls_id,
                                            domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("Sleeping after %d consecutive requeues..." % _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to reset the
        # "consecutive requeues" counter here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info(f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}")
def add_us_media_to_sitemap_queue():
    us_media_ids = [
        104828, 1089, 1092, 1095, 1098, 1101, 1104, 1110, 1145, 1149, 1150, 14, 15, 1747, 1750, 1751, 1752, 1755,
        18268, 18710, 18775, 18839, 18840, 19334, 19643, 1, 22088, 25349, 25499, 27502, 2, 40944, 4415, 4419, 4442,
        4, 6218, 623382, 64866, 65, 6, 751082, 7, 8,
    ]
    us_media_ids = sorted(us_media_ids)

    for media_id in us_media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
async def add_to_extraction_queue(self, stories_id: int) -> None:
    log.info(f"Adding an extraction job for story {stories_id}...")

    job_broker = JobBroker(
        queue_name='MediaWords::Job::ExtractAndVector',
        rabbitmq_config=RabbitMQConfig(
            # Keep RabbitMQ's timeout smaller than the action's "start_to_close_timeout"
            timeout=60,
            # Disable retries as Temporal will be the one that does all the retrying
            retries=None,
        ),
    )

    # add_to_queue() is not idempotent but it's not a big deal to extract a single story twice
    job_broker.add_to_queue(stories_id=stories_id)

    log.info(f"Done adding an extraction job for story {stories_id}")
def setup_test_index(db: DatabaseHandler) -> None:
    """Run a full Solr import based on the current PostgreSQL database.

    Due to a failsafe built into generate_and_import_data(), the delete of the collection data will fail if there are
    more than 100 million sentences in the index (to prevent accidental deletion of production data).
    """
    queue_all_stories(db)

    JobBroker(queue_name='MediaWords::Job::ImportSolrDataForTesting').run_remotely(full=True, throttle=False)
def add_all_media_to_sitemap_queue(db: DatabaseHandler):
    """Add all media IDs to XML sitemap fetching queue."""
    log.info("Fetching all media IDs...")
    media_ids = db.query("""
        SELECT media_id
        FROM media
        ORDER BY media_id
    """).flat()

    for media_id in media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given an AP story returned by get_new_stories(), add it to the database."""
    ap_medium = db.query("""
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, {
        'medium_name': AP_MEDIUM_NAME,
    }).hash()

    ap_feed = {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com',
    }
    ap_feed = db.find_or_create('feeds', ap_feed)

    story = {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id'],
    }
    story = add_story(db, story, ap_feed['feeds_id'])

    if not story:
        return

    story_download = create_download_for_new_story(db, story, ap_feed)

    download_text = {
        'downloads_id': story_download['downloads_id'],
        'download_text': ap_story['text'],
        'download_text_length': len(ap_story['text']),
    }
    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
    """, download_text)

    # Send the story to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
def _extract_story(story: dict) -> None:
    """Process the story through the extractor."""
    if url_has_binary_extension(story['url']):
        return

    if re2.search(r'livejournal.com\/(tag|profile)', story['url'], re2.I):
        return

    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').run_remotely(
        stories_id=story['stories_id'],
        use_cache=True,
        use_existing=True,
    )
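# A minimal sketch (not part of the module above) of the LiveJournal URL filter in _extract_story(): tag and profile
# pages are skipped, everything else is sent to the extractor. The stdlib "re" module stands in for re2 here purely
# for illustration, and the example URLs are made up.
import re

_LIVEJOURNAL_SKIP = re.compile(r'livejournal.com\/(tag|profile)', re.IGNORECASE)

for sample_url in ('https://user.livejournal.com/tag/politics', 'https://user.livejournal.com/12345.html'):
    if _LIVEJOURNAL_SKIP.search(sample_url):
        print(f"{sample_url}: skipped")
    else:
        print(f"{sample_url}: queued for extraction")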
def run_podcast_fetch_transcript(podcast_episode_transcript_fetches_id: int) -> None:
    """Fetch a completed episode transcript from the Speech API and store it for the story."""
    if isinstance(podcast_episode_transcript_fetches_id, bytes):
        podcast_episode_transcript_fetches_id = decode_object_from_bytes_if_needed(
            podcast_episode_transcript_fetches_id
        )

    podcast_episode_transcript_fetches_id = int(podcast_episode_transcript_fetches_id)

    if not podcast_episode_transcript_fetches_id:
        fatal_error("'podcast_episode_transcript_fetches_id' is unset.")

    db = connect_to_db()

    log.info(f"Fetching transcript for fetch ID {podcast_episode_transcript_fetches_id}...")

    try:
        stories_id = fetch_store_transcript(
            db=db,
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )

        if stories_id:
            JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(f"Fatal / unknown error while fetching transcript "
                    f"for ID {podcast_episode_transcript_fetches_id}: {ex}")

    log.info(f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}")
def add_colombia_media_to_sitemap_queue():
    colombia_media_ids = [
        38871, 40941, 42072, 57482, 58360, 58430, 58660, 59058, 59589, 60338, 61607, 62209, 63889, 63921, 74622,
        120254, 127258, 211343, 277109, 280236, 281924, 282160, 282256, 282463, 282769, 283998, 297900, 324728,
        325564, 325966, 326385, 326782, 328053, 329452, 329735, 330235, 330576, 331318, 331987, 336326, 336339,
        336682, 336993, 340969, 341040, 347037, 347551, 348018, 348021, 348023, 348024, 348026, 348029, 348031,
        348032, 348033, 348034, 348035, 348037, 348038, 348040, 348041, 348043, 348044, 348048, 348049, 348050,
        348052, 348054, 348058, 348060, 348061, 348062, 348063, 348064, 348066, 348067, 348068, 348069, 348070,
        348072, 348073, 348074, 348075, 348077, 348078, 348079, 348081, 348083, 348084, 357882, 359251, 362163,
        362287, 362386, 362587, 363868, 467798, 540413, 552466, 552579, 558121, 559945, 563374, 565190, 565808,
        567421, 651490, 651491, 651492, 651493, 651494, 655394, 655395, 683226, 683288, 683554, 695708, 695709,
        695710, 695711, 695712, 695713, 695715, 845114, 849762, 879769, 1180124, 1195863, 1195913, 1207868, 1208757,
        1265854,

        # Extra media sources not in collection
        326186, 855592, 879585, 851767,
    ]
    colombia_media_ids = sorted(colombia_media_ids)

    for media_id in colombia_media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
def setUp(self) -> None:
    super().setUp()

    self.db = connect_to_db()

    test_medium = create_test_medium(db=self.db, label='test')
    test_feed = create_test_feed(db=self.db, label='test', medium=test_medium)

    # Add a story with a random ID to decrease the chance that an object in GCS will collide with another test
    # running at the same time
    self.stories_id = random.randint(1, 2147483647 - 1)

    self.db.query("""
        INSERT INTO stories (
            stories_id,
            media_id,
            url,
            guid,
            title,
            description,
            publish_date,
            collect_date,
            full_text_rss
        ) VALUES (
            %(stories_id)s,
            %(media_id)s,
            'http://story.test/',
            'guid://story.test/',
            'story',
            'description',
            '2016-10-15 08:00:00',
            '2016-10-15 10:00:00',
            true
        )
    """, {
        'stories_id': self.stories_id,
        'media_id': test_feed['media_id'],
    })

    # Create missing partitions for "feeds_stories_map"
    self.db.query('SELECT create_missing_partitions()')

    self.db.create(table='feeds_stories_map', insert_hash={
        'feeds_id': int(test_feed['feeds_id']),
        'stories_id': self.stories_id,
    })

    assert os.path.isfile(self.input_media_path()), f"Test media file '{self.input_media_path()}' should exist."

    with open(self.input_media_path(), mode='rb') as f:
        test_data = f.read()

    # noinspection PyUnusedLocal
    def __media_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_data
        return response

    port = 8080  # Port exposed on docker-compose.tests.yml
    media_path = '/test_media_file'
    pages = {
        media_path: {
            'callback': __media_callback,
        }
    }

    self.hs = HashServer(port=port, pages=pages)
    self.hs.start()

    # Using our hostname as it will be another container that will be connecting to us
    media_url = f'http://{socket.gethostname()}:{port}{media_path}'

    self.db.insert(table='story_enclosures', insert_hash={
        'stories_id': self.stories_id,
        'url': media_url,
        'mime_type': self.input_media_mime_type(),
        'length': len(test_data),
    })

    # Add a "podcast-fetch-episode" job
    JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue(stories_id=self.stories_id)

    total_time = int(self.retries_per_step() * self.seconds_between_retries())

    # Wait for "podcast-fetch-episode" to transcode the episode, upload it to Google Storage, and write it to
    # "podcast_episodes"
    episodes = None
    for x in range(1, self.retries_per_step() + 1):
        log.info(f"Waiting for episode to appear (#{x})...")

        episodes = self.db.select(table='podcast_episodes', what_to_select='*').hashes()
        if episodes:
            log.info("Episode is here!")
            break

        time.sleep(self.seconds_between_retries())

    assert episodes, f"Episode didn't show up in {total_time} seconds."

    # Wait for "podcast-submit-operation" to submit the Speech API operation
    self.transcript_fetches = None
    for x in range(1, self.retries_per_step() + 1):
        log.info(f"Waiting for transcript fetch to appear (#{x})...")

        self.transcript_fetches = self.db.select(
            table='podcast_episode_transcript_fetches',
            what_to_select='*',
        ).hashes()
        if self.transcript_fetches:
            log.info("Transcript fetch is here!")
            break

        time.sleep(self.seconds_between_retries())

    assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds."
    if topics_id is None:
        raise McExtractStoryLinksJobException("'topics_id' is None.")

    stories_id = int(stories_id)
    topics_id = int(topics_id)

    db = connect_to_db()

    log.info("Starting to extract links for stories_id %d, topics_id %d" % (stories_id, topics_id))

    try:
        extract_links_for_topic_story(db=db, stories_id=stories_id, topics_id=topics_id)
    except Exception as ex:
        log.error("Error while processing story {}: {}".format(stories_id, ex))
        raise McExtractStoryLinksJobException(
            "Unable to process story {}: {}".format(stories_id, traceback.format_exc())
        )

    log.info("Finished extracting links for stories_id %d, topics_id %d" % (stories_id, topics_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks')
    app.start_worker(handler=run_topics_extract_story_links)
    stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story %d with CLIFF: %s" % (stories_id, str(ex)))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotation')
    app.start_worker(handler=run_cliff_fetch_annotation)
def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None:
    JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript').add_to_queue(
        podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
    )
    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (stories_id, str(ex))
        )

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag').add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag')
    app.start_worker(handler=run_cliff_tags_from_annotation)
            db=db,
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )

        if stories_id:
            JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(f"Fatal / unknown error while fetching transcript "
                    f"for ID {podcast_episode_transcript_fetches_id}: {ex}")

    log.info(f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript')
    app.start_worker(handler=run_podcast_fetch_transcript)
    try:
        if not fetch_topic_url_update_state(db=db,
                                            topics_id=topics_id,
                                            topic_fetch_urls_id=topic_fetch_urls_id,
                                            domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("Sleeping after %d consecutive requeues..." % _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to reset the
        # "consecutive requeues" counter here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info(f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}")


if __name__ == '__main__':
    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_topics_fetch_link)
"""Generate word2vec model for a given snapshot.""" # MC_REWRITE_TO_PYTHON: remove after Python rewrite if isinstance(snapshots_id, bytes): snapshots_id = decode_object_from_bytes_if_needed(snapshots_id) if snapshots_id is None: raise McWord2vecGenerateSnapshotModelException( "'snapshots_id' is None.") snapshots_id = int(snapshots_id) db = connect_to_db() log.info("Generating word2vec model for snapshot %d..." % snapshots_id) sentence_iterator = SnapshotSentenceIterator(db=db, snapshots_id=snapshots_id) model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id) train_word2vec_model(sentence_iterator=sentence_iterator, model_store=model_store) log.info("Finished generating word2vec model for snapshot %d." % snapshots_id) if __name__ == '__main__': app = JobBroker( queue_name='MediaWords::Job::Word2vec::GenerateSnapshotModel') app.start_worker(handler=run_word2vec_generate_snapshot_model)
    # FIXME could be passed as an argument
    topics_id = db.query("""
        SELECT topics_id
        FROM timespans
        WHERE timespans_id = %(timespans_id)s
    """, {
        'timespans_id': timespans_id,
    }).flat()[0]

    log.info(f"Generating maps for topic {topics_id}, timespan {timespans_id}")

    generate_and_store_maps(
        db=db,
        topics_id=topics_id,
        timespans_id=timespans_id,
        memory_limit_mb=_memory_limit_mb,
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run topics map worker.")
    parser.add_argument("-m", "--memory_limit_mb", type=int, required=True,
                        help="Memory limit (MB) for Java subprocess")
    args = parser.parse_args()

    _memory_limit_mb = args.memory_limit_mb
    assert _memory_limit_mb, "Memory limit is not set (no idea what to set -Xmx to)."

    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_job)
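# Illustrative sketch (not part of the worker above) of how the required "--memory_limit_mb" flag is parsed before the
# worker starts; the script name in the comment below is hypothetical.
import argparse

_parser = argparse.ArgumentParser(description="Run topics map worker.")
_parser.add_argument("-m", "--memory_limit_mb", type=int, required=True, help="Memory limit (MB) for Java subprocess")

# e.g. launching the worker as: python3 topics_map_worker.py --memory_limit_mb 2048
_args = _parser.parse_args(["--memory_limit_mb", "2048"])
assert _args.memory_limit_mb == 2048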
    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFUpdateStoryTagsJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagger()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFUpdateStoryTagsJobException(
            "Unable to process story ID %s with CLIFF: %s" % (stories_id, str(ex))
        )

    # log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    # JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation').add_to_queue(stories_id=stories_id)

    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags')
    app.start_worker(handler=run_cliff_update_story_tags)
    if stories_id is None:
        raise McNYTLabelsFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsAnnotatorFetcher()
    try:
        nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsFetchAnnotationJobException(
            "Unable to process story %d with NYTLabels: %s" % (stories_id, str(ex))
        )

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation')
    app.start_worker(handler=run_nytlabels_fetch_annotation)
log = create_logger(__name__)


async def _start_workflow(stories_id: int) -> None:
    log.info(f"Starting a workflow for story {stories_id}...")

    client = workflow_client()
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(workflow_id=str(stories_id)),
    )

    # Fire and forget as the workflow will do everything (including adding an extraction job) itself
    await WorkflowClient.start(workflow.transcribe_episode, stories_id)

    log.info(f"Started a workflow for story {stories_id}.")


def run_podcast_fetch_episode(stories_id: int) -> None:
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    stories_id = int(stories_id)

    asyncio.run(_start_workflow(stories_id=stories_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode')
    app.start_worker(handler=run_podcast_fetch_episode)
log = create_logger(__name__)


class McFetchTwitterUrlsJobException(Exception):
    """Exceptions dealing with job setup and routing."""
    pass


def run_topics_fetch_twitter_urls(topic_fetch_urls_ids: list):
    """Fetch a set of Twitter URLs from the Twitter API and add each as a topic story if it matches.

    All of the interesting logic is in mediawords.tm.fetch_twitter_urls."""
    if topic_fetch_urls_ids is None:
        raise McFetchTwitterUrlsJobException("'topic_fetch_urls_ids' is None.")

    log.info("Starting to fetch Twitter URLs for %d topic_fetch_urls" % len(topic_fetch_urls_ids))

    db = connect_to_db()

    fetch_twitter_urls_update_state(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)

    log.info("Finished fetching Twitter URLs")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls')
    app.start_worker(handler=run_topics_fetch_twitter_urls)
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW() + INTERVAL %(add_to_queue_interval)s
            )
        """, {
            'podcast_episodes_id': episode.podcast_episodes_id,
            'add_to_queue_interval': add_to_queue_interval,
        })

    except McPodcastSubmitOperationSoftException as ex:
        # Soft exceptions
        log.error(f"Unable to submit podcast episode for story {stories_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}")

    log.info(f"Done submitting story {stories_id}'s podcast episode for transcription")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation')
    app.start_worker(handler=run_podcast_submit_operation)
    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsUpdateStoryTagsJobException("Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsTagger()
    try:
        nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsUpdateStoryTagsJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (stories_id, str(ex))
        )

    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags')
    app.start_worker(handler=run_nytlabels_update_story_tags)