"Start fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id)) try: db = connect_to_db() story = db.require_by_id(table='stories', object_id=stories_id) topic = db.require_by_id(table='topics', object_id=topics_id) mediawords.tm.extract_story_links.extract_links_for_topic_story( db, story, topic) except Exception as ex: log.error("Error while processing story {}: {}".format( stories_id, ex)) raise McExtractStoryLinksJobException( "Unable to process story {}: {}".format( stories_id, traceback.format_exc())) log.info( "Finished fetching extracting links for stories_id %d topics_id %d" % (stories_id, topics_id)) @classmethod def queue_name(cls) -> str: """Set queue name.""" return 'MediaWords::Job::TM::ExtractStoryLinks' if __name__ == '__main__': app = JobBrokerApp(job_class=ExtractStoryLinksJob) app.start_worker()
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)
        if snapshots_id is None:
            raise McWord2vecGenerateSnapshotModelException("'snapshots_id' is None.")

        snapshots_id = int(snapshots_id)

        db = connect_to_db()

        log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

        sentence_iterator = SnapshotSentenceIterator(db=db, snapshots_id=snapshots_id)
        model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)

        train_word2vec_model(sentence_iterator=sentence_iterator, model_store=model_store)

        log.info("Finished generating word2vec model for snapshot %d." % snapshots_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::Word2vec::GenerateSnapshotModel'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=Word2vecGenerateSnapshotModelJob)
    app.start_worker()
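# Manual-run sketch (assumptions: a reachable Media Cloud PostgreSQL instance
# and an existing snapshot; the ID 12345 is made up): the model can also be
# trained outside the broker by calling run_job() directly, e.g. from a test:
#
#   Word2vecGenerateSnapshotModelJob.run_job(snapshots_id=12345)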
        try:
            mediawords.tm.fetch_twitter_urls.fetch_twitter_urls(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
        except Exception as ex:
            log.error("Error while fetching URLs with IDs {}: {}".format(topic_fetch_urls_ids, str(ex)))
            db.query(
                """
                update topic_fetch_urls
                set state = %(a)s,
                    message = %(b)s,
                    fetch_date = now()
                where topic_fetch_urls_id = any(%(c)s)
                """,
                {
                    'a': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids,
                })

        db.disconnect()

        log.info("Finished fetching Twitter URLs.")

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::FetchTwitterUrls'


if __name__ == '__main__':
    try:
        app = JobBrokerApp(job_class=FetchTwitterUrlsJob)
        app.start_worker()
    except BaseException as e:
        print(str(e))
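# Enqueueing sketch (an assumption for illustration): unlike the single-URL
# FetchLink worker below, this job takes a *batch* of IDs, which is why the
# error handler marks them all in one UPDATE with "= any(%(c)s)". A producer
# would presumably enqueue it as:
#
#   FetchTwitterUrlsJob.add_to_queue(topic_fetch_urls_ids=[1, 2, 3])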
            time.sleep(1)

        except Exception as ex:
            # All non-throttled errors should get caught by the try block
            # above, but catch again here just in case.
            log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_id, str(ex)))

            cls._consecutive_requeues = 0

            update = {
                'state': mediawords.tm.fetch_link.FETCH_STATE_PYTHON_ERROR,
                'fetch_date': datetime.datetime.now(),
                'message': traceback.format_exc(),
            }
            db.update_by_id('topic_fetch_urls', topic_fetch_urls_id, update)

        db.disconnect()

        log.info("Finished fetch for topic_fetch_url %d" % topic_fetch_urls_id)

    @classmethod
    def queue_name(cls) -> str:
        """Set queue name."""
        return 'MediaWords::Job::TM::FetchLink'


if __name__ == '__main__':
    try:
        app = JobBrokerApp(job_class=FetchLinkJob)
        app.start_worker()
    except BaseException as e:
        print(str(e))
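# Enqueueing sketch (an assumption for illustration): this worker processes a
# single topic_fetch_urls row per job, so the matching producer call would be:
#
#   FetchLinkJob.add_to_queue(topic_fetch_urls_id=topic_fetch_urls_id)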
log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McNYTLabelsUpdateStoryTagsJobException( "Story with ID %d was not found." % stories_id) nytlabels = NYTLabelsAnnotator() try: nytlabels.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McNYTLabelsUpdateStoryTagsJobException( "Unable to process story ID %d with NYTLabels: %s" % ( stories_id, str(ex), )) log.info("Marking story ID %d as processed..." % stories_id) mark_as_processed(db=db, stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) @classmethod def queue_name(cls) -> str: return 'MediaWords::Job::NYTLabels::UpdateStoryTags' if __name__ == '__main__': app = JobBrokerApp(job_class=NYTLabelsUpdateStoryTagsJob) app.start_worker()
        db = connect_to_db()

        log.info("Fetching annotation for story ID %d..." % stories_id)

        story = db.find_by_id(table='stories', object_id=stories_id)
        if story is None:
            raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

        cliff = CLIFFAnnotator()
        try:
            cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
        except Exception as ex:
            raise McCLIFFFetchAnnotationJobException(
                "Unable to process story ID %d with CLIFF: %s" % (stories_id, str(ex),)
            )

        log.info("Adding story ID %d to the update story tags queue..." % stories_id)
        CLIFFUpdateStoryTagsJob.add_to_queue(stories_id=stories_id)

        log.info("Finished fetching annotation for story ID %d" % stories_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::CLIFF::FetchAnnotation'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=CLIFFFetchAnnotationJob)
    app.start_worker()
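# Enqueueing sketch (an assumption for illustration): this worker is the entry
# point of the annotation chain, so after a story is extracted, something like
# the following would kick the whole chain off:
#
#   CLIFFFetchAnnotationJob.add_to_queue(stories_id=stories_id)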
        ExtractAndVectorJob._consecutive_requeues = 0

        log.info("Extracting story {}...".format(stories_id))

        db.begin()

        try:
            extractor_args = PyExtractorArguments(use_cache=use_cache)
            extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
        except Exception as ex:
            raise McExtractAndVectorException(
                "Extractor died while extracting story {}: {}".format(stories_id, ex)
            )

        db.commit()

        log.info("Done extracting story {}.".format(stories_id))

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::ExtractAndVector'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=ExtractAndVectorJob)
    app.start_worker()
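# Enqueueing sketch (an assumption for illustration): extraction runs inside a
# single db.begin()/db.commit() pair above, and the raise happens before the
# commit, so a story that fails mid-extraction never gets its transaction
# committed. A producer would presumably enqueue stories as:
#
#   ExtractAndVectorJob.add_to_queue(stories_id=stories_id)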
class FetchMediaPages(AbstractJob):
    """
    Fetch all of a media source's pages (news stories and otherwise) from its XML sitemap.

    Start this worker script by running:

        ./script/run_in_env.sh ./mediacloud/mediawords/job/sitemap/fetch_media_pages.py
    """

    @classmethod
    def run_job(cls, media_id: int) -> None:
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)
        media_id = int(media_id)

        db = connect_to_db()
        fetch_sitemap_pages_for_media_id(db=db, media_id=media_id)

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::Sitemap::FetchMediaPages'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=FetchMediaPages)
    app.start_worker()
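# Enqueueing sketch (an assumption for illustration): besides running the
# worker script as documented above, individual media sources can presumably
# be queued for a sitemap fetch with:
#
#   FetchMediaPages.add_to_queue(media_id=media_id)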
    Start this worker script by running:

        ./script/run_in_env.sh ./mediacloud/mediawords/job/similarweb/update_audience_data.py
    """

    @classmethod
    def run_job(cls, media_id: int) -> None:
        if isinstance(media_id, bytes):
            media_id = decode_object_from_bytes_if_needed(media_id)
        media_id = int(media_id)

        db = connect_to_db()
        similarweb_client = get_similarweb_client()

        log.info("Collecting audience data for media ID {}...".format(media_id))
        update(db, media_id, similarweb_client)
        log.info("Finished collecting audience data for media ID {}".format(media_id))

    @classmethod
    def queue_name(cls) -> str:
        return 'MediaWords::Job::SimilarWeb::UpdateAudienceData'


if __name__ == '__main__':
    app = JobBrokerApp(job_class=SimilarWebUpdateAudienceDataJob)
    app.start_worker()
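# Enqueueing sketch (an assumption for illustration): as with the sitemap
# worker above, a scheduler would presumably refresh a source's audience data
# by queueing its media ID:
#
#   SimilarWebUpdateAudienceDataJob.add_to_queue(media_id=media_id)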
log.info("Fetching annotation for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McNYTLabelsFetchAnnotationJobException( "Story with ID %d was not found." % stories_id) nytlabels = NYTLabelsAnnotator() try: nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McNYTLabelsFetchAnnotationJobException( "Unable to process story $stories_id with NYTLabels: %s" % str(ex)) log.info("Adding story ID %d to the update story tags queue..." % stories_id) NYTLabelsUpdateStoryTagsJob.add_to_queue(stories_id=stories_id) log.info("Finished fetching annotation for story ID %d" % stories_id) @classmethod def queue_name(cls) -> str: return 'MediaWords::Job::NYTLabels::FetchAnnotation' if __name__ == '__main__': app = JobBrokerApp(job_class=NYTLabelsFetchAnnotationJob) app.start_worker()
log.info("Updating tags for story ID %d..." % stories_id) story = db.find_by_id(table='stories', object_id=stories_id) if story is None: raise McCLIFFUpdateStoryTagsJobException( "Story with ID %d was not found." % stories_id) cliff = CLIFFAnnotator() try: cliff.update_tags_for_story(db=db, stories_id=stories_id) except Exception as ex: raise McCLIFFUpdateStoryTagsJobException( "Unable to process story ID %s with CLIFF: %s" % ( stories_id, str(ex), )) log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id) NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id) log.info("Finished updating tags for story ID %d" % stories_id) @classmethod def queue_name(cls) -> str: return 'MediaWords::Job::CLIFF::UpdateStoryTags' if __name__ == '__main__': app = JobBrokerApp(job_class=CLIFFUpdateStoryTagsJob) app.start_worker()