Example #1
def process_extracted_story(db: DatabaseHandler, story: dict,
                            extractor_args: PyExtractorArguments) -> None:
    """Do post extraction story processing work by calling update_story_sentences_and_language()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    log.debug(
        "Updating sentences and language for story {}...".format(stories_id))
    update_story_sentences_and_language(db=db,
                                        story=story,
                                        extractor_args=extractor_args)

    # Extract -> CLIFF -> NYTLabels -> mark_as_processed() chain
    if story_is_english_and_has_sentences(db=db, stories_id=stories_id):
        # If CLIFF annotator is enabled, cliff/update_story_tags job will check whether NYTLabels annotator is enabled,
        # and if it is, will pass the story further to NYTLabels. NYTLabels, in turn, will mark the story as processed.
        log.debug(
            "Adding story {} to CLIFF annotation/tagging queue...".format(
                stories_id))
        JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag'
                  ).add_to_queue(stories_id=stories_id)

    else:
        log.debug(
            "Won't add {} to CLIFF annotation/tagging queue because it's not annotatable with CLIFF"
            .format(stories_id))

        if story_is_english_and_has_sentences(db=db, stories_id=stories_id):
            # If CLIFF annotator is disabled, pass the story to NYTLabels annotator which, if run, will mark the story
            # as processed
            log.debug(
                "Adding story {} to NYTLabels annotation/tagging queue...".
                format(stories_id))
            JobBroker(
                queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag'
            ).add_to_queue(stories_id=stories_id)

        else:
            log.debug(
                "Won't add {} to NYTLabels annotation/tagging queue because it's not annotatable with NYTLabels"
                .format(stories_id))

            # If neither of the annotators are enabled, mark the story as processed ourselves
            log.debug("Marking the story as processed...")
            if not mark_as_processed(db=db, stories_id=stories_id):
                raise McProcessExtractedStoryException(
                    "Unable to mark story ID {} as processed".format(
                        stories_id))
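

# A usage sketch (not from the original module): a story enters the
# Extract -> CLIFF -> NYTLabels -> mark_as_processed() chain described above by
# being queued for extraction; each worker then hands the story to the next stage.
# The queue name and add_to_queue() keyword match the other examples in this file;
# the import path and helper name are assumptions.
from mediawords.job import JobBroker  # assumed import path

def queue_story_for_processing_chain(stories_id: int) -> None:
    """Illustrative helper: enqueue a story so the chain above picks it up."""
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)
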
def run_extract_and_vector(stories_id: int,
                           use_cache: bool = False,
                           use_existing: bool = False) -> None:
    """Extract, vector and process a story."""

    global _consecutive_requeues

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    if not stories_id:
        raise McExtractAndVectorException("'stories_id' is not set.")

    db = connect_to_db()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McExtractAndVectorException(
            "Story with ID {} was not found.".format(stories_id))

    if medium_is_locked(db=db, media_id=story['media_id']):
        log.warning(
            "Requeueing job for story {} in locked medium {}...".format(
                stories_id, story['media_id']))
        _consecutive_requeues += 1

        # Prevent spamming these requeue events if the locked media source is the only one in the queue
        if _consecutive_requeues > _SLEEP_AFTER_REQUEUES:
            log.warning(
                "Story extraction job has been requeued more than {} times, waiting before requeueing..."
                .format(_consecutive_requeues))
            time.sleep(1)

        JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id)

        return

    _consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache,
                                              use_existing=use_existing)
        extract_and_process_story(db=db,
                                  story=story,
                                  extractor_args=extractor_args)

    except Exception as ex:
        db.rollback()
        raise McExtractAndVectorException(
            "Extractor died while extracting story {}: {}".format(
                stories_id, ex))

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))
def run_cliff_tags_from_annotation(stories_id: int) -> None:
    """Fetch story's CLIFF annotation and uses it to generate/store tags"""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFTagsFromAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag'
              ).add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)
def run_cliff_fetch_annotation(stories_id: int) -> None:
    """Fetch story's CLIFF annotation."""
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story $stories_id with CLIFF: %s" % str(ex))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)
Example #5
    def store_response(self, db: DatabaseHandler, download: dict,
                       response: Response) -> None:

        download = decode_object_from_bytes_if_needed(download)

        downloads_id = download['downloads_id']
        download_url = download['url']

        log.info(f"Handling download {downloads_id}...")
        log.debug(
            f"(URL of download {downloads_id} which is about to be handled: {download_url})"
        )

        if not response.is_success():
            log.info(
                f"Download {downloads_id} errored: {response.decoded_content()}"
            )
            self._store_failed_download_error_message(db=db,
                                                      download=download,
                                                      response=response)
            return

        supported_content_types_regex = re.compile(
            r'text|html|xml|rss|atom|application/json', flags=re.IGNORECASE)
        if re.search(supported_content_types_regex,
                     response.content_type() or ''):
            content = response.decoded_content()
        else:
            content = '(unsupported content type)'

        db.query(
            """
            UPDATE downloads
            SET url = %(download_url)s
            WHERE downloads_id = %(downloads_id)s
              AND url != %(download_url)s
        """, {
                'downloads_id': downloads_id,
                'download_url': download_url,
            })

        story_ids_to_extract = self.store_download(db=db,
                                                   download=download,
                                                   content=content)

        for stories_id in story_ids_to_extract:
            log.debug(
                f"Adding story {stories_id} for download {downloads_id} to extraction queue..."
            )
            JobBroker(
                queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
                    stories_id=stories_id)

        log.info(f"Handled download {downloads_id}...")
        log.debug(
            f"(URL of download {downloads_id} that was just handled: {download_url})"
        )
Example #6
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]:
        story_ids = super().add_stories_from_feed(db=db, download=download, content=content)

        # Add a podcast-transcribe-episode job for every newly added story
        for stories_id in story_ids:
            log.info(f"Adding a podcast episode transcribe job for story {stories_id}...")
            JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode').add_to_queue(stories_id=stories_id)

        return story_ids
def run_topics_fetch_link(topic_fetch_urls_id: int,
                          domain_timeout: Optional[int] = None) -> None:
    """Fetch a link for a topic and either match it to an existing story or generate a story from it.

    Almost all of the interesting functionality here happens in fetch_topic_url(). The code here just deals with
    routing, including requeueing responses throttled by mediawords.util.web.user_agent.throttled."""
    global _consecutive_requeues

    if isinstance(topic_fetch_urls_id, bytes):
        topic_fetch_urls_id = decode_object_from_bytes_if_needed(
            topic_fetch_urls_id)

    if topic_fetch_urls_id is None:
        raise McFetchLinkJobException("'topic_fetch_urls_id' is None.")

    topic_fetch_urls_id = int(topic_fetch_urls_id)

    db = connect_to_db()

    # FIXME topics_id could be passed as an argument
    topics_id = db.query(
        """
        SELECT topics_id
        FROM topic_fetch_urls
        WHERE topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
            'topic_fetch_urls_id': topic_fetch_urls_id
        }).flat()[0]

    log.info(
        f"Starting fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}"
    )

    try:
        if not fetch_topic_url_update_state(
                db=db,
                topics_id=topics_id,
                topic_fetch_urls_id=topic_fetch_urls_id,
                domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(
                topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive retries ..." %
                         _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to work out the
        # "consecutive retries" here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info(
        f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}"
    )
def add_us_media_to_sitemap_queue():
    us_media_ids = [
        104828, 1089, 1092, 1095, 1098, 1101, 1104, 1110, 1145, 1149, 1150, 14, 15, 1747, 1750, 1751, 1752, 1755, 18268,
        18710, 18775, 18839, 18840, 19334, 19643, 1, 22088, 25349, 25499, 27502, 2, 40944, 4415, 4419, 4442, 4, 6218,
        623382, 64866, 65, 6, 751082, 7, 8,
    ]
    us_media_ids = sorted(us_media_ids)
    for media_id in us_media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
Example #9
    async def add_to_extraction_queue(self, stories_id: int) -> None:

        log.info(f"Adding an extraction job for story {stories_id}...")

        job_broker = JobBroker(
            queue_name='MediaWords::Job::ExtractAndVector',
            rabbitmq_config=RabbitMQConfig(

                # Keep RabbitMQ's timeout smaller than the action's "start_to_close_timeout"
                timeout=60,

                # Disable retries as Temporal will be the one that does all the retrying
                retries=None,
            ),
        )

        # add_to_queue() is not idempotent but it's not a big deal to extract a single story twice
        job_broker.add_to_queue(stories_id=stories_id)

        log.info(f"Done adding an extraction job for story {stories_id}")
Example #10
def setup_test_index(db: DatabaseHandler) -> None:
    """
    Run a full Solr import based on the current PostgreSQL database.

    Due to a failsafe built into generate_and_import_data(), the deletion of collection data will fail if there are
    more than 100 million sentences in the index (to prevent accidental deletion of production data).
    """

    queue_all_stories(db)

    JobBroker(queue_name='MediaWords::Job::ImportSolrDataForTesting').run_remotely(full=True, throttle=False)
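
# Hedged usage sketch: a test might populate the Solr test index like this before
# running Solr-backed assertions. connect_to_db() is the same helper used by the
# other examples here; the wrapper function name is illustrative.
def _example_prepare_solr_index() -> None:
    db = connect_to_db()
    setup_test_index(db)
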
def add_all_media_to_sitemap_queue(db: DatabaseHandler):
    """Add all media IDs to XML sitemap fetching queue."""
    log.info("Fetching all media IDs...")
    media_ids = db.query("""
        SELECT media_id
        FROM media
        ORDER BY media_id
    """).flat()
    for media_id in media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
Example #12
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given a ap story return by get_new_stories(), add it to the database."""
    ap_medium = db.query(
        """
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, {
            'medium_name': AP_MEDIUM_NAME,
        }).hash()
    ap_feed = {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com'
    }
    ap_feed = db.find_or_create('feeds', ap_feed)

    story = {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id']
    }
    story = add_story(db, story, ap_feed['feeds_id'])

    if not story:
        return

    story_download = create_download_for_new_story(db, story, ap_feed)

    download_text = {
        'downloads_id': story_download['downloads_id'],
        'download_text': ap_story['text'],
        'download_text_length': len(ap_story['text'])
    }

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
        """, download_text)

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
Example #13
def _extract_story(story: dict) -> None:
    """Process the story through the extractor."""

    if url_has_binary_extension(story['url']):
        return

    if re2.search(r'livejournal.com\/(tag|profile)', story['url'], re2.I):
        return

    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').run_remotely(
        stories_id=story['stories_id'],
        use_cache=True,
        use_existing=True,
    )
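

# The extractor examples above use two JobBroker call styles. A side-by-side sketch;
# the exact semantics are hedged guesses from the surrounding code: add_to_queue()
# appears to enqueue and return immediately, while run_remotely() appears to invoke
# the job remotely (and presumably wait for it), as in _extract_story() above.
def _sketch_extractor_call_styles(stories_id: int) -> None:
    # Fire-and-forget enqueue, as used by most examples in this file
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id)

    # Alternative: remote invocation with the extractor arguments used by _extract_story()
    # JobBroker(queue_name='MediaWords::Job::ExtractAndVector').run_remotely(
    #     stories_id=stories_id, use_cache=True, use_existing=True)
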
def run_podcast_fetch_transcript(
        podcast_episode_transcript_fetches_id: int) -> None:
    """Fetch a completed episode transcripts from Speech API for story."""

    if isinstance(podcast_episode_transcript_fetches_id, bytes):
        podcast_episode_transcript_fetches_id = decode_object_from_bytes_if_needed(
            podcast_episode_transcript_fetches_id)
    podcast_episode_transcript_fetches_id = int(
        podcast_episode_transcript_fetches_id)

    if not podcast_episode_transcript_fetches_id:
        fatal_error("'podcast_episode_transcript_fetches_id' is unset.")

    db = connect_to_db()

    log.info(
        f"Fetching transcript for fetch ID {podcast_episode_transcript_fetches_id}..."
    )

    try:
        stories_id = fetch_store_transcript(
            db=db,
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )

        if stories_id:
            JobBroker(
                queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
                    stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(
            f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}"
        )
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error((f"Fatal / unknown error while fetching transcript "
                     f"for ID {podcast_episode_transcript_fetches_id}: {ex}"))

    log.info(
        f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}"
    )
def add_colombia_media_to_sitemap_queue():
    colombia_media_ids = [
        38871, 40941, 42072, 57482, 58360, 58430, 58660, 59058, 59589, 60338, 61607, 62209, 63889, 63921, 74622, 120254,
        127258, 211343, 277109, 280236, 281924, 282160, 282256, 282463, 282769, 283998, 297900, 324728, 325564, 325966,
        326385, 326782, 328053, 329452, 329735, 330235, 330576, 331318, 331987, 336326, 336339, 336682, 336993, 340969,
        341040, 347037, 347551, 348018, 348021, 348023, 348024, 348026, 348029, 348031, 348032, 348033, 348034, 348035,
        348037, 348038, 348040, 348041, 348043, 348044, 348048, 348049, 348050, 348052, 348054, 348058, 348060, 348061,
        348062, 348063, 348064, 348066, 348067, 348068, 348069, 348070, 348072, 348073, 348074, 348075, 348077, 348078,
        348079, 348081, 348083, 348084, 357882, 359251, 362163, 362287, 362386, 362587, 363868, 467798, 540413, 552466,
        552579, 558121, 559945, 563374, 565190, 565808, 567421, 651490, 651491, 651492, 651493, 651494, 655394, 655395,
        683226, 683288, 683554, 695708, 695709, 695710, 695711, 695712, 695713, 695715, 845114, 849762, 879769, 1180124,
        1195863, 1195913, 1207868, 1208757, 1265854,

        # Extra media sources not in collection
        326186, 855592, 879585, 851767,
    ]
    colombia_media_ids = sorted(colombia_media_ids)
    for media_id in colombia_media_ids:
        log.info("Adding media ID %d" % media_id)
        JobBroker(queue_name='MediaWords::Job::Sitemap::FetchMediaPages').add_to_queue(media_id=media_id)
Example #16
    def setUp(self) -> None:
        super().setUp()

        self.db = connect_to_db()

        test_medium = create_test_medium(db=self.db, label='test')
        test_feed = create_test_feed(db=self.db,
                                     label='test',
                                     medium=test_medium)

        # Add a story with a random ID to decrease the chance that object in GCS will collide with another test running
        # at the same time
        self.stories_id = random.randint(1, 2147483647 - 1)

        self.db.query(
            """
            INSERT INTO stories (
                stories_id,
                media_id,
                url,
                guid,
                title,
                description,
                publish_date,
                collect_date,
                full_text_rss
            ) VALUES (
                %(stories_id)s,
                %(media_id)s,
                'http://story.test/',
                'guid://story.test/',
                'story',
                'description',
                '2016-10-15 08:00:00',
                '2016-10-15 10:00:00',
                true
            )
        """, {
                'stories_id': self.stories_id,
                'media_id': test_feed['media_id'],
            })

        # Create missing partitions for "feeds_stories_map"
        self.db.query('SELECT create_missing_partitions()')

        self.db.create(table='feeds_stories_map',
                       insert_hash={
                           'feeds_id': int(test_feed['feeds_id']),
                           'stories_id': self.stories_id,
                       })

        assert os.path.isfile(self.input_media_path()), \
            f"Test media file '{self.input_media_path()}' should exist."

        with open(self.input_media_path(), mode='rb') as f:
            test_data = f.read()

        # noinspection PyUnusedLocal
        def __media_callback(request: HashServer.Request) -> Union[str, bytes]:
            response = "".encode('utf-8')
            response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
            response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode(
                'utf-8')
            response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8')
            response += "\r\n".encode('utf-8')
            response += test_data
            return response

        port = 8080  # Port exposed on docker-compose.tests.yml
        media_path = '/test_media_file'
        pages = {
            media_path: {
                'callback': __media_callback,
            }
        }

        self.hs = HashServer(port=port, pages=pages)
        self.hs.start()

        # Using our hostname as it will be another container that will be connecting to us
        media_url = f'http://{socket.gethostname()}:{port}{media_path}'

        self.db.insert(table='story_enclosures',
                       insert_hash={
                           'stories_id': self.stories_id,
                           'url': media_url,
                           'mime_type': self.input_media_mime_type(),
                           'length': len(test_data),
                       })

        # Add a "podcast-fetch-episode" job
        JobBroker(
            queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue(
                stories_id=self.stories_id)

        total_time = int(self.retries_per_step() *
                         self.seconds_between_retries())

        # Wait for "podcast-fetch-episode" to transcode, upload to Google Storage, and write it to "podcast_episodes"
        episodes = None
        for x in range(1, self.retries_per_step() + 1):
            log.info(f"Waiting for episode to appear (#{x})...")

            episodes = self.db.select(table='podcast_episodes',
                                      what_to_select='*').hashes()
            if episodes:
                log.info(f"Episode is here!")
                break

            time.sleep(self.seconds_between_retries())

        assert episodes, f"Episode didn't show up in {total_time} seconds."

        # Wait for "podcast-submit-operation" to submit Speech API operation
        self.transcript_fetches = None
        for x in range(1, self.retries_per_step() + 1):
            log.info(f"Waiting for transcript fetch to appear (#{x})...")

            self.transcript_fetches = self.db.select(
                table='podcast_episode_transcript_fetches',
                what_to_select='*').hashes()

            if self.transcript_fetches:
                log.info(f"Transcript fetch is here!")
                break

            time.sleep(self.seconds_between_retries())

        assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds."
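
    # The two polling loops above share the same pattern; a hedged refactoring sketch
    # (the helper name is illustrative and not part of the original test class):
    def _wait_for_rows(self, table: str) -> list:
        """Poll a table until it has rows or the configured retries are exhausted."""
        rows = []
        for attempt in range(1, self.retries_per_step() + 1):
            log.info(f"Waiting for rows in '{table}' (#{attempt})...")
            rows = self.db.select(table=table, what_to_select='*').hashes()
            if rows:
                break
            time.sleep(self.seconds_between_retries())
        return rows
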
Example #17
    if topics_id is None:
        raise McExtractStoryLinksJobException("'topics_id' is None.")

    stories_id = int(stories_id)
    topics_id = int(topics_id)

    db = connect_to_db()

    log.info("Start fetching extracting links for stories_id %d topics_id %d" %
             (stories_id, topics_id))

    try:
        extract_links_for_topic_story(db=db,
                                      stories_id=stories_id,
                                      topics_id=topics_id)

    except Exception as ex:
        log.error("Error while processing story {}: {}".format(stories_id, ex))
        raise McExtractStoryLinksJobException(
            "Unable to process story {}: {}".format(stories_id,
                                                    traceback.format_exc()))

    log.info(
        "Finished extracting links for stories_id %d, topics_id %d" %
        (stories_id, topics_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::ExtractStoryLinks')
    app.start_worker(handler=run_topics_extract_story_links)
        stories_id = decode_object_from_bytes_if_needed(stories_id)

    if stories_id is None:
        raise McCLIFFFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    cliff = CLIFFAnnotatorFetcher()
    try:
        cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFFetchAnnotationJobException("Unable to process story $stories_id with CLIFF: %s" % str(ex))

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotation')
    app.start_worker(handler=run_cliff_fetch_annotation)
    def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None:
        JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript').add_to_queue(
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )
    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFTagsFromAnnotationJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagsFromAnnotation()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFTagsFromAnnotationJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotationAndTag'
              ).add_to_queue(stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::FetchAnnotationAndTag')
    app.start_worker(handler=run_cliff_tags_from_annotation)
            db=db,
            podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id,
        )

        if stories_id:
            JobBroker(
                queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
                    stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(
            f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}"
        )
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error((f"Fatal / unknown error while fetching transcript "
                     f"for ID {podcast_episode_transcript_fetches_id}: {ex}"))

    log.info(
        f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript')
    app.start_worker(handler=run_podcast_fetch_transcript)
    try:
        if not fetch_topic_url_update_state(
                db=db,
                topics_id=topics_id,
                topic_fetch_urls_id=topic_fetch_urls_id,
                domain_timeout=domain_timeout):
            JobBroker(queue_name=QUEUE_NAME).add_to_queue(
                topic_fetch_urls_id=topic_fetch_urls_id)

            _consecutive_requeues += 1
            if _consecutive_requeues > REQUEUES_UNTIL_SLEEP:
                log.info("sleeping after %d consecutive retries ..." %
                         _consecutive_requeues)
                time.sleep(1)

    except Exception as ex:
        # Error has already been logged by fetch_topic_url_update_state(), so we only need to work out the
        # "consecutive retries" here
        log.error(f"Fetching URL for ID {topic_fetch_urls_id} failed: {ex}")
        _consecutive_requeues = 0

    log.info(
        f"Finished fetch for topic {topics_id}, topic_fetch_url {topic_fetch_urls_id}"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_topics_fetch_link)
Example #23
    """Generate word2vec model for a given snapshot."""

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    log.info("Generating word2vec model for snapshot %d..." % snapshots_id)

    sentence_iterator = SnapshotSentenceIterator(db=db,
                                                 snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)
    train_word2vec_model(sentence_iterator=sentence_iterator,
                         model_store=model_store)

    log.info("Finished generating word2vec model for snapshot %d." %
             snapshots_id)


if __name__ == '__main__':
    app = JobBroker(
        queue_name='MediaWords::Job::Word2vec::GenerateSnapshotModel')
    app.start_worker(handler=run_word2vec_generate_snapshot_model)
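

# Hedged enqueue sketch for this worker: the queue name comes from the registration
# block above, and the 'snapshots_id' keyword mirrors the handler's argument (both
# the keyword and the helper name are assumptions, not from the original module).
def _queue_word2vec_model_generation(snapshots_id: int) -> None:
    JobBroker(queue_name='MediaWords::Job::Word2vec::GenerateSnapshotModel').add_to_queue(
        snapshots_id=snapshots_id)
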
Example #24
        # FIXME could be passed as an argument
        topics_id = db.query("""
            SELECT topics_id
            FROM timespans
            WHERE timespans_id = %(timespans_id)s
        """, {
            'timespans_id': timespans_id,
        }).flat()[0]

        log.info(f"Generating maps for topic {topics_id}, timespan {timespans_id}")
        generate_and_store_maps(
            db=db,
            topics_id=topics_id,
            timespans_id=timespans_id,
            memory_limit_mb=_memory_limit_mb,
        )


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run topics map worker.")
    parser.add_argument("-m", "--memory_limit_mb", type=int, required=True,
                        help="Memory limit (MB) for Java subprocess")
    args = parser.parse_args()

    _memory_limit_mb = args.memory_limit_mb
    assert _memory_limit_mb, "Memory limit is not set (no idea what to set -Xmx to)."

    app = JobBroker(queue_name=QUEUE_NAME)
    app.start_worker(handler=run_job)
    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McCLIFFUpdateStoryTagsJobException(
            "Story with ID %d was not found." % stories_id)

    cliff = CLIFFTagger()
    try:
        cliff.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McCLIFFUpdateStoryTagsJobException(
            "Unable to process story ID %s with CLIFF: %s" % (
                stories_id,
                str(ex),
            ))

    # log.info("Adding story ID %d to NYTLabels fetch queue..." % stories_id)
    # JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation').add_to_queue(stories_id=stories_id)
    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::CLIFF::UpdateStoryTags')
    app.start_worker(handler=run_cliff_update_story_tags)
Example #26
    if stories_id is None:
        raise McNYTLabelsFetchAnnotationJobException("'stories_id' is None.")

    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Fetching annotation for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsFetchAnnotationJobException("Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsAnnotatorFetcher()
    try:
        nytlabels.annotate_and_store_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsFetchAnnotationJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (stories_id, str(ex))
        )

    log.info("Adding story ID %d to the update story tags queue..." % stories_id)
    JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags').add_to_queue(stories_id=stories_id)

    log.info("Finished fetching annotation for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::FetchAnnotation')
    app.start_worker(handler=run_nytlabels_fetch_annotation)
Example #27
log = create_logger(__name__)


async def _start_workflow(stories_id: int) -> None:
    log.info(f"Starting a workflow for story {stories_id}...")

    client = workflow_client()
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(workflow_id=str(stories_id)),
    )

    # Fire and forget as the workflow will do everything (including adding an extraction job) itself
    await WorkflowClient.start(workflow.transcribe_episode, stories_id)

    log.info(f"Started a workflow for story {stories_id}...")


def run_podcast_fetch_episode(stories_id: int) -> None:
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    asyncio.run(_start_workflow(stories_id=stories_id))


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode')
    app.start_worker(handler=run_podcast_fetch_episode)
Example #28
log = create_logger(__name__)


class McFetchTwitterUrlsJobException(Exception):
    """Exceptions dealing with job setup and routing."""
    pass


def run_topics_fetch_twitter_urls(topic_fetch_urls_ids: list):
    """Fetch a set of twitter urls from the twitter api and add each as a topic story if it matches.

    All of the interesting logic is in mediawords.tm.fetch_twitter_urls."""
    if topic_fetch_urls_ids is None:
        raise McFetchTwitterUrlsJobException("'topic_fetch_urls_ids' is None.")

    log.info("Start fetch twitter urls for %d topic_fetch_urls" %
             len(topic_fetch_urls_ids))

    db = connect_to_db()

    fetch_twitter_urls_update_state(db=db,
                                    topic_fetch_urls_ids=topic_fetch_urls_ids)

    log.info("Finished fetching twitter urls")


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls')
    app.start_worker(handler=run_topics_fetch_twitter_urls)
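

# Hedged enqueue sketch for this worker: the queue name comes from the registration
# block above, and the 'topic_fetch_urls_ids' keyword mirrors the handler's argument
# (the keyword and helper name are assumptions, not from the original module).
def _queue_twitter_url_fetches(topic_fetch_urls_ids: list) -> None:
    JobBroker(queue_name='MediaWords::Job::TM::FetchTwitterUrls').add_to_queue(
        topic_fetch_urls_ids=topic_fetch_urls_ids)
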
Example #29
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW() + INTERVAL %(add_to_queue_interval)s
            )
        """, {
                'podcast_episodes_id': episode.podcast_episodes_id,
                'add_to_queue_interval': add_to_queue_interval,
            })

    except McPodcastSubmitOperationSoftException as ex:
        # Soft exceptions
        log.error(
            f"Unable to submit podcast episode for story {stories_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(
            f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}"
        )

    log.info(
        f"Done submitting story's {stories_id} podcast episode for transcription"
    )


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation')
    app.start_worker(handler=run_podcast_submit_operation)
Example #30
    stories_id = int(stories_id)

    db = connect_to_db()

    log.info("Updating tags for story ID %d..." % stories_id)

    story = db.find_by_id(table='stories', object_id=stories_id)
    if story is None:
        raise McNYTLabelsUpdateStoryTagsJobException(
            "Story with ID %d was not found." % stories_id)

    nytlabels = NYTLabelsTagger()
    try:
        nytlabels.update_tags_for_story(db=db, stories_id=stories_id)
    except Exception as ex:
        raise McNYTLabelsUpdateStoryTagsJobException(
            "Unable to process story ID %d with NYTLabels: %s" % (
                stories_id,
                str(ex),
            ))

    log.info("Marking story ID %d as processed..." % stories_id)
    mark_as_processed(db=db, stories_id=stories_id)

    log.info("Finished updating tags for story ID %d" % stories_id)


if __name__ == '__main__':
    app = JobBroker(queue_name='MediaWords::Job::NYTLabels::UpdateStoryTags')
    app.start_worker(handler=run_nytlabels_update_story_tags)