Exemple #1
0
def connect_to_db() -> DatabaseHandler:
    """
    Connect to PostgreSQL, retrying on failure.

    Connection parameters and the retry policy are both read from CommonConfig.database(). At most
    max_attempts() connection attempts are made, sleeping sleep_between_attempts() seconds after every failed
    attempt except the last one. When the last attempt fails too, the error is handed to fatal_error() instead
    of being raised.

    :return: Database handler created by the first successful attempt.
    """

    db_config = CommonConfig.database()
    retries_config = db_config.retries()

    # Read the limit once so that the check, the loop and the "last attempt?" test all agree
    max_attempts = retries_config.max_attempts()

    # Zero attempts would skip the loop entirely and return None, so zero is rejected together with negatives
    assert max_attempts > 0, "max_attempts must be positive."

    db = None

    for attempt in range(1, max_attempts + 1):

        try:

            log.debug("Connecting to PostgreSQL...")

            db = DatabaseHandler(
                host=db_config.hostname(),
                port=db_config.port(),
                username=db_config.username(),
                password=db_config.password(),
                database=db_config.database_name(),
            )
            if not db:
                raise ValueError("Returned value is None.")

            # Connected, so stop retrying; the handler is returned after the loop
            break

        except Exception as ex:

            # Built with an f-string (and not "%d") so that an unexpected port type can't raise from within the
            # exception handler and skip the remaining retries
            error_message = (
                f"Unable to connect to {db_config.username()}@{db_config.hostname()}:{db_config.port()}"
                f"/{db_config.database_name()}: {ex}"
            )

            log.error(error_message)

            if attempt < max_attempts:
                sleep_between_attempts = retries_config.sleep_between_attempts()
                log.info(f"Will retry for #{attempt} time in {sleep_between_attempts} seconds...")
                time.sleep(sleep_between_attempts)

            else:
                log.info("Out of retries, giving up and exiting...")

                # Don't throw any exceptions because they might be caught by
                # the try-catch block, and so the caller will just assume that
                # there was something wrong with the input data and proceed
                # with processing next item in the job queue (e.g. the next
                # story). Instead, just quit and wait for someone to restart
                # the whole app that requires database access.
                fatal_error(error_message)

    return db
Exemple #2
0
def run_podcast_submit_operation(stories_id: int) -> None:
    """
    Submit a story's podcast episode to the Speech API and schedule fetching of its transcript.

    The Speech API operation ID gets stored in "podcast_episodes", and a row is added to
    "podcast_episode_transcript_fetches" with the time at which the transcript fetch should be queued.

    :param stories_id: ID of the story whose podcast episode should be submitted (may arrive as bytes).
    :raises McPodcastSubmitOperationSoftException: on soft errors; the exception is logged and re-raised. Any
        other exception is passed to fatal_error().
    """

    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    db = connect_to_db()

    log.info(
        f"Submitting story's {stories_id} podcast episode for transcription..."
    )

    try:
        episode = get_podcast_episode(db=db, stories_id=stories_id)
        speech_operation_id = submit_transcribe_operation(episode=episode)

        db.query(
            """
            UPDATE podcast_episodes
            SET speech_operation_id = %(speech_operation_id)s
            WHERE podcast_episodes_id = %(podcast_episodes_id)s
        """, {
                'podcast_episodes_id': episode.podcast_episodes_id,
                'speech_operation_id': speech_operation_id,
            })

        # The constant is a multiplier, so the delay has to scale with the episode's duration; adding it would
        # only push the fetch past the duration by a constant amount.
        # NOTE(review): the constant is defined outside of this function -- confirm that it is a unitless factor
        add_to_queue_interval = f"{int(episode.duration * ADD_TO_QUEUE_AT_DURATION_MULTIPLIER)} seconds"
        db.query(
            """
            INSERT INTO podcast_episode_transcript_fetches (
                podcast_episodes_id,
                add_to_queue_at
            ) VALUES (
                %(podcast_episodes_id)s,
                NOW() + INTERVAL %(add_to_queue_interval)s
            )
        """, {
                'podcast_episodes_id': episode.podcast_episodes_id,
                'add_to_queue_interval': add_to_queue_interval,
            })

    except McPodcastSubmitOperationSoftException as ex:
        # Soft exceptions: log and let the caller deal with this particular story
        log.error(
            f"Unable to submit podcast episode for story {stories_id}: {ex}")
        raise

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(
            f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}"
        )

    log.info(
        f"Done submitting story's {stories_id} podcast episode for transcription"
    )
Exemple #3
0
    def fetch_annotation_for_story(self, db: DatabaseHandler, stories_id: int) -> Union[dict, list, None]:
        """
        Load the story's stored annotation from the key-value store.

        The stored bytes are decoded as UTF-8, parsed as JSON and passed through _preprocess_stored_annotation().

        :param db: Database handler.
        :param stories_id: Story ID (may arrive as bytes).
        :return: Preprocessed annotation, or None if the story is not annotated.
        :raises McJSONAnnotatorException: if nothing was stored for an annotated story, or the stored JSON
            can't be parsed. Preprocessing failures go to fatal_error() instead.
        """

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        if not self.story_is_annotated(db=db, stories_id=stories_id):
            log.warning("Story %d is not annotated." % stories_id)
            return None

        stored_content = self.__postgresql_store.fetch_content(db=db, object_id=stories_id)
        if stored_content is None:
            raise McJSONAnnotatorException("Fetched annotation is undefined or empty for story %d." % stories_id)

        json_string = stored_content.decode('utf-8')

        # Parsing problems are reported per story, together with the offending JSON
        try:
            annotation = decode_json(json_string)
            if annotation is None:
                raise McJSONAnnotatorException("Annotation is None after decoding from JSON.")
        except Exception as ex:
            parse_error_details = (stories_id, str(ex), json_string,)
            raise McJSONAnnotatorException(
                "Unable to parse annotation JSON for story %d: %s\nString JSON: %s" % parse_error_details
            )

        # Preprocessing problems are handed to fatal_error() and not raised
        try:
            annotation = self._preprocess_stored_annotation(annotation)
            if annotation is None:
                raise McJSONAnnotatorException("Annotation is None after preprocessing.")
        except Exception as ex:
            preprocess_error_details = (stories_id, str(ex), json_string,)
            fatal_error(
                "Unable to preprocess stored annotation for story %d: %s\nString JSON: %s" % preprocess_error_details
            )

        return annotation
def run_job() -> None:
    """Start two background "sleep" processes, wait a second, then fail the worker through fatal_error()."""

    # The children exist only so that one can see whether they get killed properly; references are kept
    # around until the worker fails
    # noinspection PyUnusedLocal
    background_processes = [subprocess.Popen(["sleep", "30"]) for _ in range(2)]

    # Give the children a moment to fire up
    time.sleep(1)

    fatal_error("Failing worker.")
Exemple #5
0
    def __init__(self):
        """Set up the PostgreSQL key-value store that annotator results are read from / written to."""

        table_name = self._postgresql_raw_annotations_table()
        if not (table_name is not None and len(table_name) > 0):
            fatal_error("Annotator's key-value store table name is not set.")

        # GZIP unless the class is configured to use BZIP2
        compression = KeyValueStore.Compression.BZIP2 if self.__USE_BZIP else KeyValueStore.Compression.GZIP

        self.__postgresql_store = PostgreSQLStore(
            table=table_name,
            compression_method=compression,
        )

        log.debug("Will read / write annotator results to PostgreSQL table: %s" % table_name)
def run_facebook_fetch_story_stats(stories_id: int) -> None:
    """
    Fetch and store Facebook stats for a single story taken from the job queue.

    Soft failures (and a missing story) are raised so that only this job fails; configuration problems,
    database connection errors, hard failures and unknown exceptions are all passed to fatal_error().

    :param stories_id: ID of the story to fetch the stats for.
    """

    if not stories_id:
        fatal_error("'stories_id' is not set.")
    stories_id = int(stories_id)

    if not FacebookConfig.is_enabled():
        fatal_error("Facebook API processing is not enabled.")

    db = None
    try:
        db = connect_to_db()
    except Exception as ex:
        # Without a database every subsequent job from RabbitMQ would fail as well, so it's better to die here
        # and wait to be (auto)restarted
        fatal_error(f"Unable to connect to PostgreSQL: {ex}")

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        # A nonexistent story only concerns this one job, so failing the job is enough
        raise Exception(f"Story with ID {stories_id} does not exist.")

    log.info(f"Fetching story stats for story {stories_id}...")

    try:
        get_and_store_story_stats(db=db, story=story)

    except McFacebookSoftFailureException as ex:
        # Soft error: other stories in the queue are still expected to succeed, so pass the exception on
        log.error(f"Error while fetching stats for story {stories_id}: {ex}")
        raise ex

    except McFacebookHardFailureException as ex:
        # Hard error: most likely a developer has to have a look before the worker can continue
        fatal_error(f"Fatal error while fetching stats for story {stories_id}: {ex}")

    except Exception as ex:
        # Anything else shouldn't have been thrown at all, so treat it the same way as a hard error
        fatal_error(f"Unknown exception while fetching stats for story {stories_id}: {ex}")

    log.info(f"Done fetching story stats for story {stories_id}.")
def run_podcast_fetch_transcript(podcast_episode_transcript_fetches_id: int) -> None:
    """
    Fetch a finished episode transcript from the Speech API and queue the story for extraction.

    :param podcast_episode_transcript_fetches_id: Transcript fetch ID (may arrive as bytes).
    :raises McPodcastFetchTranscriptSoftException: on soft errors; the exception is logged and re-raised. Any
        other exception is passed to fatal_error().
    """

    fetch_id = podcast_episode_transcript_fetches_id
    if isinstance(fetch_id, bytes):
        fetch_id = decode_object_from_bytes_if_needed(fetch_id)
    fetch_id = int(fetch_id)

    if not fetch_id:
        fatal_error("'podcast_episode_transcript_fetches_id' is unset.")

    db = connect_to_db()

    log.info(f"Fetching transcript for fetch ID {fetch_id}...")

    try:
        stories_id = fetch_store_transcript(db=db, podcast_episode_transcript_fetches_id=fetch_id)

        # Only a truthy story ID gets passed on to the extractor queue
        if stories_id:
            extract_queue = JobBroker(queue_name='MediaWords::Job::ExtractAndVector')
            extract_queue.add_to_queue(stories_id=stories_id)

    except McPodcastFetchTranscriptSoftException as ex:
        # Soft exceptions
        log.error(f"Unable to fetch transcript for fetch ID {fetch_id}: {ex}")
        raise ex

    except Exception as ex:
        # Hard and other exceptions
        fatal_error(f"Fatal / unknown error while fetching transcript for ID {fetch_id}: {ex}")

    log.info(f"Done fetching transcript for ID {fetch_id}")
Exemple #8
0
    def annotate_and_store_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
        """
        Annotate the story's concatenated sentences and save the JSON result to the key-value store.

        A story that is already annotated gets annotated again (with a warning); a story that is not annotatable
        is skipped (with a warning).

        :param db: Database handler.
        :param stories_id: Story ID (may arrive as bytes).
        :raises McJSONAnnotatorException: if sentences can't be fetched or the annotation comes back as None.
            Encoding and storing failures go to fatal_error() instead.
        """

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        if self.story_is_annotated(db=db, stories_id=stories_id):
            log.warning("Story %d is already annotated, so I will overwrite it." % stories_id)

        if not self.story_is_annotatable(db=db, stories_id=stories_id):
            log.warning("Story %d is not annotatable." % stories_id)
            return

        sentences_result = db.query("""
            SELECT story_sentences_id, sentence_number, sentence
            FROM story_sentences
            WHERE stories_id = %(stories_id)s
            ORDER BY sentence_number
        """, {'stories_id': stories_id})
        sentence_rows = sentences_result.hashes()

        if sentence_rows is None:
            raise McJSONAnnotatorException("Unable to fetch story sentences for story %s." % stories_id)

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(sentence_rows, dict):
            sentence_rows = [sentence_rows]

        log.info("Annotating story's %d concatenated sentences..." % stories_id)

        concatenated_text = ' '.join([row['sentence'] for row in sentence_rows])
        annotation = self.__annotate_text(concatenated_text)
        if annotation is None:
            raise McJSONAnnotatorException(
                "Unable to annotate story sentences concatenation for story %d." % stories_id
            )

        encoded_annotation = None
        try:
            encoded_annotation = encode_json(annotation)
            if encoded_annotation is None:
                raise McJSONAnnotatorException("JSON annotation is None for annotation %s." % str(annotation))
        except Exception as ex:
            fatal_error("Unable to encode annotation to JSON: %s\nAnnotation: %s" % (str(ex), str(annotation)))

        log.info("Done annotating story's %d concatenated sentences." % stories_id)

        log.debug("JSON length: %d" % len(encoded_annotation))

        log.info("Storing annotation results for story %d..." % stories_id)
        try:
            self.__postgresql_store.store_content(
                db=db,
                object_id=stories_id,
                content=encoded_annotation.encode('utf-8'),
            )
        except Exception as ex:
            fatal_error("Unable to store annotation result: %s\nJSON annotation: %s" % (str(ex), encoded_annotation))
        log.info("Done storing annotation results for story %d." % stories_id)
Exemple #9
0
    def store_annotation_for_story(self, db: DatabaseHandler, stories_id: int,
                                   annotation: Union[dict, list, None]) -> None:
        """
        Encode the annotation as JSON and save it to the key-value store under the story's ID.

        Both encoding and storing failures are passed to fatal_error().

        :param db: Database handler.
        :param stories_id: Story ID (may arrive as bytes).
        :param annotation: Annotation to store.
        """

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        annotation = decode_object_from_bytes_if_needed(annotation)

        encoded_annotation = None
        try:
            encoded_annotation = encode_json(annotation)
            if encoded_annotation is None:
                raise McJSONAnnotationStoreException("JSON annotation is None for annotation %s." % str(annotation))
        except Exception as ex:
            fatal_error("Unable to encode annotation to JSON: %s\nAnnotation: %s" % (str(ex), str(annotation)))

        log.debug("JSON length: %d" % len(encoded_annotation))

        log.info("Storing annotation results for story %d..." % stories_id)
        try:
            content = encoded_annotation.encode('utf-8')
            self.__postgresql_store.store_content(db=db, object_id=stories_id, content=content)
        except Exception as ex:
            fatal_error("Unable to store annotation result: %s\nJSON annotation: %s" % (str(ex), encoded_annotation))
        log.info("Done storing annotation results for story %d." % stories_id)
#!/usr/bin/env python3

from mediawords.job import JobBroker
from mediawords.util.process import fatal_error

from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue


class JobBrokerFetchTranscriptQueue(AbstractFetchTranscriptQueue):
    """Fetch transcript queue that passes every job on to the job broker."""

    def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None:
        """Add a fetch transcript job for the given transcript fetch ID to the job broker's queue."""
        broker = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript')
        broker.add_to_queue(podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id)


if __name__ == '__main__':
    # Poll for due operations, adding each of them to the job broker's fetch transcript queue
    try:
        poll_for_due_operations(fetch_transcript_queue=JobBrokerFetchTranscriptQueue())
    except Exception as ex:
        # There are no soft errors here, so whatever gets raised is either a hard or an unknown error
        fatal_error(f"Unable to poll for due operations: {ex}")
Exemple #11
0
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """
        Fetch JSON annotation for text, decode it into dictionary / list.

        Problems that are specific to the text being annotated are raised as McJSONAnnotationFetcherException so
        that the caller can fail just this one item; everything else (programming errors, unreachable or
        misbehaving annotator service, invalid responses) is passed to fatal_error().

        :param text: Text to annotate (may arrive as bytes); gets stripped and, if the class sets a positive
            text length limit, truncated to that limit.
        :return: Decoded annotation for which _fetched_annotation_is_valid() returned a truthy value.
        :raises McJSONAnnotationFetcherException: if the text is empty (also after stripping), the request
            timed out (HTTP 408), the annotator replied with HTTP 500, or the response body was empty.
        """

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if len(text) == 0:
            # Whitespace-only text is empty as far as the annotator is concerned, so don't send it
            raise McJSONAnnotationFetcherException("Text is empty.")

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        url = request.url()
        uri = furl(url)

        # Check the raw values before converting them: str(None) is the truthy string 'None', and int(None)
        # raises TypeError, so converting first would defeat both checks
        assert uri.host, f"URL hostname is not set for URL {url}"
        assert uri.port, f"API URL port is not set for URL {url}"
        hostname = str(uri.host)
        port = int(uri.port)

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=url,
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
Exemple #12
0
    def update_tags_for_story(self, db: DatabaseHandler,
                              stories_id: int) -> None:
        """
        Add version, country and story tags for story.

        Fetches the story's annotation from the annotation store, derives tags from it with
        _tags_for_annotation(), and then, between db.begin() and db.commit():

        1. deletes the story's "stories_tags_map" rows for tags that belong to any of the tag sets named by the
           derived tags;
        2. for every derived tag, finds or creates the tag set and the tag, and maps the story to the tag.

        :param db: Database handler.
        :param stories_id: Story ID (may arrive as bytes).
        :raises McJSONAnnotationTaggerException: if the fetched annotation or the derived tags are None. A
            failure within _tags_for_annotation() goes to fatal_error() instead.
        """

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.__annotation_store.fetch_annotation_for_story(
            db=db, stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotationTaggerException(
                "Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (
                stories_id,
                str(ex),
            ))

        if tags is None:
            raise McJSONAnnotationTaggerException(
                "Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (
            stories_id,
            str(tags),
        ))

        # NOTE(review): nothing in this method rolls back if one of the queries below raises -- confirm that
        # the caller (or the handler itself) takes care of that
        db.begin()

        # Collect the distinct (cleaned up) tag set names so that old tags can be removed with a single query
        unique_tag_sets_names = set()
        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            unique_tag_sets_names.add(tag_sets_name)

        # Delete old tags the story might have under a given tag set
        db.query(
            """
            DELETE FROM stories_tags_map
            WHERE stories_id = %(stories_id)s
              AND tags_id IN (
                SELECT tags_id
                FROM tags
                WHERE tag_sets_id IN (
                  SELECT tag_sets_id
                  FROM tag_sets
                  WHERE name = ANY(%(tag_sets_names)s)
                )
              )
        """, {
                'stories_id': stories_id,
                'tag_sets_names': list(unique_tag_sets_names)
            })

        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Find or create a tag set
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={
                                       'name': tag_sets_name
                                   }).hash()
            if db_tag_set is None:
                # The INSERT doesn't return the row (and does nothing on conflict), so the tag set is selected
                # again afterwards to get hold of its ID
                db.query(
                    """
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                        'name': tag_sets_name,
                        'label': tag.tag_sets_label,
                        'description': tag.tag_sets_description
                    })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash={
                                           'name': tag_sets_name
                                       }).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Find or create tag
            db_tag = db.select(table='tags',
                               what_to_select='*',
                               condition_hash={
                                   'tag_sets_id': tag_sets_id,
                                   'tag': tags_name,
                               }).hash()
            if db_tag is None:
                # Same insert-then-select-again approach as for the tag set above
                db.query(
                    """
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                        'tag_sets_id': tag_sets_id,
                        'tag': tags_name,
                        'label': tag.tags_label,
                        'description': tag.tags_description,
                    })
                db_tag = db.select(table='tags',
                                   what_to_select='*',
                                   condition_hash={
                                       'tag_sets_id': tag_sets_id,
                                       'tag': tags_name,
                                   }).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            #
            # (partitioned table's INSERT trigger will take care of conflicts)
            #
            # Not using db.create() because it tests last_inserted_id, and on duplicates there would be no such
            # "last_inserted_id" set.
            db.query(
                """
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
            """, {
                    'stories_id': stories_id,
                    'tags_id': tags_id,
                })

        db.commit()
Exemple #13
0
    def update_tags_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
        """
        Add version, country and story tags for story.

        Fetches the story's annotation with fetch_annotation_for_story(), derives tags from it with
        _tags_for_annotation(), and then, between db.begin() and db.commit():

        1. deletes the story's "stories_tags_map" rows for tags that belong to any of the tag sets named by the
           derived tags;
        2. for every derived tag, finds or creates the tag set and the tag, and maps the story to the tag.

        :param db: Database handler.
        :param stories_id: Story ID (may arrive as bytes).
        :raises McJSONAnnotatorException: if the fetched annotation or the derived tags are None. A disabled
            annotator and a failure within _tags_for_annotation() go to fatal_error() instead.
        """

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.fetch_annotation_for_story(db=db, stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotatorException("Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (stories_id, str(ex),))

        if tags is None:
            raise McJSONAnnotatorException("Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (stories_id, str(tags),))

        # NOTE(review): nothing in this method rolls back if one of the queries below raises -- confirm that
        # the caller (or the handler itself) takes care of that
        db.begin()

        # Collect the distinct (cleaned up) tag set names so that old tags can be removed with a single query
        unique_tag_sets_names = set()
        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
            unique_tag_sets_names.add(tag_sets_name)

        # Delete old tags the story might have under a given tag set
        db.query("""
            DELETE FROM stories_tags_map
            WHERE stories_id = %(stories_id)s
              AND tags_id IN (
                SELECT tags_id
                FROM tags
                WHERE tag_sets_id IN (
                  SELECT tag_sets_id
                  FROM tag_sets
                  WHERE name = ANY(%(tag_sets_names)s)
                )
              )
        """, {'stories_id': stories_id, 'tag_sets_names': list(unique_tag_sets_names)})

        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
            tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Find or create a tag set
            db_tag_set = db.select(table='tag_sets', what_to_select='*', condition_hash={'name': tag_sets_name}).hash()
            if db_tag_set is None:
                # The INSERT doesn't return the row (and does nothing on conflict), so the tag set is selected
                # again afterwards to get hold of its ID
                db.query("""
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                    'name': tag_sets_name,
                    'label': tag.tag_sets_label,
                    'description': tag.tag_sets_description
                })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash={'name': tag_sets_name}).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Find or create tag
            db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }).hash()
            if db_tag is None:
                # Same insert-then-select-again approach as for the tag set above
                db.query("""
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                    'label': tag.tags_label,
                    'description': tag.tags_description,
                })
                db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                }).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            # (partitioned table's INSERT trigger will take care of conflicts)
            #
            # db.create() can't be used here because:
            #
            # 1) Master table for partitioned table might not have a primary key itself, only the partitions do --
            #    FIXME maybe master tables should have primary keys? Or let's wait for when we move to PostgreSQL 10+.
            #
            # 2) Partitioned table's INSERT trigger doesn't return last_inserted_id which db.create() requires
            #    FIXME there might be a way for it to return the inserted row
            #
            db.query("""
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
            """, {
                'stories_id': stories_id,
                'tags_id': tags_id,
            })

        db.commit()
Exemple #14
0
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """
        Fetch JSON annotation for text, decode it into dictionary / list.

        The text is trimmed and, if the class' text length limit is positive, truncated to that limit before being
        posted to the annotator service. Conditions that point to a programming error or a broken / unreachable
        service are reported through fatal_error() (which is used here to crash the whole application instead of
        raising); conditions specific to this one text are raised as exceptions so that the caller can handle them.

        :param text: Text to annotate (passed through decode_object_from_bytes_if_needed() first).
        :return: Decoded JSON annotation as returned by the annotator service.
        :raise McJSONAnnotationFetcherException: if the text is empty after trimming, the request ended with HTTP 408
            (request timeout), the service answered with HTTP 500, or the response body is empty.
        """

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        # Trim the text *before* testing for emptiness because that's what the annotator will do too; that way
        # whitespace-only text fails early without making a request to the annotator at all
        text = text.strip()

        if not text:
            # Empty text might legitimately happen with some stories, so raise (instead of crashing the application)
            # and let the caller deal with this particular story
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit "
                    f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        # NOTE(review): presumably lifts the response size limit -- confirm against UserAgent
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())

        # Validate the raw values before converting them: str(None) is the truthy string "None" and int(None) raises
        # TypeError, so checking the converted values would never trigger these assertions
        assert uri.host, f"URL hostname is not set for URL {request.url()}"
        assert uri.port, f"API URL port is not set for URL {request.url()}"
        hostname = str(uri.host)
        port = int(uri.port)

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            # Only client-side errors get retried; successes and errors generated by the server are handled below
            if response.is_success() or not response.error_is_client_side():
                break

            log.error(
                f"Request failed on the client side: {response.decoded_content()}"
            )

            # No point in sleeping after the last attempt as there won't be another one
            if retry < retries:
                time.sleep(sleep_between_retries)

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code in (HTTPStatus.METHOD_NOT_ALLOWED.value, HTTPStatus.BAD_REQUEST.value):
                    # Not POST, empty POST
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Crash the application on responses that we don't know how to handle
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        # Let the subclass-provided validator decide whether the decoded annotation is usable
        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
Exemple #15
0
    def update_tags_for_story(self, db: DatabaseHandler,
                              stories_id: int) -> None:
        """
        Add version, country and story tags for story.

        Fetches the story's annotation, derives tags from it, and then, within a single transaction, removes the
        story's existing tags in every tag set that the new tags belong to and assigns the new tags to the story,
        creating missing tag sets / tags along the way.

        :param db: Database handler to run the queries with.
        :param stories_id: ID of the story to update the tags for.
        :raise McJSONAnnotatorException: if the annotation can't be fetched or the returned tags are None.
        """

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.fetch_annotation_for_story(db=db,
                                                     stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotatorException(
                "Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (
                stories_id,
                str(ex),
            ))

        if tags is None:
            raise McJSONAnnotatorException(
                "Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (
            stories_id,
            str(tags),
        ))

        # Normalize tag set / tag names once, before the transaction gets started, so that both the cleanup and the
        # insertion below work on exactly the same values
        normalized_tags = [
            (
                tag,
                self.__strip_linebreaks_and_whitespace(tag.tag_sets_name),
                self.__strip_linebreaks_and_whitespace(tag.tags_name),
            )
            for tag in tags
        ]

        # Distinct tag set names in order of first appearance; many tags usually share a tag set, and running the
        # very same DELETE once per tag would be wasted work
        unique_tag_sets_names = list(dict.fromkeys(
            tag_sets_name for _, tag_sets_name, _ in normalized_tags
        ))

        db.begin()

        # Delete old tags the story might have under a given tag set
        for tag_sets_name in unique_tag_sets_names:
            db.query(
                """
                DELETE FROM stories_tags_map
                    USING tags, tag_sets
                WHERE stories_tags_map.tags_id = tags.tags_id
                  AND tags.tag_sets_id = tag_sets.tag_sets_id
                  AND stories_tags_map.stories_id = %(stories_id)s
                  AND tag_sets.name = %(tag_sets_name)s
            """, {
                    'stories_id': stories_id,
                    'tag_sets_name': tag_sets_name
                })

        for tag, tag_sets_name, tags_name in normalized_tags:

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Find or create tag set
            tag_set_condition = {'name': tag_sets_name}
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash=tag_set_condition).hash()
            if db_tag_set is None:
                # ON CONFLICT makes the INSERT a no-op if the row has appeared in the meantime, which is why the row
                # gets re-read afterwards instead of relying on the INSERT's result
                db.query(
                    """
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                        'name': tag_sets_name,
                        'label': tag.tag_sets_label,
                        'description': tag.tag_sets_description
                    })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash=tag_set_condition).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Find or create tag
            tag_condition = {
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }
            db_tag = db.select(table='tags',
                               what_to_select='*',
                               condition_hash=tag_condition).hash()
            if db_tag is None:
                db.query(
                    """
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                        'tag_sets_id': tag_sets_id,
                        'tag': tags_name,
                        'label': tag.tags_label,
                        'description': tag.tags_description,
                    })
                db_tag = db.select(table='tags',
                                   what_to_select='*',
                                   condition_hash=tag_condition).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            db.query(
                """
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
                ON CONFLICT (stories_id, tags_id) DO NOTHING
            """, {
                    'stories_id': stories_id,
                    'tags_id': tags_id,
                })

        db.commit()
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    The HTML is posted as JSON to the extractor API, with the request being retried up to EXTRACT_RETRIES times.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    :raise McExtractArticleFromPageException: if none of the extraction attempts succeeded.
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Limit the time that a single extraction request is allowed to take
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)

    # Validate the raw values before converting them: str(None) is the truthy string "None" and int(None) raises
    # TypeError, so checking the converted values would never trigger these assertions
    assert api_uri.host, f"API URL hostname is not set for URL {api_url}"
    assert api_uri.port, f"API URL port is not set for URL {api_url}"
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break

        log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        # With a non-positive EXTRACT_RETRIES no request is ever made, so there might be no response to quote
        if http_response is None:
            last_error = "no extraction attempts were made"
        else:
            last_error = http_response.decoded_content()

        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {last_error}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response