Example #1
    def test_extract(self) -> None:
        """Test extract()."""
        db = self.db()

        html = '<script>ignore</script><p>foo</p>'
        mediawords.dbi.downloads.store_content(db, self.test_download, html)
        result = mediawords.dbi.downloads.extract(db=db,
                                                  download=self.test_download)

        assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
        assert result['extracted_text'].strip() == 'foo.'

        mediawords.dbi.downloads.store_content(db, self.test_download, html)
        mediawords.dbi.downloads.extract(
            db=db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        mediawords.dbi.downloads.store_content(db, self.test_download, 'bar')
        result = mediawords.dbi.downloads.extract(
            db=db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        assert result['extracted_html'].strip() == '<body id="readabilityBody"><p>foo</p></body>'
        assert result['extracted_text'].strip() == 'foo.'
Example #2
def extract(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments = PyExtractorArguments()) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - PyExtractorArguments; use_cache() controls getting and setting results in the extractor cache

    Returns:
    see extract_content() below

    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if extractor_args.use_cache():
        log.debug("Fetching cached extractor results for download {}...".format(downloads_id))
        results = _get_extractor_results_cache(db, download)
        if results is not None:
            return results

    log.debug("Fetching content for download {}...".format(downloads_id))
    content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(len(content), downloads_id))
    results = extract_content(content)
    log.debug(
        "Done extracting {} characters of content for download {}.".format(len(content), downloads_id))

    if extractor_args.use_cache():
        log.debug("Caching extractor results for download {}...".format(downloads_id))
        _set_extractor_results_cache(db, download, results)

    return results
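
The use_cache flag makes extract() a read-through/write-through cache around extract_content(): check the cache first, compute on a miss, then store the result. A minimal, self-contained sketch of that control flow, with a plain dict and a trivial stand-in extractor (both hypothetical) in place of the _get_extractor_results_cache()/_set_extractor_results_cache() helpers:

_results_cache = {}  # hypothetical stand-in: downloads_id -> cached results


def cached_extract(downloads_id: int, content: str, use_cache: bool = False) -> dict:
    if use_cache:
        cached = _results_cache.get(downloads_id)
        if cached is not None:
            return cached  # cache hit: skip fetching and extracting entirely

    # Stand-in for extract_content(); the real extractor also returns
    # 'extracted_html' alongside 'extracted_text'.
    results = {'extracted_text': content.strip()}

    if use_cache:
        _results_cache[downloads_id] = results  # write-through on a miss

    return results


first = cached_extract(1, ' foo ', use_cache=True)
# A later call sees the cached result even if the content changed,
# which is exactly the staleness the test in Example #1 asserts.
assert cached_extract(1, 'changed content', use_cache=True) is first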
Example #3
def extract_and_create_download_text(
        db: DatabaseHandler, download: dict,
        extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extraction_result = extract(db=db,
                                download=download,
                                extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    download_text = None
    if extractor_args.use_existing():
        log.debug(
            "Fetching download text for download {}...".format(downloads_id))
        download_text = db.query("""
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {'downloads_id': downloads_id}).hash()

    if download_text is None:
        log.debug(
            "Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db,
                               download=download,
                               extract=extraction_result)

    return download_text
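
With use_existing() set, extract_and_create_download_text() behaves as a get-or-create: reuse the stored download_texts row when one exists, otherwise create it from the fresh extraction. A self-contained sketch of that shape, with a dict standing in for the download_texts table (all names here are illustrative):

_download_texts = {}  # hypothetical stand-in for the download_texts table


def get_or_create_download_text(downloads_id: int, extracted_text: str, use_existing: bool = False) -> dict:
    download_text = None
    if use_existing:
        download_text = _download_texts.get(downloads_id)  # reuse if present

    if download_text is None:
        download_text = {'downloads_id': downloads_id, 'download_text': extracted_text}
        _download_texts[downloads_id] = download_text  # create on miss

    return download_text


created = get_or_create_download_text(1, 'foo.', use_existing=True)
assert get_or_create_download_text(1, 'ignored', use_existing=True) is created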
Example #4
def get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted html for the story.

    We don't store the extracted HTML of a story, so we have to get the first download associated with the story
    and run the extractor on it.

    """
    download = db.query(
        """
        with d as (
            select * from downloads
                where
                    stories_id = %(a)s and
                    type = 'content' and
                    state = 'success'
        ) -- goofy cte to avoid bad query plan

        select * from d order by downloads_id limit 1
        """, {
            'a': story['stories_id']
        }).hash()

    extractor_results = mediawords.dbi.downloads.extract(
        db, download, PyExtractorArguments(use_cache=True))
    return extractor_results['extracted_html']
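
The query above picks the story's earliest successful content download by downloads_id. The same selection expressed in pure Python, with a list of dicts standing in for the downloads table (the helper name is hypothetical):

def first_content_download(downloads: list) -> dict:
    """Pick the earliest successful 'content' download, as the SQL above does."""
    candidates = [
        d for d in downloads
        if d['type'] == 'content' and d['state'] == 'success'
    ]
    return min(candidates, key=lambda d: d['downloads_id'])


rows = [
    {'downloads_id': 2, 'type': 'content', 'state': 'success'},
    {'downloads_id': 1, 'type': 'content', 'state': 'success'},
    {'downloads_id': 3, 'type': 'feed', 'state': 'success'},
]
assert first_content_download(rows)['downloads_id'] == 1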
Example #5
def extract(
    db: DatabaseHandler,
    download: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - PyExtractorArguments; use_cache() controls getting and setting results in the extractor cache

    Returns:
    see extract_content() below

    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    if extractor_args.use_cache():
        log.debug(
            "Fetching cached extractor results for download {}...".format(
                downloads_id))
        results = _get_extractor_results_cache(db, download)
        if results is not None:
            return results

    log.debug("Fetching content for download {}...".format(downloads_id))
    content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(
        len(content), downloads_id))
    results = extract_content(content)
    log.debug(
        "Done extracting {} characters of content for download {}.".format(
            len(content), downloads_id))

    if extractor_args.use_cache():
        log.debug("Caching extractor results for download {}...".format(
            downloads_id))
        _set_extractor_results_cache(db, download, results)

    return results
Example #6
    def test_extract_and_create_download_text(self):
        download_text = mediawords.dbi.downloads.extract_and_create_download_text(
            db=self.db(),
            download=self.test_download,
            extractor_args=PyExtractorArguments(),
        )

        assert download_text
        assert download_text['download_text'] == 'foo.'
        assert download_text['downloads_id'] == self.test_download['downloads_id']
Example #7
    def run_job(cls, stories_id: int, use_cache: bool = False) -> None:

        # MC_REWRITE_TO_PYTHON: remove after Python rewrite
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)
        stories_id = int(stories_id)

        if not stories_id:
            raise McExtractAndVectorException("'stories_id' is not set.")

        db = connect_to_db()

        story = db.find_by_id(table='stories', object_id=stories_id)
        if not story:
            raise McExtractAndVectorException(
                "Story with ID {} was not found.".format(stories_id))

        if medium_is_locked(db=db, media_id=story['media_id']):
            log.warning(
                "Requeueing job for story {} in locked medium {}...".format(
                    stories_id, story['media_id']))
            ExtractAndVectorJob._consecutive_requeues += 1

            # Prevent spamming these requeue events if the locked media source is the only one in the queue
            if ExtractAndVectorJob._consecutive_requeues > ExtractAndVectorJob._SLEEP_AFTER_REQUEUES:
                log.warning(
                    "Story extraction job has been requeued more than {} times, waiting before requeueing..."
                    .format(ExtractAndVectorJob._consecutive_requeues))
                time.sleep(1)

            ExtractAndVectorJob.add_to_queue(stories_id=stories_id)

            return

        ExtractAndVectorJob._consecutive_requeues = 0

        log.info("Extracting story {}...".format(stories_id))

        db.begin()

        try:
            extractor_args = PyExtractorArguments(use_cache=use_cache)
            extract_and_process_story(db=db,
                                      story=story,
                                      extractor_args=extractor_args)

        except Exception as ex:
            raise McExtractAndVectorException(
                "Extractor died while extracting story {}: {}".format(
                    stories_id, ex)) from ex

        db.commit()

        log.info("Done extracting story {}.".format(stories_id))
Example #8
def _extract_story(db: DatabaseHandler, story: dict) -> None:
    """Process the story through the extractor."""

    if url_has_binary_extension(story['url']):
        return

    if re2.search(r'livejournal\.com/(tag|profile)', story['url'], re2.I):
        return

    extractor_args = PyExtractorArguments(use_cache=True, use_existing=True, no_dedup_sentences=False)
    mediawords.dbi.stories.stories.extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
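
The LiveJournal filter above skips tag and profile pages. A quick smoke test of the (escaped) pattern using the standard-library re module instead of re2, which matches the same strings for this simple expression:

import re

_LIVEJOURNAL_SKIP = re.compile(r'livejournal\.com/(tag|profile)', re.I)

assert _LIVEJOURNAL_SKIP.search('https://user.livejournal.com/profile')
assert _LIVEJOURNAL_SKIP.search('https://www.livejournal.com/tag/news')
assert not _LIVEJOURNAL_SKIP.search('https://user.livejournal.com/12345.html')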
Example #9
def extract_and_process_story(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""

    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
          AND state = 'success'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(
            download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db,
                                         download=download,
                                         extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
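
extract_and_process_story() only opens and commits a transaction when the caller hasn't already started one, so it composes cleanly with callers that manage their own transactions. A minimal sketch of that guard with a toy handle (ToyDB is hypothetical; the real DatabaseHandler exposes the same in_transaction()/begin()/commit() calls):

class ToyDB:
    def __init__(self) -> None:
        self._in_transaction = False

    def in_transaction(self) -> bool:
        return self._in_transaction

    def begin(self) -> None:
        self._in_transaction = True

    def commit(self) -> None:
        self._in_transaction = False


def do_work(db: ToyDB) -> None:
    # Only own the transaction if nobody upstream does.
    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    pass  # the actual per-story work would happen here

    if use_transaction:
        db.commit()


db = ToyDB()
do_work(db)  # opens and commits its own transaction
assert not db.in_transaction()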
Example #10
def process_download_for_extractor(
    db: DatabaseHandler,
    download: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id,
                                         download['url']))

    extract_and_create_download_text(db=db,
                                     download=download,
                                     extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db,
                                story=story,
                                extractor_args=extractor_args)
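
The remaining-download check is what lets the last finished download trigger story-level processing. The same predicate in pure Python, with a list of dicts standing in for the story's rows in the downloads table:

def all_downloads_extracted(downloads: list) -> bool:
    """True once no 'content' download for the story is left unextracted."""
    return not any(
        d['type'] == 'content' and not d['extracted']
        for d in downloads
    )


assert all_downloads_extracted([{'type': 'content', 'extracted': True}])
assert not all_downloads_extracted([{'type': 'content', 'extracted': False}])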
Example #11
def process_extracted_story(db: DatabaseHandler, story: dict, extractor_args: PyExtractorArguments) -> None:
    """Do post extraction story processing work by calling update_story_sentences_and_language()."""
    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    log.debug("Updating sentences and language for story {}...".format(stories_id))
    update_story_sentences_and_language(db=db, story=story, extractor_args=extractor_args)

    if not extractor_args.no_tag_extractor_version():
        log.debug("Updating extractor version tag for story {}...".format(stories_id))
        update_extractor_version_tag(db=db, story=story)

    # Extract -> CLIFF -> NYTLabels -> mark_as_processed() chain
    cliff = CLIFFAnnotator()
    if cliff.annotator_is_enabled() and cliff.story_is_annotatable(db=db, stories_id=stories_id):
        # If CLIFF annotator is enabled, cliff/update_story_tags job will check whether NYTLabels annotator is enabled,
        # and if it is, will pass the story further to NYTLabels. NYTLabels, in turn, will mark the story as processed.
        log.debug("Adding story {} to CLIFF annotation queue...".format(stories_id))
        CLIFFFetchAnnotationJob.add_to_queue(stories_id=stories_id)

    else:
        log.debug("Won't add {} to CLIFF annotation queue because it's not annotatable with CLIFF".format(stories_id))

        nytlabels = NYTLabelsAnnotator()
        if nytlabels.annotator_is_enabled() and nytlabels.story_is_annotatable(db=db, stories_id=stories_id):
            # If CLIFF annotator is disabled, pass the story to NYTLabels annotator which, if run, will mark the story
            # as processed
            log.debug("Adding story {} to NYTLabels annotation queue...".format(stories_id))
            NYTLabelsFetchAnnotationJob.add_to_queue(stories_id=stories_id)

        else:
            log.debug("Won't add {} to NYTLabels annotation queue because it's not annotatable with NYTLabels".format(
                stories_id
            ))

            # If neither of the annotators are enabled, mark the story as processed ourselves
            log.debug("Marking the story as processed...")
            if not mark_as_processed(db=db, stories_id=stories_id):
                raise McProcessExtractedStoryException("Unable to mark story ID {} as processed".format(stories_id))
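
The dispatch above is a three-way fallback: queue for CLIFF if it's enabled and the story is annotatable, else queue for NYTLabels, else mark the story as processed directly. A compact sketch of just that decision, with plain callables standing in for the annotators and job queues (all hypothetical):

from typing import Callable


def dispatch_post_extraction(
        cliff_applies: bool,
        nytlabels_applies: bool,
        queue_cliff: Callable[[], None],
        queue_nytlabels: Callable[[], None],
        mark_processed: Callable[[], None],
) -> None:
    if cliff_applies:
        queue_cliff()  # CLIFF's follow-up job hands the story on to NYTLabels
    elif nytlabels_applies:
        queue_nytlabels()  # NYTLabels marks the story as processed when done
    else:
        mark_processed()  # no annotators enabled: mark it processed ourselves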
Example #12
def extract_and_create_download_text(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extraction_result = extract(db=db, download=download, extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    download_text = None
    if extractor_args.use_existing():
        log.debug("Fetching download text for download {}...".format(downloads_id))
        download_text = db.query("""
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {'downloads_id': downloads_id}).hash()

    if download_text is None:
        log.debug("Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db, download=download, extract=extraction_result)

    return download_text
Example #13
def update_story_sentences_and_language(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating new
    ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).
    """

    story = decode_object_from_bytes_if_needed(story)

    use_transaction = not db.in_transaction()

    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
        if not story_text:
            story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text,
                                               story_lang=story_lang)

    stored_language = story.get('language', None)
    if (not stored_language) or stored_language != story_lang:
        db.query("""
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
        """, {'stories_id': stories_id, 'story_lang': story_lang})
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException(
            "Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))
        # Commit the transaction we opened above instead of leaving it dangling.
        if use_transaction:
            db.commit()
        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()
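
The language update above only writes to the stories table when the stored language is missing or differs from the newly detected one. That comparison as a tiny pure function (the name is illustrative), with the edge cases spelled out:

def needs_language_update(stored_language, detected_language) -> bool:
    """True when the stories row should be rewritten with the detected language."""
    return (not stored_language) or stored_language != detected_language


assert needs_language_update(None, 'en')      # nothing stored yet
assert needs_language_update('en', 'fr')      # detection disagrees
assert not needs_language_update('en', 'en')  # already correct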