コード例 #1
0
def extract(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments = PyExtractorArguments()) -> dict:
    """Extract the content for the given download.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - extraction arguments; if extractor_args.use_cache() is true,
                     get and set results in the extractor cache

    Returns:
    extraction results dict as returned by extract_content() below

    """
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    # Short-circuit with cached results so the same download is not re-extracted.
    if extractor_args.use_cache():
        log.debug("Fetching cached extractor results for download {}...".format(downloads_id))
        results = _get_extractor_results_cache(db, download)
        if results is not None:
            return results

    log.debug("Fetching content for download {}...".format(downloads_id))
    content = fetch_content(db, download)

    log.debug("Extracting {} characters of content for download {}...".format(len(content), downloads_id))
    results = extract_content(content)
    log.debug(
        "Done extracting {} characters of content for download {}.".format(len(content), downloads_id))

    # Store the fresh results so subsequent calls with use_cache() hit the cache.
    if extractor_args.use_cache():
        log.debug("Caching extractor results for download {}...".format(downloads_id))
        _set_extractor_results_cache(db, download, results)

    return results
コード例 #2
0
def extract_and_create_download_text(db: DatabaseHandler, download: dict, extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extract_result = extract(db=db, download=download, extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    # Reuse an existing download_texts row when the caller allows it.
    download_text = None
    if extractor_args.use_existing():
        log.debug("Fetching download text for download {}...".format(downloads_id))
        download_text = db.query("""
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {'downloads_id': downloads_id}).hash()

    # No reusable row found (or reuse disabled) -- create one from the extraction.
    if download_text is None:
        log.debug("Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db, download=download, extract=extract_result)

    assert 'stories_id' in download, "It is expected to have 'stories_id' set for a download at this point."

    if not extractor_args.no_tag_extractor_version():
        log.debug("Updating extractor version tag for story {}...".format(download['stories_id']))
        update_extractor_version_tag(
            db=db,
            stories_id=download['stories_id'],
            extractor_version=extract_result['extractor_version'],
        )

    return download_text
コード例 #3
0
    def test_extract(self) -> None:
        """Test extract()."""

        expected_html = '<body id="readabilityBody"><p>foo</p></body>'

        html = '<script>ignore</script><p>foo</p>'
        store_content(self.db, self.test_download, html)
        result = extract(db=self.db, download=self.test_download)

        assert result['extracted_html'].strip() == expected_html
        assert result['extracted_text'].strip() == 'foo.'

        # Extract with caching enabled, then overwrite the stored content;
        # the cached extraction should still be returned afterwards.
        store_content(self.db, self.test_download, html)
        extract(
            db=self.db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        store_content(self.db, self.test_download, 'bar')
        result = extract(
            db=self.db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(use_cache=True),
        )
        assert result['extracted_html'].strip() == expected_html
        assert result['extracted_text'].strip() == 'foo.'
コード例 #4
0
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story.

    Arguments:
    db - db handle
    download - download dict from db
    extractor_args - extraction arguments, passed through to the extractor
    """

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # hash() returns None when no row matches; a non-empty dict means more
    # unextracted 'content' downloads remain for this story.  Plain truthiness
    # covers both cases (replaces the old None -> {} "Perlism" dance).
    if has_remaining_download:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
コード例 #5
0
def run_extract_and_vector(stories_id: int,
                           use_cache: bool = False,
                           use_existing: bool = False) -> None:
    """Extract, vector and process a story.

    Arguments:
    stories_id - ID of the story to extract
    use_cache - get and set results in the extractor cache
    use_existing - reuse existing download_texts instead of re-creating them

    Raises McExtractAndVectorException if stories_id is unset, the story does
    not exist, or extraction fails.
    """

    global _consecutive_requeues

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)

    if not stories_id:
        raise McExtractAndVectorException("'stories_id' is not set.")

    db = connect_to_db()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McExtractAndVectorException(
            "Story with ID {} was not found.".format(stories_id))

    if medium_is_locked(db=db, media_id=story['media_id']):
        log.warning(
            "Requeueing job for story {} in locked medium {}...".format(
                stories_id, story['media_id']))
        _consecutive_requeues += 1

        # Prevent spamming these requeue events if the locked media source is the only one in the queue
        if _consecutive_requeues > _SLEEP_AFTER_REQUEUES:
            log.warning(
                "Story extraction job has been requeued more than {} times, waiting before requeueing..."
                .format(_consecutive_requeues))
            time.sleep(1)

        JobBroker(queue_name=QUEUE_NAME).add_to_queue(stories_id=stories_id)

        return

    _consecutive_requeues = 0

    log.info("Extracting story {}...".format(stories_id))

    db.begin()

    try:
        extractor_args = PyExtractorArguments(use_cache=use_cache,
                                              use_existing=use_existing)
        extract_and_process_story(db=db,
                                  story=story,
                                  extractor_args=extractor_args)

    except Exception as ex:
        # BUGFIX: roll back the transaction opened above before re-raising so
        # the connection is not left mid-transaction; chain the original
        # exception so the root cause is preserved in the traceback.
        db.rollback()
        raise McExtractAndVectorException(
            "Extractor died while extracting story {}: {}".format(
                stories_id, ex)) from ex

    db.commit()

    log.info("Done extracting story {}.".format(stories_id))
コード例 #6
0
ファイル: stories.py プロジェクト: vishalbelsare/mediacloud
def _extract_story(db: DatabaseHandler, story: dict) -> None:
    """Process the story through the extractor."""

    story_url = story['url']

    # Skip URLs that point at binary files -- nothing to extract there.
    if url_has_binary_extension(story_url):
        return

    # Skip LiveJournal tag / profile pages.
    if re2.search(r'livejournal.com\/(tag|profile)', story_url, re2.I):
        return

    extract_and_process_story(
        db=db,
        story=story,
        extractor_args=PyExtractorArguments(use_cache=True, use_existing=True),
    )
    def test_extract_and_create_download_text(self):
        """Test extract_and_create_download_text()."""
        download_text = extract_and_create_download_text(
            db=self.db,
            download=self.test_download,
            extractor_args=PyExtractorArguments(),
        )

        assert download_text
        assert download_text['download_text'] == 'foo.'
        assert download_text['downloads_id'] == self.test_download['downloads_id']
コード例 #8
0
ファイル: extract.py プロジェクト: timyrankinen/mediacloud
def extract_and_process_story(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story().

    Arguments:
    db - db handle
    story - story dict from db
    extractor_args - extraction arguments, passed through to the extractor
    """

    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    # Only open (and later commit) a transaction if the caller has not
    # already started one.
    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
          AND state = 'success'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # hashes() might return None instead of an empty list (Perl port
    # artifact); "or []" covers both.
    for download in downloads or []:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db,
                                         download=download,
                                         extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
コード例 #9
0
def update_story_sentences_and_language(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating new
    ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).
    """

    story = decode_object_from_bytes_if_needed(story)

    # Only open (and later commit) a transaction if the caller has not
    # already started one.
    use_transaction = not db.in_transaction()

    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
        if not story_text:
            story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text,
                                               story_lang=story_lang)

    # Keep the stories row's language in sync with the language just detected.
    if (not story.get('language',
                      None)) or story.get('language', None) != story_lang:
        db.query(
            """
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
        """, {
                'stories_id': stories_id,
                'story_lang': story_lang
            })
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException(
            "Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))

        # BUGFIX: commit the transaction we opened before returning early;
        # previously this path left the transaction open.
        if use_transaction:
            db.commit()

        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()