def get_text_for_word_counts(db: DatabaseHandler, story: dict) -> str:
    """Get story title + description + body concatenated into a single string.

    This is what is used to fetch text to generate story_sentences, which eventually get imported into Solr.

    If the text of the story ends up being shorter than the description, return the title + description instead of the
    story text (some times the extractor falls down and we end up with better data just using the title + description.
    """
    story = decode_object_from_bytes_if_needed(story)

    # Prefer the full RSS text when the story claims to carry one; otherwise use the extracted body
    if story['full_text_rss']:
        story_text = __get_full_text_from_rss(story)
    else:
        story_text = _get_extracted_text(db=db, story=story)

    # Normalize None to empty strings before comparing lengths
    story_text = story_text or ''
    story_description = story.get('description', '') or ''

    # Extraction fallback: a body shorter than its own description suggests the extractor failed
    if not story_text or len(story_text) < len(story_description):
        parts = [html_strip(story['title']).strip()]
        if story_description:
            parts.append(html_strip(story_description).strip())
        story_text = "\n\n".join(parts)

    return story_text
def __get_full_text_from_rss(story: dict) -> str:
    """Return the story's HTML-stripped title and description joined by a blank line."""
    story = decode_object_from_bytes_if_needed(story)

    title = html_strip(story.get('title', ''))
    description = html_strip(story.get('description', ''))

    return title + "\n\n" + description
async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]:
    """Guess the story's language from its title + description; return it as a BCP 47 code or None."""
    log.info(f"Identifying story language for story {stories_id}...")

    db = connect_to_db_or_raise()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McPermanentError(f"Story {stories_id} was not found.")

    # Podcast episodes typically come with title and description set so try guessing from that
    sample_text = "\n".join([story['title'], html_strip(story['description'])])

    bcp_47_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

        # Convert to BCP 47 identifier
        bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
            iso_639_1_code=iso_639_1_language_code,
            url_hint=story['url'],
        )

    log.info(f"Language code for story {stories_id} is {bcp_47_language_code}")

    return bcp_47_language_code
def import_archive_file(db: DatabaseHandler, file: str) -> None:
    """Import AP story described by XML in file into database.

    :param db: Database handler.
    :param file: Path to an AP archive (ATOM + NITF) XML file.
    """
    log.debug("import ap file: %s" % file)

    with open(file) as fd:
        xml = xmltodict.parse(fd.read())

    entry = xml['sATOM']['entry']
    body = entry['content']['nitf']['body']

    story = dict()
    story['title'] = body['body.head']['hedline']['hl1']['#text']
    story['publish_date'] = entry['published']
    story['description'] = body['body.head'].get('abstract', story['title'])
    story['guid'] = entry['id'].replace('urn:publicid:ap.org:', '')
    story['url'] = entry['link']['@href'] if 'link' in entry else 'http://apnews.com/invalid/%s' % story['guid']

    # make sure body.content is the only child of body; otherwise the unparse command below will fail
    body_content = body.get('body.content', {})
    # FIX: the old expression `body_content['block'] if body_content is not None else {}` raised
    # KeyError whenever 'body.content' was missing (default {} is not None) or lacked a 'block'
    # child; use .get() and treat any falsy value (None from an empty XML element, or {}) as empty.
    content_block = body_content.get('block', {}) if body_content else {}

    content = xmltodict.unparse({'html': {'content': content_block}})
    story['text'] = html_strip(content)

    _import_ap_story(db, story)
def _call_extractor_on_html(content: str) -> dict:
    """Call extractor on the content."""
    content = decode_object_from_bytes_if_needed(content)

    article_html = extract_article_from_html(content)

    return {
        'extracted_html': article_html,
        'extracted_text': html_strip(article_html),
    }
def combine_story_title_description_text(story_title: Optional[str],
                                         story_description: Optional[str],
                                         download_texts: List[str]) -> str:
    """Get the combined story title, story description, and download text of the story in a consistent way."""
    story_title = decode_object_from_bytes_if_needed(story_title)
    story_description = decode_object_from_bytes_if_needed(story_description)
    download_texts = decode_object_from_bytes_if_needed(download_texts)

    # Title and description get HTML-stripped; download texts are appended as-is
    pieces = [
        html_strip(story_title if story_title is not None else ''),
        html_strip(story_description if story_description is not None else ''),
    ]
    pieces.extend(download_texts)

    return "\n***\n\n".join(pieces)
def _call_extractor_on_html(content: str) -> dict:
    """Call extractor on the content."""
    content = decode_object_from_bytes_if_needed(content)

    results = extract_article_html_from_page_html(content)
    article_html = results['extracted_html']

    return {
        'extracted_html': article_html,
        'extracted_text': html_strip(article_html),
        'extractor_version': results['extractor_version'],
    }
def test_html_strip() -> None:
    """Test html_strip()."""
    assert html_strip("<strong>Hellonot </strong>") == "Hellonot"
    assert html_strip("<script>delete</script><p>body</p>") == "body."
    assert html_strip("<title>delete</title><p>content</p>") == "content."
    assert html_strip("<title>delete</title><p>content</p>", include_title=True) == "delete. content."
    assert html_strip("<p>foo\xAD</p>") == "foo."
    # FIX: the input literal had been mangled to a bare "&" (adjacent strings '"&" ""'), which can
    # never equal the expected '&"'; restore the HTML entities so decoding is actually exercised.
    assert html_strip("&amp;&quot;") == '&"'

    html_path = mediawords.util.paths.mc_root_path() + '/mediacloud/test-data/html/strip.html'
    with open(html_path, 'r', encoding='utf8') as fh:
        html = fh.read()

    # just sanity test here to make sure there are no errors without having to use a fixture
    got_text = html_strip(html)
    assert (len(got_text) > 0.05 * len(html))
    assert '<' not in got_text
def fetch_and_store_episode(db: DatabaseHandler,
                            stories_id: int,
                            config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, converts the language code
       to BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:
        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding video stream from the media file, and / or
        c) discarding other audio streams from the media file;
    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    """
    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine language of the story
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    # Fall back to English when identification was not reliable / returned nothing
    if not iso_639_1_language_code:
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(f"There were no viable enclosures found for story {stories_id}")

    # Length is optional in the feed; only enforce the cap when it was declared
    if best_enclosure.length:
        if best_enclosure.length > MAX_ENCLOSURE_SIZE:
            raise McPodcastEnclosureTooBigException(f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}")

    # Fetch enclosure
    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)
    log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
    fetch_big_file(url=best_enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE)
    log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

    if os.stat(input_file_path).st_size == 0:
        # Might happen with misconfigured webservers
        raise McPodcastFileFetchFailureException(f"Fetched file {input_file_path} is empty.")

    # Transcode if needed
    input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=input_filename)
    transcoded_file_obj = transcode_media_file_if_needed(input_media_file=input_file_obj)

    # Unset the variable so that we don't accidentally use it later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    # (Re)read the properties of either the original or the transcoded file
    media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
    best_audio_stream = media_info.best_supported_audio_stream()

    # Store input file to GCS
    try:
        gcs = GCSStore(config=config)
        gcs_uri = gcs.store_object(
            local_file_path=input_file_obj.temp_full_path,
            object_id=str(stories_id),
            mime_type=best_audio_stream.audio_codec_class.mime_type(),
        )
    except Exception as ex:
        log.error(f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}")
        # Clean up, then raise further
        _cleanup_temp_dir(temp=input_file_obj)
        raise ex

    # Clean up the locally stored file as we don't need it anymore
    _cleanup_temp_dir(temp=input_file_obj)

    # Insert everything to the database; upsert so that re-fetching a story updates its episode row
    try:
        db.query("""
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, {
            'stories_id': stories_id,
            'story_enclosures_id': best_enclosure.story_enclosures_id,
            'gcs_uri': gcs_uri,
            'duration': best_audio_stream.duration,
            'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(),
            'sample_rate': best_audio_stream.sample_rate,
            'bcp47_language_code': bcp_47_language_code,
        })
    except Exception as ex_db:
        # Try to delete object on GCS first
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}"))

        raise McPodcastPostgreSQLException(f"Failed inserting episode for story {stories_id}: {ex_db}")
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content().
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    # A story with content cannot claim to be full-text RSS; clear the flag both in memory and in DB
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    # Create the raw 'content' download row for this story
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    # Record the extracted text for the download
    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Split the extracted text and insert one story_sentences row per sentence, numbered from 1
    sentences = lang.split_text_to_sentences(extracted_content)

    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            # Per-sentence language detection may fail; default to English in that case
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read back the download_text row so the caller gets the stored version
    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
        'downloads_id': download['downloads_id']
    }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story