Esempio n. 1
0
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, bs.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        # we first need to avoid treating embedded tweets as text
        for paragraph in text_fragments:
            if isinstance(paragraph, bs.NavigableString):
                all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                all_rough_paragraphs.append(paragraph)

            else:
                if not paragraph.find("blockquote", {"class": "twitter-tweet"}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                    )

        # extracting plaintext links
        for paragraph in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(
                remove_text_formatting_and_links_from_fragments(paragraph)
            )
            for url in plaintext_urls:
                tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
                tags.update(["plaintext", "in text"])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_cleaned_paragraphs = []

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
Esempio n. 2
0
def extract_text_content_and_links(main_content):
    article_text = main_content.find('div', {'id': 'articleText'})

    in_text_tagged_urls = []
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, BeautifulSoup.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    if text_fragments:
        for paragraph in text_fragments:
            if isinstance(paragraph, BeautifulSoup.NavigableString):
                all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))

        for p in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
                tags.update(['plaintext', 'in text'])

                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_clean_paragraphs = []

    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets