def extract_text_content_and_links(soup):
    """Extract the article body's paragraph text and every tagged link.

    Returns a (text, tagged_urls) pair: text is a list of unicode
    paragraph strings (or u"" when the body holds no fragments at all),
    tagged_urls collects both plaintext urls and inline <a> links.
    """
    tagged_urls = []
    article_body = soup.find(attrs={"class": "article-body"})
    paragraph_fragments = article_body.find_all("p")
    header_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    every_fragment = paragraph_fragments + header_fragments

    if not every_fragment:
        text = u""
    else:
        text = []
        for fragment in paragraph_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(fragment)))
            # urls written out as bare text inside the paragraph
            for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    # proper <a> links, gathered from every fragment (headers included)
    anchors = []
    for fragment in every_fragment:
        anchors.extend(fragment.find_all("a"))

    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in anchors]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_links_from_text_hxs(hxs):
    """Tag every link found inside a selector-wrapped piece of article text.

    Covers inline <a> elements, urls spelled out as plain text inside
    <p> nodes, and the sources of embedded <iframe> objects.
    """
    tagged_urls = []

    # intext urls: take all the <a>, except what might be inside a rendered tweet
    for anchor in hxs.select(".//a"):
        title, url = extract_title_and_url(anchor)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(make_tagged_url(url, title, tags))

    # plaintext urls inside paragraph text
    for paragraph in hxs.select(".//p/text()").extract():
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph)):
            tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))

    # embedded objects
    for url in hxs.select(".//iframe/@src").extract():
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return tagged_urls
def extract_intro(soup):
    """Return (intro_text, tagged_urls) for the article's intro box.

    When no intro box exists, returns ("", []).
    """
    tagged_urls = []
    intro_box = soup.find(attrs={"class": "intro"})
    if not intro_box:
        return "", tagged_urls

    intro = utils.remove_text_formatting_markup_from_fragments(intro_box.find_all('b'))

    # inline <a> links inside the intro box
    for anchor in intro_box.find_all("a"):
        title, url, base_tags = extract_title_and_url_from_bslink(anchor)
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    # urls written out in the intro text itself
    for url in utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro)):
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return intro, tagged_urls
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    """Extract cleaned paragraphs and all link-like content from an article body.

    Returns (paragraphs, urls): paragraphs is a list of cleaned text
    fragments; urls concatenates in-text tagged links, plaintext urls
    and any embedded tweets found in the markup.
    """
    article_text = main_content
    in_text_tagged_urls = []
    cleaned_paragraphs = []
    rough_paragraphs = []
    plaintext_tagged_urls = []
    embedded_tweets = []

    def _is_text_content(node):
        # keep bare string nodes and any markup tag we treat as text
        if isinstance(node, bs.NavigableString):
            return True
        return isinstance(node, bs.Tag) and node.name in TEXT_MARKUP_TAGS

    fragments = [c for c in article_text.contents if _is_text_content(c)]
    if not fragments:
        return [], in_text_tagged_urls + plaintext_tagged_urls + embedded_tweets

    # first pass: separate real text from rendered tweets, so tweet
    # markup is never treated as article text
    for fragment in fragments:
        if isinstance(fragment, bs.NavigableString):
            cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(fragment))
            rough_paragraphs.append(fragment)
        elif fragment.find("blockquote", {"class": "twitter-tweet"}):
            embedded_tweets.extend(
                twitter_utils.extract_rendered_tweet(fragment, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            )
        else:
            in_text_tagged_urls.extend(extract_and_tag_in_text_links(fragment))
            cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(fragment))
            rough_paragraphs.append(fragment)

    # second pass: bare urls that appear in the text without an <a> element
    for fragment in rough_paragraphs:
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
            tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            tags.update(["plaintext", "in text"])
            plaintext_tagged_urls.append(make_tagged_url(url, url, tags))

    return cleaned_paragraphs, in_text_tagged_urls + plaintext_tagged_urls + embedded_tweets
def extract_text_and_links_from_paragraph(paragraph_hxs):
    """Return (text, tagged_urls) for one selector-wrapped paragraph.

    Handles text links, image links, plaintext urls inside the
    paragraph's text nodes, and embedded iframes.
    """
    tagged_urls = []

    def _split_links(anchors):
        # anchors wrapping an <img> are treated as image links
        with_img = [a for a in anchors if a.select("./img")]
        without_img = [a for a in anchors if a not in with_img]
        return ([extract_title_and_url(a) for a in without_img],
                [extract_img_link_info(a) for a in with_img])

    titles_and_urls, img_targets_and_urls = _split_links(paragraph_hxs.select(".//a"))

    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            # ghost links (anchors with no real target text) get flagged
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # paragraph text plus any bare urls it contains
    fragments = paragraph_hxs.select("./text()").extract()
    if fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(fragments))
        for fragment in fragments:
            for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # embedded iframes (videos, widgets, ...)
    for iframe in paragraph_hxs.select(".//iframe"):
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls
def extract_links_from_intro(fragment):
    """Collect and tag every link (inline <a> plus plaintext urls) in an
    intro fragment, all tagged 'in intro'."""
    tagged_urls = []

    anchors = fragment.find_all('a')
    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in anchors]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return tagged_urls
def extract_text_content_and_links(main_content):
    """Extract the cleaned article paragraphs and every link-like item
    (in-text links, plaintext urls, embedded tweets) from the article body.

    Returns a (paragraphs, tagged_urls) tuple; paragraphs is a list of
    cleaned text fragments (empty list when no text content is found).
    """
    article_text = main_content.find('div', {'id': 'articleText'})
    in_text_tagged_urls = []
    # raw fragments are kept alongside the cleaned ones so plaintext urls
    # can be scanned for in a second pass
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        # A fragment counts as text when it is either a known text markup
        # tag or a bare string node.
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, BeautifulSoup.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]
    if text_fragments:
        # First pass: separate real text from rendered tweets so tweet
        # markup is not treated as article text.
        for paragraph in text_fragments:
            if isinstance(paragraph, BeautifulSoup.NavigableString):
                all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))
        # Second pass: scan the raw (non-tweet) fragments for urls written
        # out as plain text.
        for p in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
                tags.update(['plaintext', 'in text'])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_clean_paragraphs = []
    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
def extract_text_and_links_from_paragraph(paragraph):
    """Return (text, tagged_urls) for one paragraph element.

    Direct <a> children become tagged urls (image links are titled
    '(img)<src>'); the paragraph text is then scanned for plaintext urls.
    """
    def _url_and_title(link):
        href = link.get('href')
        first_child = link.contents[0]
        if isinstance(first_child, bs.Tag) and first_child.name == 'img':
            # image links are titled with the image source
            return href, '(img){0}'.format(first_child.get('src'))
        return href, remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [_url_and_title(link)
                       for link in paragraph.findAll('a', recursive=False)
                       if link.contents]

    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if not text_fragments:
        return u"", tagged_urls

    text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
    for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments)):
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['plaintext', 'in text'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return text, tagged_urls
def extract_text_content(story):
    """
    Locate the story's body and strip all html formatting from its text.

    Returns a list of unicode strings (one per paragraph, or u"" when the
    body has no paragraphs) together with all plaintext and inline urls
    found, as TaggedURLs.
    """
    body = story.find('div', {'id': 'story_body'})
    paragraphs = body.findAll('p', recursive=False)
    tagged_urls = []
    if not paragraphs:
        return u"", tagged_urls

    text = []
    inline_links = []
    for paragraph in paragraphs:
        text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
        inline_links.extend(paragraph.findAll('a', recursive=True))
        # bare urls written directly in the paragraph text
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph)):
            tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))

    # inline <a> links, skipping those that wrap an image
    for title, url, base_tags in [extract_title_and_url_from_bslink(link)
                                  for link in inline_links if not link.find('img')]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_text_content_and_links(soup):
    """Extract the cleaned paragraph text and all tagged links from the
    detail content box.

    Returns (article_text, tagged_urls): article_text is a list of
    cleaned paragraph strings, tagged_urls covers inline <a> links and
    plaintext urls.
    """
    article_text = []
    inline_links = []
    plaintext_urls = []

    content_box = soup.find(attrs={"id": "detail_content"})
    for fragment in content_box.find_all(attrs={"class": "clear"}):
        for p in fragment.find_all("p", recursive=False):
            clean_text = remove_text_formatting_markup_from_fragments(p, strip_chars="\n")
            if clean_text:
                article_text.append(clean_text)
                # only non-empty paragraphs are scanned for bare urls
                plaintext_urls.extend(utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p)))
            inline_links.extend(p.find_all("a"))

    tagged_urls = []
    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in inline_links]:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in text')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return article_text, tagged_urls