def extract_text_content_and_links(soup):
    """Extract the article body's paragraph text and every tagged link.

    Returns a (text, tagged_urls) pair: text is a list of unicode
    paragraph strings (or u"" when the body holds no fragments at all),
    tagged_urls collects both plaintext urls and inline <a> links.
    """
    tagged_urls = []
    article_body = soup.find(attrs={"class": "article-body"})
    paragraph_fragments = article_body.find_all("p")
    header_fragments = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    every_fragment = paragraph_fragments + header_fragments

    if not every_fragment:
        text = u""
    else:
        text = []
        for fragment in paragraph_fragments:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(fragment)))
            # urls written out as bare text inside the paragraph
            for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    # proper <a> links, gathered from every fragment (headers included)
    anchors = []
    for fragment in every_fragment:
        anchors.extend(fragment.find_all("a"))

    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in anchors]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_links_from_text_hxs(hxs):
    """Tag every link found inside a selector-wrapped piece of article text.

    Covers inline <a> elements, urls spelled out as plain text inside
    <p> nodes, and the sources of embedded <iframe> objects.
    """
    tagged_urls = []

    # intext urls: take all the <a>, except what might be inside a rendered tweet
    for anchor in hxs.select(".//a"):
        title, url = extract_title_and_url(anchor)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(make_tagged_url(url, title, tags))

    # plaintext urls inside paragraph text
    for paragraph in hxs.select(".//p/text()").extract():
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph)):
            tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))

    # embedded objects
    for url in hxs.select(".//iframe/@src").extract():
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags = tags.union(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return tagged_urls
def extract_intro(soup):
    """Return (intro_text, tagged_urls) for the article's intro box.

    When no intro box exists, returns ("", []).
    """
    tagged_urls = []
    intro_box = soup.find(attrs={"class": "intro"})
    if not intro_box:
        return "", tagged_urls

    intro = utils.remove_text_formatting_markup_from_fragments(intro_box.find_all('b'))

    # inline <a> links inside the intro box
    for anchor in intro_box.find_all("a"):
        title, url, base_tags = extract_title_and_url_from_bslink(anchor)
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    # urls written out in the intro text itself
    for url in utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(intro)):
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return intro, tagged_urls
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    """Extract cleaned paragraphs and all link-like content from an article body.

    Returns (paragraphs, urls): paragraphs is a list of cleaned text
    fragments; urls concatenates in-text tagged links, plaintext urls
    and any embedded tweets found in the markup.
    """
    article_text = main_content
    in_text_tagged_urls = []
    cleaned_paragraphs = []
    rough_paragraphs = []
    plaintext_tagged_urls = []
    embedded_tweets = []

    def _is_text_content(node):
        # keep bare string nodes and any markup tag we treat as text
        if isinstance(node, bs.NavigableString):
            return True
        return isinstance(node, bs.Tag) and node.name in TEXT_MARKUP_TAGS

    fragments = [c for c in article_text.contents if _is_text_content(c)]
    if not fragments:
        return [], in_text_tagged_urls + plaintext_tagged_urls + embedded_tweets

    # first pass: separate real text from rendered tweets, so tweet
    # markup is never treated as article text
    for fragment in fragments:
        if isinstance(fragment, bs.NavigableString):
            cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(fragment))
            rough_paragraphs.append(fragment)
        elif fragment.find("blockquote", {"class": "twitter-tweet"}):
            embedded_tweets.extend(
                twitter_utils.extract_rendered_tweet(fragment, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            )
        else:
            in_text_tagged_urls.extend(extract_and_tag_in_text_links(fragment))
            cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(fragment))
            rough_paragraphs.append(fragment)

    # second pass: bare urls that appear in the text without an <a> element
    for fragment in rough_paragraphs:
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
            tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            tags.update(["plaintext", "in text"])
            plaintext_tagged_urls.append(make_tagged_url(url, url, tags))

    return cleaned_paragraphs, in_text_tagged_urls + plaintext_tagged_urls + embedded_tweets
def extract_text_and_links_from_paragraph(paragraph_hxs):
    """Return (text, tagged_urls) for one selector-wrapped paragraph.

    Handles text links, image links, plaintext urls inside the
    paragraph's text nodes, and embedded iframes.
    """
    tagged_urls = []

    def _split_links(anchors):
        # anchors wrapping an <img> are treated as image links
        with_img = [a for a in anchors if a.select("./img")]
        without_img = [a for a in anchors if a not in with_img]
        return ([extract_title_and_url(a) for a in without_img],
                [extract_img_link_info(a) for a in with_img])

    titles_and_urls, img_targets_and_urls = _split_links(paragraph_hxs.select(".//a"))

    for title, url in titles_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text'])
        if title == constants.GHOST_LINK_TITLE:
            # ghost links (anchors with no real target text) get flagged
            tags.update([constants.GHOST_LINK_TAG])
        tagged_urls.append(make_tagged_url(url, title, tags))

    for img_target, url in img_targets_and_urls:
        tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
        tags.update(['in text', 'embedded image'])
        tagged_urls.append(make_tagged_url(url, img_target, tags))

    # paragraph text plus any bare urls it contains
    fragments = paragraph_hxs.select("./text()").extract()
    if fragments:
        text = u"".join(remove_text_formatting_markup_from_fragments(fragments))
        for fragment in fragments:
            for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
                tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
                tags.update(['plaintext', 'in text'])
                tagged_urls.append(make_tagged_url(url, url, tags))
    else:
        text = u""

    # embedded iframes (videos, widgets, ...)
    for iframe in paragraph_hxs.select(".//iframe"):
        target_url, tags = extract_and_tag_iframe_source(iframe)
        tagged_urls.append(make_tagged_url(target_url, "__EMBEDDED_IFRAME__", tags))

    return text, tagged_urls
def extract_links_from_intro(fragment):
    """Collect and tag every link (inline <a> plus plaintext urls) in an
    intro fragment, all tagged 'in intro'."""
    tagged_urls = []

    anchors = fragment.find_all('a')
    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in anchors]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(fragment)):
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.add('in intro')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return tagged_urls
def extract_text_content_and_links(main_content):
    """Extract the cleaned article paragraphs and every link-like item
    (in-text links, plaintext urls, embedded tweets) from the article body.

    Returns a (paragraphs, tagged_urls) tuple; paragraphs is a list of
    cleaned text fragments (empty list when no text content is found).
    """
    article_text = main_content.find('div', {'id': 'articleText'})
    in_text_tagged_urls = []
    # raw fragments are kept alongside the cleaned ones so plaintext urls
    # can be scanned for in a second pass
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        # A fragment counts as text when it is either a known text markup
        # tag or a bare string node.
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        if isinstance(blob, BeautifulSoup.NavigableString):
            return True
        return False

    text_fragments = [c for c in article_text.contents if is_text_content(c)]
    if text_fragments:
        # First pass: separate real text from rendered tweets so tweet
        # markup is not treated as article text.
        for paragraph in text_fragments:
            if isinstance(paragraph, BeautifulSoup.NavigableString):
                all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                all_rough_paragraphs.append(paragraph)
            else:
                if not paragraph.find('blockquote', {'class': 'twitter-tweet'}):
                    in_text_links = extract_and_tag_in_text_links(paragraph)
                    in_text_tagged_urls.extend(in_text_links)
                    all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
                    all_rough_paragraphs.append(paragraph)
                else:
                    embedded_tweets.extend(
                        twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))
        # Second pass: scan the raw (non-tweet) fragments for urls written
        # out as plain text.
        for p in all_rough_paragraphs:
            plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
            for url in plaintext_urls:
                tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
                tags.update(['plaintext', 'in text'])
                all_plaintext_urls.append(make_tagged_url(url, url, tags))
    else:
        all_clean_paragraphs = []
    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
def extract_text_and_links_from_paragraph(paragraph):
    """Return (text, tagged_urls) for one paragraph element.

    Direct <a> children become tagged urls (image links are titled
    '(img)<src>'); the paragraph text is then scanned for plaintext urls.
    """
    def _url_and_title(link):
        href = link.get('href')
        first_child = link.contents[0]
        if isinstance(first_child, bs.Tag) and first_child.name == 'img':
            # image links are titled with the image source
            return href, '(img){0}'.format(first_child.get('src'))
        return href, remove_text_formatting_markup_from_fragments(link.contents)

    # Why do we filter on link.contents? Because sometimes there
    # are <a id="more"></a> links which point to nothing.
    # Awesome.
    urls_and_titles = [_url_and_title(link)
                       for link in paragraph.findAll('a', recursive=False)
                       if link.contents]

    tagged_urls = []
    for url, title in urls_and_titles:
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    text_fragments = paragraph.contents
    if not text_fragments:
        return u"", tagged_urls

    text = u"".join(remove_text_formatting_markup_from_fragments(text_fragments))
    for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(text_fragments)):
        tags = classify_and_tag(url, SUDPRESSE_OWN_NETLOC, SUDPRESSE_INTERNAL_SITES)
        tags.update(['plaintext', 'in text'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return text, tagged_urls
def extract_text_content(story):
    """
    Locate the story's body and strip all html formatting from its text.

    Returns a list of unicode strings (one per paragraph, or u"" when the
    body has no paragraphs) together with all plaintext and inline urls
    found, as TaggedURLs.
    """
    body = story.find('div', {'id': 'story_body'})
    paragraphs = body.findAll('p', recursive=False)
    tagged_urls = []
    if not paragraphs:
        return u"", tagged_urls

    text = []
    inline_links = []
    for paragraph in paragraphs:
        text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
        inline_links.extend(paragraph.findAll('a', recursive=True))
        # bare urls written directly in the paragraph text
        for url in extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(paragraph)):
            tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))

    # inline <a> links, skipping those that wrap an image
    for title, url, base_tags in [extract_title_and_url_from_bslink(link)
                                  for link in inline_links if not link.find('img')]:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_text_content_and_links(soup):
    """Extract the cleaned paragraph text and all tagged links from the
    detail content box.

    Returns (article_text, tagged_urls): article_text is a list of
    cleaned paragraph strings, tagged_urls covers inline <a> links and
    plaintext urls.
    """
    article_text = []
    inline_links = []
    plaintext_urls = []

    content_box = soup.find(attrs={"id": "detail_content"})
    for fragment in content_box.find_all(attrs={"class": "clear"}):
        for p in fragment.find_all("p", recursive=False):
            clean_text = remove_text_formatting_markup_from_fragments(p, strip_chars="\n")
            if clean_text:
                article_text.append(clean_text)
                # only non-empty paragraphs are scanned for bare urls
                plaintext_urls.extend(utils.extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p)))
            inline_links.extend(p.find_all("a"))

    tagged_urls = []
    for title, url, base_tags in [extract_title_and_url_from_bslink(a) for a in inline_links]:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags)
        tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.add('in text')
        tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return article_text, tagged_urls