# Example #1
# 0
def extract_text_content_and_links_from_articletext(main_content, has_intro=True):
    """Extract cleaned paragraph text and tagged links from a DHNet article body.

    Walks the direct children of *main_content*, separating text paragraphs
    from embedded tweets, then collects three kinds of tagged links:
    in-text anchor links, plaintext URLs, and rendered tweets.

    Args:
        main_content: BeautifulSoup element whose direct children are the
            article's paragraph fragments.
        has_intro: Unused; kept only for backward compatibility with callers.

    Returns:
        A tuple ``(cleaned_paragraphs, tagged_urls)`` where ``tagged_urls``
        is the concatenation of in-text links, plaintext links and tweet
        links, in that order.
    """
    article_text = main_content

    in_text_tagged_urls = []
    all_cleaned_paragraphs = []
    all_rough_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        # Text content is either a known text-carrying markup tag or a
        # bare navigable string.
        if isinstance(blob, bs.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        return isinstance(blob, bs.NavigableString)

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    # First pass: split fragments into text paragraphs vs. embedded tweets.
    # (Embedded tweets must not be treated as article text.)
    for paragraph in text_fragments:
        if isinstance(paragraph, bs.NavigableString):
            all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
            all_rough_paragraphs.append(paragraph)
        elif paragraph.find("blockquote", {"class": "twitter-tweet"}):
            # Rendered tweet: extract it as a tagged link, not as text.
            embedded_tweets.extend(
                twitter_utils.extract_rendered_tweet(paragraph, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            )
        else:
            in_text_tagged_urls.extend(extract_and_tag_in_text_links(paragraph))
            all_cleaned_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph))
            all_rough_paragraphs.append(paragraph)

    # Second pass: find URLs that appear as bare text (no <a> markup) in
    # the paragraphs kept above.
    for paragraph in all_rough_paragraphs:
        plaintext_urls = extract_plaintext_urls_from_text(
            remove_text_formatting_and_links_from_fragments(paragraph)
        )
        for url in plaintext_urls:
            tags = classify_and_tag(url, DHNET_NETLOC, DHNET_INTERNAL_SITES)
            tags.update(["plaintext", "in text"])
            all_plaintext_urls.append(make_tagged_url(url, url, tags))

    return all_cleaned_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets
# Example #2
# 0
def extract_text_content_and_links(main_content):
    """Extract cleaned paragraph text and tagged links from a LaLibre article.

    Looks up the ``div#articleText`` element inside *main_content*, then
    walks its direct children, separating text paragraphs from embedded
    tweets, and collects three kinds of tagged links: in-text anchor
    links, plaintext URLs, and rendered tweets.

    Args:
        main_content: BeautifulSoup element containing the article markup.

    Returns:
        A tuple ``(clean_paragraphs, tagged_urls)`` where ``tagged_urls``
        is the concatenation of in-text links, plaintext links and tweet
        links, in that order.
    """
    # NOTE(review): find() returns None when div#articleText is absent,
    # which would raise AttributeError below — presumably every article
    # page has this div; confirm against the scraper's callers.
    article_text = main_content.find('div', {'id': 'articleText'})

    in_text_tagged_urls = []
    all_rough_paragraphs = []
    all_clean_paragraphs = []
    all_plaintext_urls = []
    embedded_tweets = []

    def is_text_content(blob):
        # Text content is either a known text-carrying markup tag or a
        # bare navigable string.
        if isinstance(blob, BeautifulSoup.Tag) and blob.name in TEXT_MARKUP_TAGS:
            return True
        return isinstance(blob, BeautifulSoup.NavigableString)

    text_fragments = [c for c in article_text.contents if is_text_content(c)]

    # First pass: split fragments into text paragraphs vs. embedded tweets.
    for paragraph in text_fragments:
        if isinstance(paragraph, BeautifulSoup.NavigableString):
            all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
            all_rough_paragraphs.append(paragraph)
        elif paragraph.find('blockquote', {'class': 'twitter-tweet'}):
            # Rendered tweet: extract it as a tagged link, not as text.
            embedded_tweets.extend(
                twitter_utils.extract_rendered_tweet(paragraph, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES))
        else:
            in_text_tagged_urls.extend(extract_and_tag_in_text_links(paragraph))
            all_clean_paragraphs.append(remove_text_formatting_markup_from_fragments(paragraph, strip_chars=' \t\r\n'))
            all_rough_paragraphs.append(paragraph)

    # Second pass: find URLs that appear as bare text (no <a> markup) in
    # the paragraphs kept above.
    for p in all_rough_paragraphs:
        plaintext_urls = extract_plaintext_urls_from_text(remove_text_formatting_and_links_from_fragments(p))
        for url in plaintext_urls:
            tags = classify_and_tag(url, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
            tags.update(['plaintext', 'in text'])
            all_plaintext_urls.append(make_tagged_url(url, url, tags))

    return all_clean_paragraphs, in_text_tagged_urls + all_plaintext_urls + embedded_tweets