def extract_article(url, html):
    """Extract title, body text and publication date of a (Chinese) news article.

    Args:
        url: Source URL of the article (used for URL-based date extraction
            and by the underlying ``Article`` parser).
        html: Raw HTML of the page.

    Returns:
        Tuple ``(title, meta_title, h1_title, article_content, res_date,
        extracted_date, publish_date)`` where ``res_date`` is the best-guess
        publication date, ``extracted_date`` the date derived from the URL,
        and ``publish_date`` the date from the page's meta tags.
    """
    article = Article(url, language='zh')
    # Special handling: make sure </html> is at the end of the document
    html = html_body_pat.sub(' ', html) + '</body></html>'
    article.download(input_html=html)
    article.parse()

    top_node = article.top_node
    title = article.title
    meta_title = None
    h1_title = None
    if len(article.titles) > 1:
        meta_title = article.titles[0]
        h1_title = article.titles[1]
        if top_node is not None:
            # Select best title according to content: pick whichever candidate
            # title is more similar to the extracted body text.
            content = ''.join(top_node.itertext())
            score1 = SequenceMatcher(None, article.titles[0], content).ratio()
            score2 = SequenceMatcher(None, article.titles[1], content).ratio()
            if score1 >= score2:
                title = article.title
            else:
                title = article.titles[1]
    title = strip_site_names_from_title(title).strip()
    if meta_title:
        meta_title = strip_site_names_from_title(meta_title).strip()
    if h1_title:
        h1_title = strip_site_names_from_title(h1_title).strip()

    if title and len(title) > 6 and len(article.top_nodes) > 1:
        # Select best content according to title: fuzzy-match the title
        # against the text of every candidate top node.
        content_map = {}
        for node in article.top_nodes:
            content = escape_pat.sub(' ', ''.join(node.itertext()))
            content = blank_pat.sub(
                '', content[get_content_index(title, content):])
            if content not in content_map:
                content_map[content] = node
        bests = process.extractBests(title, content_map.keys())
        if content_map[bests[0][0]] != article.top_node:
            top_score = 0
            top_len = 0
            for matched_text, score in bests:
                if content_map[matched_text] == article.top_node:
                    top_score = score
                    top_len = len(matched_text)
            # BUGFIX: was len(bests[0]) — the length of the (text, score)
            # tuple, always 2 — so the length guard could never pass.
            # The intended check is against the matched text itself.
            if bests[0][1] > top_score * 2 and (
                    len(bests[0][0]) >= 50
                    or len(bests[0][0]) >= top_len / 2):
                top_node = content_map[bests[0][0]]

    if len(article.top_nodes) > 1:
        # Promote to parent if similar to siblings: when a sibling (or the
        # remainder of the parent) reads similarly to the chosen node, the
        # real article body is probably the parent element.
        top_text = blank_pat.sub('', ''.join(top_node.itertext()))
        for node in article.top_nodes:
            node_text = None
            if node == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))
                node_text = node_text.replace(top_text, '')
            elif node.getparent() == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))
            if node_text:
                s = SequenceMatcher(None, top_text, node_text)
                if s.ratio() >= 0.15:
                    top_node = top_node.getparent()
                    break

    if top_node is None:
        # Try to identify top_node ourselves:
        # first descend to the smallest element containing the title.
        tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
        elem = tree.getroot()
        while True:
            children = list(elem)
            node = None
            # Scan children last-to-first so the latest occurrence wins.
            for i in range(len(children) - 1, -1, -1):
                child = children[i]
                try:
                    if ''.join(child.itertext()).find(title) >= 0:
                        node = child
                        break
                except Exception:
                    # Some nodes (comments, PIs) may not support text search;
                    # skip them. Narrowed from a bare except so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
            if node is not None:
                elem = node
            else:
                break
        # elem is the smallest element containing the title; climb up until
        # a parent holds substantially (3x) more text — that is the body.
        length = len(''.join(elem.itertext()).strip())
        while True:
            node = elem.getparent()
            if node is not None:
                parent_length = len(''.join(node.itertext()).strip())
                if parent_length >= length * 3:
                    top_node = node
                    break
            else:
                top_node = elem
                break
            elem = node

    content = escape_pat.sub(' ', article.format_top_node(top_node, title))

    # Try to identify the publication date. Split the extracted content into
    # the title part and the text part.
    index = get_content_index(title, content)
    article_title = title
    article_text = content[index:].strip()
    article_content = article_text
    if not article_text:
        at = content[:index].strip()
        if at:
            article_text = at
        else:
            article_text = title
            article_title = ''

    tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
    html_text = ' '.join(tree.getroot().itertext())
    html_text = remove_blanks(html_text)
    filtered_article_title = remove_blanks(article_title)
    filtered_article_text = remove_blanks(article_text)

    title_start_idx = -1
    title_end_idx = -1
    content_idx = -1
    res_date = None
    # Initialize explicitly: previously only assigned on some paths, leaving
    # a latent NameError if a later branch read it first.
    res_date_verified = False

    if filtered_article_text:
        # Locate the article text (and then the title before it) inside the
        # full page text, tolerating up to 2 edits.
        x = find_near_matches(filtered_article_text, html_text, max_l_dist=2)
        if x:
            content_idx = x[0].start
            if filtered_article_title:
                y = find_near_matches(
                    filtered_article_title, html_text[:content_idx],
                    max_l_dist=2)
                if y:
                    title_start_idx = y[-1].start
                    title_end_idx = y[-1].end
        else:
            # Whole-text match failed; fall back to matching line by line,
            # from the last line backwards, narrowing the search window.
            title_lines = article_title.split('\n')
            article_lines = article_text.split('\n')
            end = len(html_text)
            nxt = end
            for i in range(len(article_lines) - 1, -1, -1):
                ln = remove_blanks(article_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break
            content_idx = end
            for i in range(len(title_lines) - 1, -1, -1):
                ln = remove_blanks(title_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break
            title_start_idx = end
            title_end_idx = nxt

    if title_start_idx >= 0:
        # Prefer a date located between the title and the body.
        text_between = html_text[title_end_idx:content_idx]
        dates_between = list(article_date_pat.finditer(text_between))
        res_date, res_date_verified, distance = select_date_from_candidates(
            dates_between, -1, text_between)

    if not (res_date and res_date_verified):
        # Search before the title (or before the content if there is no
        # title) and after the content, then pick the better candidate.
        if title_start_idx >= 0:
            search_end = title_start_idx
        else:
            search_end = content_idx
        text_before = html_text[:search_end]
        dates_before = list(article_date_pat.finditer(text_before))
        res_before, res_before_verified, dist_before = \
            select_date_from_candidates(dates_before, -1, text_before)
        text_after = html_text[content_idx:]
        dates_after = list(article_date_pat.finditer(text_after))
        res_after, res_after_verified, dist_after = \
            select_date_from_candidates(dates_after, 1, text_after)
        if res_before:
            if res_after:
                # Verified candidates beat unverified; ties broken by
                # distance from the article body.
                if res_before_verified and not res_after_verified:
                    res_date = res_before
                    res_date_verified = True
                elif res_after_verified and not res_before_verified:
                    res_date = res_after
                    res_date_verified = True
                elif dist_before <= dist_after:
                    if res_before_verified or not res_date:
                        res_date = res_before
                        res_date_verified = res_before_verified
                else:
                    if res_after_verified or not res_date:
                        res_date = res_after
                        res_date_verified = res_after_verified
            else:
                if res_before_verified or not res_date:
                    res_date = res_before
                    res_date_verified = res_before_verified
        elif res_after:
            if res_after_verified or not res_date:
                res_date = res_after
                res_date_verified = res_after_verified

    extracted_date = extractArticlePublishedDate(url, html=article.html)
    # Date from meta tags takes precedence when it disagrees with the
    # heuristic result.
    if article.publish_date:
        if not (res_date and res_date.date() == article.publish_date.date()):
            res_date = article.publish_date
    # Date from URL: only used when the heuristic result is unverified.
    elif extracted_date and not (res_date and res_date_verified):
        res_date = extracted_date

    return (title, meta_title, h1_title, article_content, res_date,
            extracted_date, article.publish_date)