def extract_article(url, html):
    """Extract title, body text and publication date of a (Chinese) news article.

    Args:
        url: Source URL of the article (used for URL-based date extraction
            and by the underlying ``Article`` parser).
        html: Raw HTML of the page.

    Returns:
        Tuple ``(title, meta_title, h1_title, article_content, res_date,
        extracted_date, publish_date)`` where ``res_date`` is the best-guess
        publication date, ``extracted_date`` the date derived from the URL,
        and ``publish_date`` the date from the page's meta tags.
    """
    article = Article(url, language='zh')
    # Special handling: make sure </html> is at the end of the document
    html = html_body_pat.sub(' ', html) + '</body></html>'
    article.download(input_html=html)
    article.parse()

    top_node = article.top_node
    title = article.title
    meta_title = None
    h1_title = None
    if len(article.titles) > 1:
        meta_title = article.titles[0]
        h1_title = article.titles[1]
        if top_node is not None:
            # Select best title according to content: pick whichever candidate
            # title is more similar to the extracted body text.
            content = ''.join(top_node.itertext())
            score1 = SequenceMatcher(None, article.titles[0], content).ratio()
            score2 = SequenceMatcher(None, article.titles[1], content).ratio()
            if score1 >= score2:
                title = article.title
            else:
                title = article.titles[1]
    title = strip_site_names_from_title(title).strip()
    if meta_title:
        meta_title = strip_site_names_from_title(meta_title).strip()
    if h1_title:
        h1_title = strip_site_names_from_title(h1_title).strip()

    if title and len(title) > 6 and len(article.top_nodes) > 1:
        # Select best content according to title: fuzzy-match the title
        # against the text of every candidate top node.
        content_map = {}
        for node in article.top_nodes:
            content = escape_pat.sub(' ', ''.join(node.itertext()))
            content = blank_pat.sub(
                '', content[get_content_index(title, content):])
            if content not in content_map:
                content_map[content] = node
        bests = process.extractBests(title, content_map.keys())
        if content_map[bests[0][0]] != article.top_node:
            top_score = 0
            top_len = 0
            for matched_text, score in bests:
                if content_map[matched_text] == article.top_node:
                    top_score = score
                    top_len = len(matched_text)
            # BUGFIX: was len(bests[0]) — the length of the (text, score)
            # tuple, always 2 — so the length guard could never pass.
            # The intended check is against the matched text itself.
            if bests[0][1] > top_score * 2 and (
                    len(bests[0][0]) >= 50
                    or len(bests[0][0]) >= top_len / 2):
                top_node = content_map[bests[0][0]]

    if len(article.top_nodes) > 1:
        # Promote to parent if similar to siblings: when a sibling (or the
        # remainder of the parent) reads similarly to the chosen node, the
        # real article body is probably the parent element.
        top_text = blank_pat.sub('', ''.join(top_node.itertext()))
        for node in article.top_nodes:
            node_text = None
            if node == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))
                node_text = node_text.replace(top_text, '')
            elif node.getparent() == top_node.getparent():
                node_text = blank_pat.sub('', ''.join(node.itertext()))
            if node_text:
                s = SequenceMatcher(None, top_text, node_text)
                if s.ratio() >= 0.15:
                    top_node = top_node.getparent()
                    break

    if top_node is None:
        # Try to identify top_node ourselves:
        # first descend to the smallest element containing the title.
        tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
        elem = tree.getroot()
        while True:
            children = list(elem)
            node = None
            # Scan children last-to-first so the latest occurrence wins.
            for i in range(len(children) - 1, -1, -1):
                child = children[i]
                try:
                    if ''.join(child.itertext()).find(title) >= 0:
                        node = child
                        break
                except Exception:
                    # Some nodes (comments, PIs) may not support text search;
                    # skip them. Narrowed from a bare except so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
            if node is not None:
                elem = node
            else:
                break
        # elem is the smallest element containing the title; climb up until
        # a parent holds substantially (3x) more text — that is the body.
        length = len(''.join(elem.itertext()).strip())
        while True:
            node = elem.getparent()
            if node is not None:
                parent_length = len(''.join(node.itertext()).strip())
                if parent_length >= length * 3:
                    top_node = node
                    break
            else:
                top_node = elem
                break
            elem = node

    content = escape_pat.sub(' ', article.format_top_node(top_node, title))

    # Try to identify the publication date. Split the extracted content into
    # the title part and the text part.
    index = get_content_index(title, content)
    article_title = title
    article_text = content[index:].strip()
    article_content = article_text
    if not article_text:
        at = content[:index].strip()
        if at:
            article_text = at
        else:
            article_text = title
            article_title = ''

    tree = cleaner.clean_html(lxml.html.parse(StringIO(article.html)))
    html_text = ' '.join(tree.getroot().itertext())
    html_text = remove_blanks(html_text)
    filtered_article_title = remove_blanks(article_title)
    filtered_article_text = remove_blanks(article_text)

    title_start_idx = -1
    title_end_idx = -1
    content_idx = -1
    res_date = None
    # Initialize explicitly: previously only assigned on some paths, leaving
    # a latent NameError if a later branch read it first.
    res_date_verified = False

    if filtered_article_text:
        # Locate the article text (and then the title before it) inside the
        # full page text, tolerating up to 2 edits.
        x = find_near_matches(filtered_article_text, html_text, max_l_dist=2)
        if x:
            content_idx = x[0].start
            if filtered_article_title:
                y = find_near_matches(
                    filtered_article_title, html_text[:content_idx],
                    max_l_dist=2)
                if y:
                    title_start_idx = y[-1].start
                    title_end_idx = y[-1].end
        else:
            # Whole-text match failed; fall back to matching line by line,
            # from the last line backwards, narrowing the search window.
            title_lines = article_title.split('\n')
            article_lines = article_text.split('\n')
            end = len(html_text)
            nxt = end
            for i in range(len(article_lines) - 1, -1, -1):
                ln = remove_blanks(article_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break
            content_idx = end
            for i in range(len(title_lines) - 1, -1, -1):
                ln = remove_blanks(title_lines[i])
                if len(ln) >= 10:
                    x = find_near_matches(ln, html_text[:end], max_l_dist=2)
                    if x:
                        end = x[-1].start
                        nxt = x[-1].end
                    else:
                        break
            title_start_idx = end
            title_end_idx = nxt

    if title_start_idx >= 0:
        # Prefer a date located between the title and the body.
        text_between = html_text[title_end_idx:content_idx]
        dates_between = list(article_date_pat.finditer(text_between))
        res_date, res_date_verified, distance = select_date_from_candidates(
            dates_between, -1, text_between)

    if not (res_date and res_date_verified):
        # Search before the title (or before the content if there is no
        # title) and after the content, then pick the better candidate.
        if title_start_idx >= 0:
            search_end = title_start_idx
        else:
            search_end = content_idx
        text_before = html_text[:search_end]
        dates_before = list(article_date_pat.finditer(text_before))
        res_before, res_before_verified, dist_before = \
            select_date_from_candidates(dates_before, -1, text_before)
        text_after = html_text[content_idx:]
        dates_after = list(article_date_pat.finditer(text_after))
        res_after, res_after_verified, dist_after = \
            select_date_from_candidates(dates_after, 1, text_after)
        if res_before:
            if res_after:
                # Verified candidates beat unverified; ties broken by
                # distance from the article body.
                if res_before_verified and not res_after_verified:
                    res_date = res_before
                    res_date_verified = True
                elif res_after_verified and not res_before_verified:
                    res_date = res_after
                    res_date_verified = True
                elif dist_before <= dist_after:
                    if res_before_verified or not res_date:
                        res_date = res_before
                        res_date_verified = res_before_verified
                else:
                    if res_after_verified or not res_date:
                        res_date = res_after
                        res_date_verified = res_after_verified
            else:
                if res_before_verified or not res_date:
                    res_date = res_before
                    res_date_verified = res_before_verified
        elif res_after:
            if res_after_verified or not res_date:
                res_date = res_after
                res_date_verified = res_after_verified

    extracted_date = extractArticlePublishedDate(url, html=article.html)
    # Date from meta tags takes precedence when it disagrees with the
    # heuristic result.
    if article.publish_date:
        if not (res_date and res_date.date() == article.publish_date.date()):
            res_date = article.publish_date
    # Date from URL: only used when the heuristic result is unverified.
    elif extracted_date and not (res_date and res_date_verified):
        res_date = extracted_date

    return (title, meta_title, h1_title, article_content, res_date,
            extracted_date, article.publish_date)