def get_excerpt(html, try_meta=False, max_chars=10000):
    """Return a short excerpt for the given HTML document.

    html is the document to extract the excerpt from.
    try_meta: when true, the content of the meta description tag is tried
        first and used if it is non-empty.
    max_chars caps the length of the returned excerpt.

    When the meta description is not requested or is empty, the excerpt is
    the line of the body text with the greatest stripped length. If no body
    text is found the excerpt is built from the empty string.
    """
    # Only query the meta description when the caller asked for it.
    text = (
        xpath.get(html, '/html/head/meta[@name="description"]/@content')
        if try_meta
        else ''
    )

    if not text:
        # NOTE(review): presumably `remove` drops these elements from the
        # extracted body so headings, scripts and footers cannot win the
        # longest-line contest -- confirm against xpath.get.
        stripped_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'footer'
        body = xpath.get(html, '/html/body', remove=stripped_tags)
        plain = common.remove_tags(body)
        if plain:
            # Rank lines by stripped length; the line itself is the
            # tie-breaker, so equal-length candidates resolve to the
            # lexicographically greatest one.
            text = max(
                plain.splitlines(),
                key=lambda line: (len(line.strip()), line),
            )

    # Truncation happens after unescaping, so max_chars counts characters of
    # the value returned by common.unescape.
    cleaned = common.unescape(text.strip())
    return cleaned[:max_chars]
def get_excerpt(html, try_meta=False, max_chars=255):
    """Return a short excerpt for the given HTML document.

    html is the document to extract the excerpt from.
    try_meta: when true, the content of the meta description tag is tried
        first and used if it is non-empty.
    max_chars caps the length of the returned excerpt.

    When the meta description is not requested or is empty, the excerpt is
    the line of the body text with the greatest stripped length. If no body
    text is found the excerpt is built from the empty string.
    """
    # Only query the meta description when the caller asked for it.
    if try_meta:
        text = xpath.get(html, '/html/head/meta[@name="description"]/@content')
    else:
        text = ''

    if not text:
        # NOTE(review): presumably `remove` drops these elements from the
        # extracted body so headings and scripts cannot win the longest-line
        # contest -- confirm against xpath.get.
        stripped_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        body = xpath.get(html, '/html/body', remove=stripped_tags)
        plain = common.remove_tags(body)
        if plain:
            # Rank lines by stripped length; the line itself is the
            # tie-breaker, so equal-length candidates resolve to the
            # lexicographically greatest one.
            candidates = plain.splitlines()
            text = max(candidates, key=lambda line: (len(line.strip()), line))

    # Truncation happens after unescaping, so max_chars counts characters of
    # the value returned by common.unescape.
    cleaned = common.unescape(text.strip())
    return cleaned[:max_chars]