def get_news_text_from_html(data):
    """
    Extract the article text from a full HTML page.

    The result is the content of the page's ``<meta name="description">``
    tag, followed by a single space and the serialized
    ``<div id="article_text_content">`` element.

    Arguments:
        data: A string with the entire html page.

    Returns:
        A string with the description and the article markup. Returns the
        string ``'error'`` if the page could not be parsed, and ``None`` if
        the page has no description meta tag (or the tag carries no
        ``content`` attribute).
    """
    data = replace_circ_diacritics(data)
    try:
        soup = BeautifulSoup(data, "lxml")
    except HTMLParseError:
        return 'error'

    desc = soup.find('meta', {'name': 'description'})
    if desc is None:
        return None
    # A description tag without a content attribute is treated exactly like
    # a missing tag instead of raising KeyError on the lookup.
    content = desc.get('content')
    if content is None:
        return None

    tag = soup.find('div', id="article_text_content")
    # find() yields None when the div is absent; str(None) would leak the
    # literal text "None" into the result, so fall back to an empty body.
    body = str(tag) if tag is not None else ''

    # NOTE(review): concatenating the encoded description with str values
    # relies on Python 2 string semantics -- confirm the target interpreter.
    return content.encode('UTF-8') + ' ' + body
def get_news_text_from_html(data):
    """
    Extract the article text from a full HTML page.

    The result is the content of the page's ``<meta name="description">``
    tag, followed by a single space and the serialized
    ``<div id="article_text_content">`` element.

    Arguments:
        data: A string with the entire html page.

    Returns:
        A string with the description and the article markup. Returns the
        string ``'error'`` if the page could not be parsed, and ``None`` if
        the page has no description meta tag (or the tag carries no
        ``content`` attribute).
    """
    data = replace_circ_diacritics(data)
    try:
        soup = BeautifulSoup(data)
    except HTMLParseError:
        return 'error'

    desc = soup.find('meta', {'name': 'description'})
    if desc is None:
        return None
    # A description tag without a content attribute is treated exactly like
    # a missing tag instead of raising KeyError on the lookup.
    content = desc.get('content')
    if content is None:
        return None

    tag = soup.find('div', id="article_text_content")
    # find() yields None when the div is absent; str(None) would leak the
    # literal text "None" into the result, so fall back to an empty body.
    body = str(tag) if tag is not None else ''

    # NOTE(review): concatenating the encoded description with str values
    # relies on Python 2 string semantics -- confirm the target interpreter.
    return content.encode('UTF-8') + ' ' + body
def get_news_text_from_html(data):
    """
    Pull the article markup out of a complete HTML page.

    The page is first passed through ``replace_circ_diacritics`` and
    ``replace_html_comments``, then parsed, and the
    ``<div id="articleContent">`` element is serialized and returned.

    Arguments:
        data: A string with the entire html page.

    Returns:
        The serialized article div as a string, ``'error'`` if the page
        could not be parsed, or ``"error: article not found"`` if the page
        has no such div.
    """
    cleaned = replace_html_comments(replace_circ_diacritics(data))

    try:
        document = BeautifulSoup(cleaned)
    except HTMLParseError:
        return 'error'

    article = document.find('div', {'id': 'articleContent'})
    if article is None:
        return "error: article not found"

    # Detach the first text/javascript script that follows the start of the
    # article div so that it does not end up in the serialized output.
    # (Stripping of the 'tool_back' and 'links' divs used to happen here as
    # well, but is currently disabled.)
    trailing_script = article.findNext('script', {'type': 'text/javascript'})
    if trailing_script is not None:
        trailing_script.extract()

    return str(article)
def get_news_text_from_html(data):
    """
    Pull the article markup out of a complete HTML page.

    The page is first passed through ``replace_circ_diacritics`` and
    ``replace_html_comments``, then parsed with the lxml parser, and the
    ``<div id="articleContent">`` element is serialized and returned.

    Arguments:
        data: A string with the entire html page.

    Returns:
        The serialized article div as a string, ``'error'`` if the page
        could not be parsed, or ``"error: article not found"`` if the page
        has no such div.
    """
    cleaned = replace_html_comments(replace_circ_diacritics(data))

    try:
        document = BeautifulSoup(cleaned, "lxml")
    except HTMLParseError:
        return 'error'

    article = document.find('div', {'id': 'articleContent'})
    if article is None:
        return "error: article not found"

    # Detach the first text/javascript script that follows the start of the
    # article div so that it does not end up in the serialized output.
    # (Stripping of the 'tool_back' and 'links' divs used to happen here as
    # well, but is currently disabled.)
    trailing_script = article.findNext('script', {'type': 'text/javascript'})
    if trailing_script is not None:
        trailing_script.extract()

    return str(article)