Example #1
0
# Requires `from bs4 import BeautifulSoup`. The HTMLParseError caught below
# comes from Python 2's HTMLParser module and no longer exists on Python 3.5+.
def get_news_text_from_html(data):
    """ Given a string of data, locate the span that has the id "textstire" and
  that ends in </span>. It needs to support nested spans.

  Arguments:
    data: A string with the entire html page.
  Returns:
    A string with just the content text.
  """
    # From the data, get just the content. I don't quite understand why this
    # didn't work with a regular expression.
    data = replace_circ_diacritics(data)

    try:
        soup = BeautifulSoup(data, "lxml")
    except HTMLParseError:
        return 'error'

    tag = soup.find('div', id="article_text_content")
    desc = soup.find('meta', {'name': 'description'})

    if desc is None:
        return None

    content = desc['content']
    # Keep everything as text; encoding `content` to bytes here would make the
    # concatenation fail under Python 3.
    return content + ' ' + str(tag)
def get_news_text_from_html(data):
  """ Given a string of data, locate the span that has the id "textstire" and
  that ends in </span>. It needs to support nested spans.

  Arguments:
    data: A string with the entire html page.
  Returns:
    A string with just the content text.
  """
  # From the data, get just the content. I don't quite understand why this
  # didn't work with a regular expression.
  data = replace_circ_diacritics(data)

  try:
    soup = BeautifulSoup(data)
  except HTMLParseError:
    return 'error'

  tag = soup.find('div', id="article_text_content")
  desc = soup.find('meta', {'name': 'description'})

  if desc is None:
    return None

  content = desc['content']
  return content + ' ' + str(tag)
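
The comments in both functions above note that a regular expression did not work for this extraction. A likely reason is nesting: a non-greedy pattern stops at the first closing tag, so any nested element truncates the match, while BeautifulSoup builds a parse tree and is unaffected. The snippet below is a self-contained sketch of that difference; the sample HTML is made up for illustration and the lxml package is assumed to be installed.

import re

from bs4 import BeautifulSoup

# Made-up sample: the target div contains a nested div, the situation the
# docstrings above describe as nested tags.
sample = (
    '<html><body>'
    '<div id="article_text_content">Intro <div class="quote">nested</div> outro</div>'
    '</body></html>'
)

# The non-greedy regex stops at the first </div>, so the text after the
# nested element ("outro") is silently dropped.
match = re.search(r'<div id="article_text_content">(.*?)</div>', sample)
print(match.group(1))        # Intro <div class="quote">nested

# BeautifulSoup parses the markup into a tree, so the nested div is simply
# part of the matched tag and all of its text is available.
soup = BeautifulSoup(sample, "lxml")
tag = soup.find('div', id="article_text_content")
print(tag.get_text())        # Intro nested outro
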
def get_news_text_from_html(data):
  """ Given a string of data, locate the content.

  Arguments:
    data: A string with the entire html page.
  Returns:
    A string with just the content text.
  """
  # From the data, get just the content. I don't quite understand why this
  # didn't work with a regular expression.
  data = replace_circ_diacritics(data)
  data = replace_html_comments(data)

  try:
    soup = BeautifulSoup(data)
  except HTMLParseError:
    return 'error'

  tag = soup.find('div', {'id': 'articleContent'})
  if tag is None:
    return "error: article not found"

  script = tag.findNext('script', { 'type': 'text/javascript'})
  if script is not None:
    script.extract()

  #tag.findNext('div', {'class': 'tool_back'}).extract()
  #links = tag.findNext('div', {'class': 'links'})
  #if links is not None:
  #  links.extract()

  return str(tag)
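
All of the variants here depend on helpers that are not part of this listing: replace_circ_diacritics and, in the later versions, replace_html_comments. Their real implementations are unknown; the sketch below only illustrates one plausible behaviour, assuming the first rewrites the HTML entities for the circumflex-accented letters (â, î) used in Romanian text and the second strips <!-- ... --> comment blocks before parsing. Treat both as hypothetical stand-ins, not the original code.

import re

# Hypothetical stand-ins for the helpers used above; the originals are not
# shown in this listing, so these only sketch one plausible behaviour.

_CIRC_ENTITIES = {
    '&acirc;': 'â', '&Acirc;': 'Â',
    '&icirc;': 'î', '&Icirc;': 'Î',
}


def replace_circ_diacritics(data):
    """Rewrite circumflex HTML entities as plain characters (assumed behaviour)."""
    for entity, char in _CIRC_ENTITIES.items():
        data = data.replace(entity, char)
    return data


def replace_html_comments(data):
    """Drop <!-- ... --> blocks so commented-out markup is not parsed
    (assumed behaviour)."""
    return re.sub(r'<!--.*?-->', '', data, flags=re.DOTALL)
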
Example #4
0
def get_news_text_from_html(data):
    """ Given a string of data, locate the content.

  Arguments:
    data: A string with the entire html page.
  Returns:
    A string with just the content text.
  """
    # From the data, get just the content. I don't quite understand why this
    # didn't work with a regular expression.
    data = replace_circ_diacritics(data)
    data = replace_html_comments(data)

    try:
        soup = BeautifulSoup(data, "lxml")
    except HTMLParseError:
        return 'error'

    tag = soup.find('div', {'id': 'articleContent'})
    if tag is None:
        return "error: article not found"

    script = tag.findNext('script', {'type': 'text/javascript'})
    if script is not None:
        script.extract()

    #tag.findNext('div', {'class': 'tool_back'}).extract()
    #links = tag.findNext('div', {'class': 'links'})
    #if links is not None:
    #  links.extract()

    return str(tag)
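
For context, exercising either variant end to end might look like the following. The URL is a placeholder, requests is only one possible way to fetch the page, and the error handling mirrors the return values seen in the functions above.

import requests

# Placeholder URL; the real source of the pages is not shown in this listing.
url = "https://example.com/some-news-article.html"

response = requests.get(url, timeout=10)
response.raise_for_status()

text = get_news_text_from_html(response.text)
if text is None or text.startswith("error"):
    print("could not extract the article content")
else:
    print(text)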