Beispiel #1
0
  def test_fetch_page(self):
    """Fetch a cn.wsj.com article and print its extracted body text.

    Downloads the page, slices out the text between the
    ``<!content_tag txt>`` and ``<!/content_tag txt>`` markers, decodes it
    with the charset reported by ``fetchpage.get_charset``, then passes the
    result through ``remove_scriptag`` and ``replace_url`` before printing.

    Requires network access to the hard-coded URL.

    Raises:
      AssertionError: if either content marker is missing from the page.
    """
    url = 'http://cn.wsj.com/gb/20100710/bus094831.asp?source=rss'
    # Prefix of the URL up to and including its last '/'; a trailing '/'
    # on the URL itself is ignored when looking for that separator.
    accessed_url = url[:url.rfind('/') + 1] if url[-1] != '/' else url[:url[:-1].rfind('/') + 1]
    # Single-argument print(...) prints the same as the Python 2 statement.
    print(accessed_url)
    cut_content_from = '<!content_tag txt>'
    cut_content_to = '<!/content_tag txt>'

    resp = urllib2.urlopen(url)
    try:
      html_content = resp.read()
    finally:
      # Close explicitly so the connection is released even if read() raises.
      resp.close()

    charset = fetchpage.get_charset(html_content)

    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently yield unrelated text, so fail loudly instead.
    content_tag_start = html_content.find(cut_content_from)
    if content_tag_start == -1:
      raise AssertionError('start marker %r not found in %s' % (cut_content_from, url))
    content_tag_end = html_content.find(cut_content_to, content_tag_start)
    if content_tag_end == -1:
      raise AssertionError('end marker %r not found in %s' % (cut_content_to, url))

    # Skip past the start marker itself; use its length rather than a
    # duplicated literal so the two cannot drift apart.
    content = html_content[content_tag_start + len(cut_content_from):content_tag_end].decode(charset)
    # NOTE(review): remove_scriptag / replace_url are called as bare names
    # here; presumably they are imported at module level - confirm.
    content = remove_scriptag(content, 'script')
    content = replace_url(content, accessed_url)
    print(content)
Beispiel #2
0
 def test_get_page(self):
   """Fetch a cn.wsj.com article and print its extracted body text.

   Downloads the page, slices out the text between the
   ``<!content_tag txt>`` and ``<!/content_tag txt>`` markers, decodes it
   with the charset reported by ``fetchpage.get_charset`` (undecodable
   bytes are dropped), then passes the result through
   ``fetchpage.remove_scriptag`` and ``fetchpage.replace_url``.
   Progress markers ('start', 'finished', 'end') are printed around the work.

   Requires network access to the hard-coded URL.

   Raises:
     AssertionError: if either content marker is missing from the page.
   """
   # Single-argument print(...) prints the same as the Python 2 statement.
   print('start')
   url = 'http://cn.wsj.com/gb/20100721/rth080855.asp?source=rss'

   resp = urllib2.urlopen(url)
   try:
     html_content = resp.read()
   finally:
     # Close explicitly so the connection is released even if read() raises.
     resp.close()

   cut_content_from = '<!content_tag txt>'
   cut_content_to = '<!/content_tag txt>'

   charset = fetchpage.get_charset(html_content)

   # str.find returns -1 when the marker is absent; slicing with that value
   # would silently yield unrelated text, so fail loudly instead.
   content_tag_start = html_content.find(cut_content_from)
   if content_tag_start == -1:
     raise AssertionError('start marker %r not found in %s' % (cut_content_from, url))
   content_tag_end = html_content.find(cut_content_to, content_tag_start)
   if content_tag_end == -1:
     raise AssertionError('end marker %r not found in %s' % (cut_content_to, url))

   # Skip past the start marker itself before decoding the article body.
   content = html_content[content_tag_start + len(cut_content_from):content_tag_end].decode(charset, 'ignore')
   content = fetchpage.remove_scriptag(content)
   # Prefix of the URL up to and including its last '/'; a trailing '/'
   # on the URL itself is ignored when looking for that separator.
   accessed_url = url[:url.rfind('/') + 1] if url[-1] != '/' else url[:url[:-1].rfind('/') + 1]
   content = fetchpage.replace_url(content, accessed_url)

   print('finished')
   print(content)
   print('end')