def fetch_save_page(self, root, host, path):
    """Fetch one page and save its extracted content under ``root``.

    The HTML returned by ``fetchlib.fetch_html(host, path)`` is fed to a
    ``TianyaBookContentHTMLParser``.  When the parser yields content, it is
    written to ``<root>/<parser.title>.html``.  When ``parser.content`` is
    ``None`` the fetched HTML and a diagnostic line are printed instead and
    nothing is written.

    Returns ``None`` in both cases.
    """
    page_source = fetchlib.fetch_html(host, path)

    extractor = TianyaBookContentHTMLParser()
    extractor.feed(page_source)

    body = extractor.content
    if body is not None:
        target = root + '/' + extractor.title + '.html'
        # The file object is its own context manager, so it is closed even
        # when the write raises.
        with open(target, 'w') as out:
            out.write(body)
        return

    # Nothing could be extracted: dump the page so the failure can be inspected.
    print(page_source)
    print('content is None')
def fetch_recursive(self, root, host, path):
    """Fetch ``path`` from ``host``, save it under ``root`` and follow its links.

    A path already present in ``self.walked`` is skipped.  Otherwise the path
    is recorded in ``self.walked``, the page is fetched, re-encoded from GBK
    to UTF-8 and saved as ``<root>/<basename of path>`` (``index.html`` when
    the path has no basename):

    * if ``self.is_page(html)`` is true, the page is parsed with
      ``TianyaBookContentHTMLParser`` and saved as an ``<h1>`` title followed
      by the content; no links are followed.  When the parser finds no
      content, the HTML and a diagnostic line are printed and nothing is
      saved;
    * otherwise the result of ``extract_body_inner_html(html)`` is saved and
      every link reported by ``CurrentPageHrefHTMLParser`` is fetched
      recursively, except ``index.html`` / ``index.htm``.

    Returns ``None``.
    """
    if path in self.walked:
        print('fetched %s skip' % (path,))
        return

    # Record the path on entry.  Nothing else in this method updates
    # ``self.walked``, so without this the check above never fires and a cycle
    # of links recurses without end.  The container type is not visible from
    # here, so sets, lists and mappings are all supported.
    walked = self.walked
    if hasattr(walked, 'add'):
        walked.add(path)
    elif hasattr(walked, 'append'):
        walked.append(path)
    else:
        walked[path] = True

    # The return value is not used: the page is read back from
    # ``fetchlib.Cache()`` on the next line.
    # NOTE(review): presumably fetch_html stores the raw response in that
    # cache under 'html' -- confirm against fetchlib.
    fetchlib.fetch_html(host, path)
    html = fetchlib.Cache().get('html').decode('gbk').encode('utf-8')

    # A path ending in '/' has an empty basename; store it as index.html.
    name = os.path.basename(path) or 'index.html'
    target = root + '/' + name

    if self.is_page(html):
        parser = TianyaBookContentHTMLParser()
        parser.feed(html)
        if parser.content is None:
            print(html)
            print('content is None')
            return
        file_put_contents(
            target, '<h1>' + parser.title + '</h1>\n' + parser.content)
        return

    file_put_contents(target, extract_body_inner_html(html))

    parser = CurrentPageHrefHTMLParser()
    parser.feed(html)

    # dirname('/x.html') is '/', which would produce '//href' below; strip the
    # trailing slash so child paths are built with exactly one separator.
    basepath = os.path.dirname(path).rstrip('/')
    for link in parser.links:
        href = link['href']
        if href in ('index.html', 'index.htm'):
            continue
        # Progress output: the link title, on its own line.
        print(link['title'])
        self.fetch_recursive(root, host, basepath + '/' + href)