Example #1
    def fetch_save_page(self, root, host, path):
        html = fetchlib.fetch_html(host, path)
        parser = TianyaBookContentHTMLParser()
        parser.feed(html)
        if parser.content is None:
            # content extraction failed: dump the raw page for debugging
            print html
            print 'content is None'
            return

        # write the extracted chapter to <root>/<title>.html
        with open(root + '/' + parser.title + '.html', 'w') as f:
            f.write(parser.content)
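fetchlib is a project-local module that is not included in the snippet. A minimal sketch of what fetch_html presumably does, assuming a plain HTTP GET over Python 2's httplib (the real helper may also handle caching and retries):

import httplib

def fetch_html(host, path):
    # issue a plain GET and return the response body as a byte string
    conn = httplib.HTTPConnection(host)
    conn.request('GET', path)
    resp = conn.getresponse()
    html = resp.read()
    conn.close()
    return html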
Example #2
    def fetch_recursive(self, root, host, path):
        if path in self.walked:
            print 'fetched', path, 'skip'
            return
        self.walked.add(path)  # remember the path so it is not fetched twice (assumes self.walked is a set)

        # fetch the page and convert it from GBK to UTF-8
        html = fetchlib.fetch_html(host, path)
        html = html.decode('gbk').encode('utf-8')

        # derive the output file name from the URL path
        name = os.path.basename(path)
        if len(name) == 0:
            name = 'index.html'

        if self.is_page(html):
            parser = TianyaBookContentHTMLParser()
            parser.feed(html)
            if parser.content is None:
                # content extraction failed: dump the raw page for debugging
                print html
                print 'content is None'
                return

            inner = '<h1>' + parser.title + '</h1>\n' + parser.content
            file_put_contents(root+'/'+name, inner)
            return
        else:
            inner = extract_body_inner_html(html)

        file_put_contents(root+'/'+name, inner)

        parser = CurrentPageHrefHTMLParser()
        parser.feed(html)
        basepath = os.path.dirname(path)
        for x in parser.links:
            href = x['href']
            if href not in ('index.html', 'index.htm'):
                print x['title'],
                self.fetch_recursive(root, host, basepath + '/' + href)
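file_put_contents and extract_body_inner_html are referenced above but not part of the snippet. Minimal sketches, assuming the first mirrors the PHP helper of the same name and the second simply returns the markup between the <body> tags:

import re

def file_put_contents(filename, data):
    # write data to filename, overwriting any existing file
    with open(filename, 'w') as f:
        f.write(data)

def extract_body_inner_html(html):
    # return everything between <body ...> and </body>,
    # or the whole document if no body tag is found
    m = re.search(r'<body[^>]*>(.*?)</body>', html, re.S | re.I)
    return m.group(1) if m else html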