# url = bookid + url yield url, tools.to_utf8(title, encoding) def parse_page(title, text, proxy=None): encoding = tools.get_encoding(r_meta, text) r = r_title.search(text) if r: title = r.group(1).strip() title = tools.to_utf8(title, encoding) r = r_content.search(text) if r: text = tools.format_html_text(r.group(1), encoding) else: text = '' return title + '\r\n' * 2 + text if __name__ == '__main__': text = file('2100book_a.html', 'rb').read() print 'title=', tools.to_encode(parse_title(text)) print 'index=' i = 0 for url, title in parse_index(text): print tools.to_encode(title), url i += 1 print 'total=', i print 'page=' print tools.to_encode(parse_page('title', file('2100book_b.html').read()))
s = [] for (url, title) in r_index.findall(text): title = title.replace(' ', ' ').strip() yield url, tools.to_utf8(title, encoding) def parse_page(title, text, proxy=None): encoding = tools.get_encoding(r_meta, text) r = r_title.search(text) if r: title = r.group(1).strip() title = tools.to_utf8(title, encoding) r = r_content.search(text) if r: text = tools.format_html_text(r.group(1), encoding) else: text = '' return title + '\r\n'*2 + text if __name__ == '__main__': text = file('bookqq_a.html', 'rb').read() print 'title=', tools.to_encode(parse_title(text)) print 'index=' i = 0 for url, title in parse_index(text): print tools.to_encode(title), url i += 1 print 'total=', i print 'page=' print tools.to_encode(parse_page('title', file('bookqq_b.html').read()))