class LLXSCrawler(Crawler):
    """Crawler for the LLXS novel site.

    Pages are served GBK-encoded; each fetch is transcoded to UTF-8 via
    the parser before parsing. All HTML parsing is delegated to
    LLXSParser.
    """

    def __init__(self):
        Crawler.__init__(self)
        self.parser = LLXSParser()

    def crawl(self, url):
        """Fetch a novel's main page and its chapter list.

        Returns the novel dict from the parser, augmented with:
        'chapter_list' (list of chapter dicts), 'chapters' (count),
        'update_time' (crawl timestamp), and 'last_chapter' (URL of the
        final chapter, or None when no chapters were found).
        """
        # Fetch the novel page; site serves GBK, transcode to UTF-8.
        html = utils.http_get(url, encode="gbk")
        html = self.parser.to_utf8(html)
        novel = self.parser.parse_novel_page(url, html)

        # Fetch the chapter-list page referenced by the novel page.
        list_url = novel["list_url"]
        html = utils.http_get(list_url, encode="gbk")
        html = self.parser.to_utf8(html)
        chapter_list = self.parser.parse_list_page(list_url, html)

        novel["chapter_list"] = chapter_list
        novel["chapters"] = len(chapter_list)
        novel["update_time"] = time.time()
        # Guard the empty case: the original raised IndexError on
        # chapter_list[-1] when the parser returned no chapters.
        novel["last_chapter"] = chapter_list[-1]["url"] if chapter_list else None
        return novel

    def crawl_content(self, url):
        """Fetch one chapter page and return its parsed content."""
        html = utils.http_get(url, encode="gbk")
        html = self.parser.to_utf8(html)
        return self.parser.parse_content_page(url, html)