def poem_list_crew(self):
    """Crawl the poem-list pages of categories 1..5 and store the poem URLs.

    For every category id ``x`` in 1..5:

    1. Download the category's first list page and ask ``Analyzer`` for the
       number of list pages.
    2. For each list page: register its URL through ``self.db.insert_url``,
       skip it when ``self.db.url_analyzed`` reports it as already handled,
       otherwise download it, extract the poem entries and store them
       through ``self.db.insert_urls`` followed by ``self.db.update_url``.
    3. Stop walking the category early when a page yields no poems and
       ``Analyzer.check_poem_list_last_page`` reports it as the last page.

    Failures are printed and recorded through ``self.db.insert_error``.
    The method returns nothing; its results are the database writes and
    the progress/error lines written to stdout.
    """
    # NOTE(review): the integer arguments passed to insert_url / insert_urls /
    # insert_error (1, 2, 3) look like type or stage codes -- confirm against
    # the db layer; they are passed through unchanged here.
    for category in range(1, 6):
        first_url = 'http://www.haoshiwen.org/type.php?x=%d' % category
        first_content = Downloader.get_html(first_url, 'poemlist')
        if not first_content:
            print(u'分析首页失败')
            # Record the first-page URL that actually failed.  The previous
            # code logged ``page_url``, which is only assigned inside the
            # page loop below: unbound on the first category, and a stale
            # URL from the previous category afterwards.
            self.db.insert_error('analyze_poem_list_first_page_error', 1, 'reason', first_url)
            continue

        page_count = Analyzer.get_page_count(first_content)
        for page in range(1, page_count + 1):
            page_url = 'http://www.haoshiwen.org/type.php?x=%d&page=%d' % (category, page)
            # Store the list-page URL before checking whether it was analyzed.
            self.db.insert_url(page_url, 1)
            if self.db.url_analyzed(page_url):
                # Already analyzed on an earlier run; nothing to do.
                continue

            page_content = Downloader.get_html(page_url, 'poemlist')
            if not page_content:
                print(u'获取页面诗词列表错误')
                # Record error: get_poem_list_error
                self.db.insert_error('get_poem_list_error', 2, 'reason', page_url)
                continue

            # Extract the poem entries from the list page.
            poems = Analyzer.get_poems_from_list_page(page_content)
            if poems:
                # Store the poems, then update the list page's record
                # (presumably marking it as analyzed -- confirm in db layer).
                self.db.insert_urls(poems, 2)
                self.db.update_url(page_url)
                print('%d %d/%d: %s' % (category, page, page_count, page_url))
            elif Analyzer.check_poem_list_last_page(page_content):
                # Last page of this category: stop paging, go to the next one.
                break
            else:
                print(u'分析失败')
                # Record error: analyze_poem_list_error
                self.db.insert_error('analyze_poem_list_error', 3, 'reason', page_url)