def main():
    """Download every chapter of a book into ``text.txt``.

    Command line: ``<script> <website> <url>``.  ``website`` selects one of
    the site configuration classes in the table below and ``url`` is handed
    to that configuration's ``getList`` to obtain the chapter list.

    For each chapter the title and then the chapter text are appended to
    ``text.txt`` in the current working directory (the file is overwritten
    on every run).

    Exits with status 1 after printing a usage hint when an argument is
    missing or the website name is not in the table.
    """
    # Pick the site configuration from the command-line arguments.
    # Indexing sys.argv can only fail with IndexError, so catch just that.
    try:
        website = sys.argv[1]
        url = sys.argv[2]
    except IndexError:
        print("please choose one website")
        sys.exit(1)

    # Supported sites, keyed by the name given on the command line.
    sites = {
        "qidian": Qidian,
        "heiyan": Heiyan,
    }
    if website not in sites:
        # Same hint as for a missing argument instead of a bare KeyError.
        print("please choose one website")
        sys.exit(1)
    config = sites[website]()

    # The spider is built from the three rules exposed by the configuration.
    handler = Spider(config.title, config.content, config.next)
    chapters = config.getList(url)

    # ``with`` guarantees the file is flushed and closed even if a chapter
    # download raises part-way through.
    with open("text.txt", "w") as book:
        for item in chapters:
            # Single parenthesized argument: same output under the Python 2
            # print statement and the Python 3 print function.
            print("正在下载-> " + item["title"])
            content = handler.getContent(item["href"])
            book.write(item["title"] + "\n")
            book.write(content["content"] + "\n")
def startCB(self):
    """Crawl a book chapter by chapter and write it to ``self.filePath``.

    The first chapter URL is read from ``self.entryUrl``.  Each page is
    fetched with ``Spider.getContent``; its ``"title"`` and ``"content"``
    are appended to the output file and its ``"nextUrl"`` becomes the next
    page to fetch.  Crawling stops when the next URL is empty or when a
    page cannot be processed.

    If the URL or the output path is empty an error dialog is shown and
    nothing is written.
    """
    # Scraping rules handed to the spider (title, content, next-chapter link).
    titleKlass = {"class": "j_chapterName"}
    contentKlass = {"class": "j_readContent"}
    nextKlass = {"id": "j_chapterNext"}

    page = self.entryUrl.get()
    spider = Spider(titleKlass, contentKlass, nextKlass)

    # Validate before touching the filesystem: opening an empty path raises,
    # and opening a valid one in "w" mode would truncate an existing file
    # even though nothing is going to be downloaded.
    if page == "" or self.filePath == "":
        tkMessageBox.showerror("woolson", "小说名称或链接未填写!")
        return

    # ``with`` guarantees the output is flushed and closed when crawling ends.
    with open(self.filePath, "w") as book:
        # Keep following the next-chapter link until there is none.
        while page != "":
            result = spider.getContent(page)
            try:
                page = result["nextUrl"]
                book.write(result["title"] + "\n")
                book.write(result["content"] + "\n\n")
                print("正在写入->" + result["title"])
            except Exception as e:
                # Any failure on this page ends the crawl.
                page = ""
                # Prefer the error reported in the result; fall back to the
                # caught exception so the handler itself cannot raise when
                # the result carries no "error" entry.
                try:
                    reason = result["error"]
                except (LookupError, TypeError):
                    reason = e
                print("结束 %s" % (reason,))