def downloadSection(self, sectionInfo):
    """Download every chapter listed in ``sectionInfo``, one after another.

    Resets ``sectionDownloadSuccCount`` and ``sectionDownloadFailedCount``
    on ``self`` to zero, passes each chapter (with its position) to
    ``downloadOneSection``, and finally logs a summary built from those
    two counters.

    Args:
        sectionInfo: Object exposing ``chapters`` (a sized sequence) and
            ``bookInfo.uniqueKey``; the latter is forwarded to
            ``downloadOneSection`` as the target directory.
    """
    total = len(sectionInfo.chapters)
    Log.I("[I] on downloadSection() enter will download section count %s" % total)

    self.sectionDownloadSuccCount = 0
    self.sectionDownloadFailedCount = 0

    targetDir = sectionInfo.bookInfo.uniqueKey
    for position, chapter in enumerate(sectionInfo.chapters):
        self.downloadOneSection(position, chapter, targetDir)

    summary = (total, self.sectionDownloadSuccCount,
               self.sectionDownloadFailedCount)
    Log.I(
        "[I] on downloadSection() exit download all section(%s) succ section(%s) failed section(%s)"
        % summary)
def downloadBookImg(self, url, toDir):
    """Fetch the image at ``url`` unless storage already has it.

    The storage key is ``Utils.md5str(url)``. ``self.storge.checkFileExists``
    is handed a callback that receives the existence flag; the callback
    triggers ``_downloadBookImg`` only when the flag is falsy.

    Args:
        url: Image URL; concatenated into the log line, so it must be a str.
        toDir: Directory identifier forwarded to the storage layer and to
            ``_downloadBookImg``.
    """
    Log.I("[I] dowlonad bookImg " + url)
    imgKey = Utils.md5str(url)

    def onChecked(exists):
        # Nothing to do when the file is already stored.
        if exists:
            return False
        return self._downloadBookImg(url, imgKey, toDir)

    self.storge.checkFileExists(imgKey, toDir, onChecked)
def sectionInfo(self, bookInfo, muluUrl, bookMuluSoup):
    """Build a ``SectionInfoModel`` from the chapter list of a catalogue page.

    Locates the first ``<ul>`` whose first CSS class is ``mulu_list``,
    then walks its direct children looking for an ``<a>`` in each one.
    Every anchor with both a resolvable ``href`` and a ``string`` is added
    to the model via ``addChapter``; anchors lacking either are logged
    with ``Log.W`` and skipped.

    Args:
        bookInfo: Stored on the returned model as ``model.bookInfo``.
        muluUrl: Base URL passed to ``Utils.absoluteUrl`` when resolving
            each anchor's ``href``.
        bookMuluSoup: Parsed catalogue page; must support ``find`` with a
            predicate function.

    Returns:
        The populated model, or ``None`` when the list is missing, empty,
        or yields no valid chapter.
    """
    Log.I("[I] on get sectionInfo")
    model = SectionInfoModel()
    model.bookInfo = bookInfo

    def isMuluList(tag):
        # Guard against an empty class list before indexing into it.
        classes = tag.get("class")
        return tag.name == "ul" and bool(classes) and classes[0] == "mulu_list"

    muluList = bookMuluSoup.find(isMuluList)
    if muluList is None or not muluList.contents:
        return None

    added = False
    for child in muluList.contents:
        atag = child.find("a")
        # find() gives None when an element has no <a>, and an int when
        # the child is a text node (plain string search). The original
        # only rejected -1, so both of the other cases fell through and
        # crashed on atag["href"].
        if atag is None or isinstance(atag, int):
            continue

        # A missing href attribute is treated like an unresolvable link
        # instead of raising KeyError.
        rawHref = atag.get("href")
        href = None
        if rawHref is not None:
            href = Utils.absoluteUrl(rawHref, muluUrl, None)
        title = atag.string

        if href is not None and title is not None:
            model.addChapter(str(href), str(title))
            added = True
        else:
            Log.W(" on getSection found invalid tag " + str(atag) +
                  ", href=" + str(href) + ",title=" + str(title))

    if not added:
        return None
    return model
def onDownloadSectionCompleted(self, idx, uniqueKey, succ):
    """Record the outcome of a single section download.

    Logs the result, bumps the matching counter on ``self`` and stores a
    status through ``self.chapterDb.setDownloaded``: ``1`` when ``succ``
    is truthy, ``2`` otherwise.

    Args:
        idx: Section index, used only in the log line.
        uniqueKey: Key passed to ``chapterDb.setDownloaded``.
        succ: Truthy when the download succeeded.
    """
    Log.I("[I] download section(%s) completed succ(%s)" % (idx, succ))

    if not succ:
        self.sectionDownloadFailedCount += 1
        self.chapterDb.setDownloaded(uniqueKey, 2)
        return

    self.sectionDownloadSuccCount += 1
    self.chapterDb.setDownloaded(uniqueKey, 1)
def executeSql(self, sql):
    """Run ``sql`` on ``self.cursor`` and report whether it succeeded.

    Any exception is caught. It is logged with ``Log.E``/``Log.Exc``
    unless it is a ``pymysql.err.IntegrityError`` whose first argument is
    ``1062``; that one case is swallowed silently.

    Args:
        sql: Statement text; ``strip()`` is applied for the log lines.

    Returns:
        ``True`` after a successful execute, ``False`` after any exception.
    """
    try:
        self.cursor.execute(sql)
        Log.I("[I] 执行 " + sql.strip() + " 成功")
        return True
    except Exception as err:
        quietIntegrityError = (isinstance(err, pymysql.err.IntegrityError)
                               and len(err.args) > 0
                               and err.args[0] == 1062)
        if not quietIntegrityError:
            Log.E("[I] 执行 " + sql.strip() + " 失败")
            Log.Exc(err)
        return False
def downloadOneSection(self, idx, oneSectionModel, toDir):
    """Download one section unless storage already holds it.

    ``self.storge.checkFileExists`` receives a callback taking the
    existence flag. When the flag is truthy the section is reported as
    completed successfully through ``onDownloadSectionCompleted``;
    otherwise ``_downloadOneSection`` is invoked.

    Args:
        idx: Section index, forwarded to the download/completion calls.
        oneSectionModel: Object exposing ``title``, ``downUrl`` and
            ``uniqueKey``.
        toDir: Directory identifier forwarded to storage and download.
    """
    Log.I("[I] downloading section(%s) (%s) (%s)" %
          (idx, oneSectionModel.title, oneSectionModel.downUrl))
    uniqueKey = oneSectionModel.uniqueKey
    url = oneSectionModel.downUrl

    def onChecked(exists):
        if exists:
            # Already stored: count it as a successful download.
            return self.onDownloadSectionCompleted(idx, uniqueKey, True)
        # Falls back to True when the download call returns a falsy value.
        return self._downloadOneSection(idx, url, uniqueKey, toDir) or True

    self.storge.checkFileExists(uniqueKey, toDir, onChecked)
def visit(self, url):
    """Crawl one page: collect its links and download any books found.

    Marks ``url`` as the page being visited, fetches it with
    ``Utils.soupUrl``, registers the links via ``addUrlsFromSoup``,
    filters book pages via ``addBookPageUrls`` and calls ``downloadBook``
    for each. Returns early (leaving the visiting marker untouched) when
    the page cannot be fetched or yields no valid link list.

    Args:
        url: Page address to crawl.
    """
    Log.I("[I] on visit() " + str(url))
    self.setVisitingUrl(url)

    pageSoup = Utils.soupUrl(url)
    if not pageSoup:
        Log.W("[W] on visit() soup is None " + str(url))
        return
    Log.I("[I] on visit() did get soup")

    # Store every url found on this page in the database.
    foundUrls = self.addUrlsFromSoup(pageSoup, url)
    if not Utils.isValidArr(foundUrls):
        Log.W("[W] on visit() urls not found")
        return

    # Pick out the urls that match book pages, then walk them.
    for bookPageUrl in self.addBookPageUrls(foundUrls):
        self.downloadBook(bookPageUrl)

    self.removeVisitUrl(url)
    Log.I("[I] on visit() finished " + str(url))
def execute(self):
    """Run the crawl, resuming interrupted work before continuing.

    Steps, in order:
      1. If ``visitingBookUrl()`` returns a URL, download that book.
      2. If ``visitingUrl()`` returns a URL, visit that page.
      3. Visit whatever ``nextVisitUrl()`` returns, then keep visiting
         until it returns ``None``.
      4. Download every URL from ``nextPageUrl()`` until it returns
         ``None``.
    """
    # A visiting url found in the database means a previous run was
    # interrupted, so its state can be restored from there.
    Log.V("[I] onExecute()")

    # Resume a book that was being downloaded.
    visitingBookUrl = self.visitingBookUrl()
    if visitingBookUrl is not None:
        self.downloadBook(visitingBookUrl)

    # Resume a page that was being visited. The original passed the not
    # yet assigned local ``visitUrl`` here, raising UnboundLocalError.
    visitingUrl = self.visitingUrl()
    if visitingUrl is not None:
        self.visit(visitingUrl)

    # Otherwise start searching from the root.
    # NOTE(review): this first visit is not guarded against None, as in
    # the original — confirm nextVisitUrl() always yields a url here.
    visitUrl = self.nextVisitUrl()
    self.visit(visitUrl)

    Log.I("[I] willVisitNext")
    # Process the remaining pages.
    visitUrl = self.nextVisitUrl()
    while visitUrl is not None:
        self.visit(visitUrl)
        visitUrl = self.nextVisitUrl()

    Log.I("[I] willVisitNextPageUrl")
    # Page traversal is complete; download the remaining book pages.
    bookUrl = self.nextPageUrl()
    while bookUrl is not None:
        self.downloadBook(bookUrl)
        bookUrl = self.nextPageUrl()

    Log.V("------parse finished------")
def chapterContent(self, chapterSoup):
    """Extract the chapter text from the element with id ``htmlContent``.

    Only direct children accepted by ``Utils.isSoupStr`` are used. Each
    is stripped; pieces containing ``全本小说`` are logged and dropped,
    empty pieces are dropped, and the rest are emitted one per line.

    Args:
        chapterSoup: Parsed chapter page supporting ``find(id=...)``.

    Returns:
        The kept pieces, each followed by a newline, joined into one
        string (empty string when nothing is kept), or ``None`` when the
        ``htmlContent`` element is absent.
    """
    container = chapterSoup.find(id="htmlContent")
    if container is None:
        return None

    lines = []
    for node in container.contents:
        if not Utils.isSoupStr(node):
            continue
        text = node.strip()
        if "全本小说" in text:
            Log.I("[W] ignore line " + str(text))
            continue
        if text:
            lines.append(text + "\n")
    return "".join(lines)
def bookInfo(self, bookPageSoup, bookMuluSoup):
    """Assemble a ``BookInfoModel`` from the book page and catalogue page.

    Three sources feed the model:
      * every ``div`` with class ``tLJ`` on the book page, handed to
        ``_checkTLJTag``;
      * the first capture of ``已写了(\\d+)字`` in the book page's text
        form, stored as ``wordsCount``;
      * ``meta`` tags on the catalogue page whose ``property`` attribute
        satisfies the ``og:`` pattern, handed to ``_checkMetaTag``.

    Args:
        bookPageSoup: Parsed book page.
        bookMuluSoup: Parsed catalogue page.

    Returns:
        The populated ``BookInfoModel``.
    """
    Log.I("[I] on get bookInfo ")
    model = BookInfoModel()

    # Inspect the tLJ tags on the book page.
    for tljTag in Utils.findAllClassTag(bookPageSoup, "div", "tLJ"):
        self._checkTLJTag(tljTag, model)

    # Word count.
    wordMatches = Utils.findAll(r"已写了(\d+)字", str(bookPageSoup))
    if wordMatches and len(wordMatches) > 0:
        model.wordsCount = wordMatches[0]

    def isOgMeta(t):
        # NOTE(review): the pattern ends with a literal double quote, so
        # the property value must contain one to match — confirm against
        # Utils.isMatch and the real pages.
        return (t.name == "meta" and t.has_attr("property")
                and Utils.isMatch("og:.+?\"", t["property"]) is not None)

    # Inspect the metadata on the catalogue page.
    for metaTag in bookMuluSoup.find_all(isOgMeta):
        self._checkMetaTag(metaTag, model)

    return model
def downloadBook(self, url):
    """Download one book (metadata, cover image, chapters) from ``url``.

    Flow, as far as the visible code shows:
      1. Return immediately for a ``None`` url or one that
         ``checkDownloadedBookUrl`` reports as done.
      2. Mark the url via ``setVisitingBookUrl``.
      3. Try stored data first: ``getBookinfo(url)`` and, if present,
         ``getChapter(bookId)``. Stored data with no chapters is
         discarded, but its ``bookId`` is remembered for reuse.
      4. Without usable stored data, fetch the book page and its
         catalogue page, parse ``bookInfo`` and ``sectionInfo``, then
         persist both with ``saveBookinfo`` / ``saveChapter``.
      5. Download the cover image and the sections when available.
      6. Clear ``self.chapterDb`` and remove the visiting marker.

    NOTE(review): SOURCE lost its indentation; the nesting below is a
    reconstruction and should be confirmed against the original file.

    Args:
        url: Book page address, or ``None``.
    """
    Log.V("[I] on begin downloadBook() " + str(url))
    if url == None:
        return
    # Check whether this book has already been downloaded
    if self.checkDownloadedBookUrl(url):
        return
    # Mark the url as visiting
    self.setVisitingBookUrl(url)
    # Try to obtain bookInfo
    bookInfo = self.getBookinfo(url)
    sectionInfo = None
    existsBookId = None
    if bookInfo != None:
        existsBookId = bookInfo.bookId
        Log.D(" downloadBook existsBookId = " + str(existsBookId))
        sectionInfo = self.getChapter(bookInfo.bookId)
        Log.D(" downloadBook sectionInfo = " + str(sectionInfo))
        if sectionInfo == None or sectionInfo.chapters == None or len(
                sectionInfo.chapters) == 0:
            # No stored chapters: drop both so the book is fetched again
            # below (existsBookId is kept for reuse).
            sectionInfo = None
            bookInfo = None
        else:
            sectionInfo.bookInfo = bookInfo
    if bookInfo == None:
        Log.I("[I] on downloadBook() will get soup " + url)
        bookSoup = Utils.soupUrl(url)
        Log.I("[I] on downloadBook() did get soup (%s) %s " %
              (str(bookSoup != None), str(url)))
        if bookSoup != None:
            Log.I("[I] on downloadBook() will get muluSoup")
            muluUrl = self.bookMuluUrl(bookSoup)
            if muluUrl != None:
                muluSoup = Utils.soupUrl(muluUrl)
                Log.I("[I] on downloadBook() did get muluSoup %s" %
                      (str(muluSoup != None)))
                if muluSoup != None:
                    bookInfo = self.bookInfo(bookSoup, muluSoup)
                    Log.I("[I] on downloadBook get bookInfo " +
                          str(bookInfo))
                    if bookInfo != None:
                        bookInfo.setUniqueKey()
                        bookInfo.downBookUrl = url
                        bookInfo.downMuluUrl = muluUrl
                        # bookId: reuse the stored one when available
                        if existsBookId == None:
                            Log.D(" downloadBook will create new bookId")
                            BookId.init(self.kvDb)
                            bookInfo.bookId = BookId.nextBookId()
                        else:
                            Log.D(" downloadBook use exists bookId")
                            bookInfo.bookId = existsBookId
                        # Obtain the chapter information
                        sectionInfo = self.sectionInfo(
                            bookInfo, muluUrl, muluSoup)
                        if sectionInfo != None:
                            Log.D(
                                " downloadBook parse sectionInfo success")
                            # Latest chapter
                            bookInfo.chapterCount = len(
                                sectionInfo.chapters)
                        else:
                            Log.D(
                                " downloadBook error cant parser sectionInfo"
                            )
                            bookInfo.status = BookInfoStatus.Error
                            # NOTE(review): assumed to belong to this
                            # error branch — confirm.
                            bookInfo.downloadStatus = BookDownloadStatus.Completed
                        Log.D(" downloadBook save BookInfo " +
                              str(bookInfo) + ", bookId = " +
                              str(bookInfo.bookId) + ", save chapters " + str(sectionInfo))
                        # Save bookInfo
                        self.saveBookinfo(bookInfo)
                        self.saveChapter(bookInfo.bookId, sectionInfo)
    # Download bookImg
    if bookInfo != None and bookInfo.bookImg != None:
        self.downloadBookImg(bookInfo.bookImg, bookInfo.uniqueKey)
    if sectionInfo != None:
        self.downloadSection(sectionInfo)
        # NOTE(review): assumed to run only when sections were
        # available — confirm.
        self.setDownloadedForBookUrl(url)
    self.chapterDb = None
    # Remove the visiting book url
    self.removeBookUrl(url)
    Log.V("on finished downloadBook() " + str(url))