def updateNovel(self):
    """Download and parse the index page of every configured novel.

    For each item returned by ``getNovelListFromConfig`` whose base URL
    contains ``systemCode.baseUrl``, the index page is downloaded and saved
    (``getNovel``), its summary is recorded (``analysisNovelInfo``), and the
    saved page is read back and handed to ``BiXiaWenXueParse``.
    Items from any other site are logged as unsupported and skipped.
    """
    novels = self.getNovelListFromConfig()
    Log.info("updateNovel " + str(novels))
    for novel in novels:
        if systemCode.baseUrl not in novel.baseUrl():
            # str() is required here: concatenating the item object itself
            # to a string raises TypeError instead of logging the problem.
            Log.error("now unsupport this network " + str(novel))
            continue

        # Supported site: fetch the index page and record the novel summary.
        print(u"download source page ")
        Log.info(u"download source page ")
        self.analysisNovelInfo(self.getNovel(novel), novel)

        print(u"parse source page ")
        Log.info(u"parse source page ")
        novelUrl = novel.baseUrl()
        fileName = novel.fileName()
        localFolder = novel.localFolder()
        novelNo = novel.novelNo()

        # Parse the copy that getNovel just wrote to disk.
        fileTools = FileTools(localFolder + u'/' + fileName)
        content = fileTools.readFile()
        bixiaParse = BiXiaWenXueParse(content)
        bixiaParse.setUrl(novelUrl)
        bixiaParse.setLocalFolder(localFolder)
        bixiaParse.setNovelNo(novelNo)
        bixiaParse.parse()
    Log.info("all over %s" % (ctime()))
def analysisChapterInfo(self, no='', title='', chapterSourceUrl=''):
    """Append one chapter record to this novel's chapter-info file.

    The record is ``no``, ``title`` and ``chapterSourceUrl`` joined with
    ``systemCode.fileContentSplit`` and terminated by CRLF. It is appended
    to ``systemCode.oneNovelAllChaptersInfoFile`` under ``self.localFolder``.
    """
    Log.info("analysisChapterInfo start")
    infoPath = self.localFolder + '/' + systemCode.oneNovelAllChaptersInfoFile
    writer = FileTools(infoPath)
    separator = systemCode.fileContentSplit
    record = separator.join((no, title, chapterSourceUrl)) + '\r\n'
    writer.fileWriteAppend(record)
    Log.info("analysisChapterInfo end")
def getChapter(self, novelNo, chapterNo, chapterTitle):
    """Return the stored text of one chapter as a dict.

    The chapter is read from
    ``<baseFolder>/SourceUrlFile/<novelNo>/<chapterNo><chapterTitle>.n``,
    wrapped in ``ResponseChapterContent`` and converted with
    ``ObjectJson.convert_to_dict``.
    """
    Log.info("getChapter novelNo [ %s ] chapterNo [ %s ] "
             " chapterTitle [ %s ] " % (novelNo, chapterNo, chapterTitle))
    novelFolder = systemCode.baseFolder + u'/SourceUrlFile/' + novelNo
    chapterPath = novelFolder + u'/' + chapterNo + chapterTitle + u'.n'
    chapterText = FileTools(chapterPath).readFile()
    return ObjectJson.convert_to_dict(ResponseChapterContent(chapterText))
def parse(self):
    """Walk the novel index page and download every listed chapter.

    ``self.content`` is decoded as UTF-8 and split on CRLF. A line is a
    chapter row when it contains ``<dd>`` and neither of the two notice
    keywords. Two row layouts are handled:

    * rows containing the ordinal marker, e.g.
      ``<dd> <a style="" href="/27_27047/2325047.html">第七百七十五章 专打老天才</a></dd>``
      -- the anchor text is split at the first chapter marker into number
      and title;
    * plain rows, e.g.
      ``<dd> <a style="" href="/167_167729/8536701.html">1 入职</a></dd>``
      -- the anchor text must be exactly ``"<no> <title>"``.

    Each recognised row is passed to ``writeToFile``; every row that could
    be inspected is also passed to ``annalysisChapterSourceInfo``.
    Malformed rows are logged and skipped instead of raising IndexError
    and aborting the whole index.
    """
    contentList = self.content.decode('utf-8').split('\r\n')
    for raw in contentList:
        isChapterRow = ('<dd>' in raw and u'月票' not in raw
                        and u'推迟' not in raw)
        if not isChapterRow:
            Log.error(" parse chapter info is unsupport !")
            self.annalysisChapterSourceInfo(raw)
            continue

        numbered = u'第' in raw
        if numbered:
            # Fix a common typo of the chapter marker in the source site.
            raw = raw.replace(u'掌', u'章')
        # Normalise punctuation that must not end up in number/title.
        raw = raw.replace(u':', u' ')
        raw = raw.replace(u'?', u'')

        anchor = self._splitChapterAnchor(raw)
        if anchor is None:
            Log.error("parse chapter[ %s ] has no href or anchor text !" % (raw))
            continue
        link, text = anchor

        if numbered:
            # Split only at the FIRST marker so a title that itself contains
            # the marker character is kept whole.
            no, marker, title = text.replace(' ', '').partition(u'章')
            if marker:
                self.writeToFile(link, no + marker, title)
            else:
                Log.error("parse chapter[ %s ] has no chapter marker !" % (raw))
        else:
            noAndTitle = text.split(' ')
            if len(noAndTitle) == 2:
                self.writeToFile(link, noAndTitle[0], noAndTitle[1])
            else:
                Log.error("parse chapter[ %s ]no title except is 2 !" % (raw))

        self.annalysisChapterSourceInfo(raw)


def _splitChapterAnchor(self, raw):
    """Return ``(link, anchor_text)`` from a ``<dd><a href="..">..</a>`` row, or None if malformed."""
    hrefParts = raw.split('href="')
    if len(hrefParts) < 2:
        return None
    linkAndRest = hrefParts[1].split('">')
    if len(linkAndRest) < 2:
        return None
    return linkAndRest[0], linkAndRest[1].split('</a>')[0]
def getNovel(self, novel):
    """Download a novel's index page, save it locally and return it.

    The page at ``novel.novelUrl()`` is fetched with ``UrlTools`` and
    written to ``<novel.localFolder()>/<novel.fileName()>``. The local
    folder is created first when it does not exist.

    Returns the downloaded page content as returned by
    ``UrlTools.getUrlContent``.
    """
    url = novel.novelUrl()
    fileName = novel.fileName()
    localFolder = novel.localFolder()
    print(localFolder)
    Log.info("getNovel local [ %s ] fileName [ %s ] novelUrl [ %s ] " % (localFolder, fileName, url))
    if not os.path.exists(localFolder):
        # makedirs also creates missing parent folders; plain mkdir raises
        # OSError when an intermediate directory is absent.
        os.makedirs(localFolder)
    urlTools = UrlTools(url)
    header, content = urlTools.getUrlContent()
    fileTools = FileTools(localFolder + u'/' + fileName)
    fileTools.writeNewFile(content)
    return content
def queryUserReadChapter(self, novelNo, id):
    """Return the chapters of ``novelNo`` that user ``id`` has read.

    ``systemCode.userReadNovelFile`` is read and split on CRLF; each row is
    expected to hold four fields (user id, novel number, chapter number,
    chapter title) separated by ``systemCode.fileContentSplit``. Rows whose
    first two fields equal ``id`` and ``novelNo`` are wrapped in
    ``ResponseReadChapter``.

    Returns the matches converted with ``ObjectJson.convert_to_dicts``.
    """
    Log.info("userGetChapter novelNo [ %s ] user id [%s]" % (novelNo, id))
    fileTools = FileTools(systemCode.userReadNovelFile)
    readInfo = fileTools.readFile()
    split = systemCode.fileContentSplit
    result = []
    for raw in readInfo.split('\r\n'):
        fields = raw.split(split)
        if len(fields) != 4:
            Log.error("[ %s ] len is not 4 is [%s]" % (raw, str(len(fields))))
            continue
        # Row values get their own names: reusing ``id``/``novelNo`` here
        # would overwrite the arguments and make the filter always true.
        rowId, rowNovelNo, chapterNo, chapterTitle = fields
        if rowId == id and rowNovelNo == novelNo:
            Log.info("[ %s ] read [%s]" % (rowId, rowNovelNo))
            result.append(ResponseReadChapter(rowId, rowNovelNo, chapterNo, chapterTitle))
        else:
            Log.info("[ %s ] not read [%s]" % (rowId, rowNovelNo))
    return ObjectJson.convert_to_dicts(result)
def userGetChapter(self, novelNo, chapterNo, chapterTitle, id):
    """Return one chapter's text as a dict and record that the user read it.

    The chapter is read from
    ``<baseFolder>/SourceUrlFile/<novelNo>/<chapterNo><chapterTitle>.n``.
    A record made of ``id``, ``novelNo``, ``chapterNo`` and ``chapterTitle``
    joined by ``systemCode.fileContentSplit`` is appended to
    ``systemCode.userReadNovelFile`` unless that text already occurs there.
    """
    Log.info("userGetChapter novelNo [ %s ] chapterNo [ %s ] "
             " chapterTitle [ %s ] user id [%s]" % (novelNo, chapterNo, chapterTitle, id))
    novelFolder = systemCode.baseFolder + u'/SourceUrlFile/' + novelNo
    chapterPath = novelFolder + u'/' + chapterNo + chapterTitle + u'.n'
    chapterText = FileTools(chapterPath).readFile()

    separator = systemCode.fileContentSplit
    readRecord = separator.join((id, novelNo, chapterNo, chapterTitle))

    # Substring test against the whole history file, then append if absent.
    history = FileTools(systemCode.userReadNovelFile).readFile()
    if readRecord not in history:
        FileTools(systemCode.userReadNovelFile).fileWriteAppend(readRecord)

    return ObjectJson.convert_to_dict(ResponseChapterContent(chapterText))
def writeToFile(self, link='', no='', title=''): url = self.url + link url = url.replace('//', '/') url = url.replace(':/', '://') fileName = self.localFolder + '/' + no + title + ".n" if not os.path.exists(fileName): # chapter not exist is need down # content is source page try: chapterPage = self.downLoad(url) except Exception, e: print e.message Log.error('one error download url ' + url + ' error info ' + e.message) sleep(30) try: chapterPage = self.downLoad(url) except Exception, e: sleep(120) Log.error('two error download url ' + url + ' error info ' + e.message) chapterPage = self.downLoad(url)
def getNovelListFromConfig(self):
    """Build the list of novels to download from the config file.

    ``systemCode.downloadNovelsInfoFile`` is read and split on CRLF; every
    non-empty line is taken as a novel number and turned into a
    ``DownLoadNovelItem(baseUrl, baseUrl + '/' + number + '/')``.

    Returns a (possibly empty) list of ``DownLoadNovelItem``.
    """
    Log.info("getNovelListFromConfig")
    fileTools = FileTools(systemCode.downloadNovelsInfoFile)
    allNovelNos = fileTools.readFile()
    Log.info("download file info " + allNovelNos)
    novels = []

    # Treat an empty or whitespace-only file as "nothing configured".
    if not allNovelNos.strip():
        # NOTE(review): the ``waring`` spelling is kept as-is -- confirm that
        # Log really exposes this method name.
        Log.waring("DownInfo is null!")
        return novels

    baseUrl = systemCode.baseUrl
    # Iterate the lines actually read from the config file; the parsed list
    # must not be replaced by an unrelated (and here undefined) name.
    for raw in allNovelNos.split('\r\n'):
        if raw == "":
            Log.error("DownInfo raw is null")
            continue
        novelUrl = baseUrl + u'/' + raw + u'/'
        novels.append(DownLoadNovelItem(baseUrl, novelUrl))
        Log.info("DownInfo is baseUrl[ %s ] novelUrl[ %s ] " % (baseUrl, novelUrl))
    return novels
def annalysisChapterSourceInfo(self, raw):
    """Record the source URL and title of one chapter row.

    ``raw`` is an index-page line such as
    ``<dd> <a style="" href="/167_167729/8536723.html">23 樱花的忍者</a></dd>``.
    Lines without ``<dd> <a`` are ignored. Otherwise the absolute URL
    (``self.url`` + link, with doubled slashes collapsed) and the title are
    joined by ``systemCode.fileContentSplit`` and appended to the novel's
    ``systemCode.oneNovelAllChaptersSourceInfo`` file unless that text is
    already present in it.
    """
    Log.info("annalysisChapterSourceInfo [%s] " % (raw))
    if '<dd> <a' not in raw:
        return

    # Everything after href=" is: <link>"><title></a></dd>
    afterHref = raw.split('href="')[1]
    anchorParts = afterHref.split('">')
    link = anchorParts[0]
    titleName = anchorParts[1].replace('</a></dd>', '')

    # Collapse every '//' and then restore the one after the scheme.
    chapterSourceUrl = (self.url + link).replace('//', '/').replace(':/', '://')
    chapterSourceInfo = chapterSourceUrl + systemCode.fileContentSplit + titleName
    Log.info(chapterSourceInfo)

    sourceInfoPath = (systemCode.baseFolder + u'/SourceUrlFile/' + self.novelNo
                      + u'/' + systemCode.oneNovelAllChaptersSourceInfo)
    knownSources = FileTools(sourceInfoPath).readFile()
    if chapterSourceInfo in knownSources:
        Log.info("annalysisChapterSourceInfo [%s] is already exist!" % (chapterSourceInfo))
    else:
        FileTools(sourceInfoPath).fileWriteAppend(chapterSourceInfo)
def getChapterList(self, novelNo):
    """Return the recorded chapters of ``novelNo`` as a list of dicts.

    The novel's ``systemCode.oneNovelAllChaptersInfoFile`` is read and split
    on CRLF. Each line containing ``#`` is split on
    ``systemCode.fileContentSplit``; lines with two or three fields become a
    ``ResponseChapter``. Other non-blank lines are logged and skipped.

    Returns the chapters converted with ``ObjectJson.convert_to_dicts``
    (an empty collection when the file is empty).
    """
    fileTools = FileTools(systemCode.baseFolder + u'/SourceUrlFile/' + novelNo
                          + u'/' + systemCode.oneNovelAllChaptersInfoFile)
    content = fileTools.readFile()
    chapters = []
    if content == "":
        # The placeholder used to be logged unfilled; name the novel.
        Log.error("getChapterList content %s NULL" % (novelNo))
        return ObjectJson.convert_to_dicts(chapters)

    for raw in content.split('\r\n'):
        if raw == "":
            # Records are stored CRLF-terminated, so the split always yields
            # a trailing blank entry; it is not an error.
            continue
        if '#' not in raw:
            Log.error("getChapterList content %s is error" % (raw))
            continue
        chapterInfoList = raw.split(systemCode.fileContentSplit)
        if len(chapterInfoList) in (2, 3):
            chapters.append(ResponseChapter(*chapterInfoList))
        else:
            Log.error("getChapterList novel %s len is not 2" % (raw))
    return ObjectJson.convert_to_dicts(chapters)
def analysisNovelInfo(self, content, novel):
    """Extract a novel's summary from its index page and record it once.

    ``content`` is decoded as UTF-8 and scanned line by line for the title
    (``<h1>``), author, cover image, last-update time and latest chapter.
    The seven fields (novel number, name, url, author, image url, update
    time, latest chapter) are joined by ``systemCode.fileContentSplit`` and
    appended to ``systemCode.allNovelsNameInfoFile`` unless the novel
    number already occurs in that file.
    """
    Log.info("analysisNovelInfo start")
    split = str(systemCode.fileContentSplit)
    novelNo = novel.novelNo()
    url = novel.novelUrl()
    name = author = imageurl = lashUpdateTime = lastUpdateChapter = ''

    # The checks are independent: one line may satisfy several of them, and
    # a later matching line overrides an earlier one.
    for raw in content.decode('utf-8').split('\r\n'):
        if '<h1>' in raw:
            name = raw.split('<h1>')[1].replace('</h1>', '')
        if '<p>作 者:' in raw:
            author = raw.split('者:')[1].replace('</p>', '')
        if '<img alt' in raw:
            imageurl = systemCode.baseUrl + raw.split('src="')[1].split('" width')[0]
        if '<p>最后更新:' in raw:
            lashUpdateTime = raw.split('更新:')[1].replace('</p>', '')
        if ' <p>最新更新:' in raw:
            lastUpdateChapter = raw.split('">')[1].replace('</a></p>', '')

    fields = (novelNo, name, url, author, imageurl, lashUpdateTime, lastUpdateChapter)
    novelinfo = split.join([str(field) for field in fields])
    Log.info(novelinfo)

    infoPath = systemCode.baseFolder + u'/SourceUrlFile/' + systemCode.allNovelsNameInfoFile
    allNovels = FileTools(infoPath).readFile()
    if novelNo in allNovels:
        Log.info("analysisNovelInfo [%s] is already exist!" % (novelinfo))
    else:
        FileTools(infoPath).fileWriteAppend(novelinfo)
    Log.info("analysisNovelInfo end")
def addOneNovel(self, novelNo):
    """Register ``novelNo`` for download if its page belongs to the site.

    The page ``<baseUrl>/<novelNo>/`` is fetched; when it contains the site
    name, ``novelNo`` is appended (CRLF-terminated) to
    ``systemCode.downloadNovelsInfoFile`` unless already present there.

    Returns True when the page was recognised, False otherwise.
    """
    Log.info("addOneNovel novelNo [ %s ] " % (novelNo))
    url = systemCode.baseUrl + u'/' + novelNo + '/'
    header, pageContent = UrlTools(url).getUrlContent()

    if '笔下文学' not in pageContent:
        Log.info(
            "addOneNovel novelNo [ %s ] failed maybe not 笔下文学 or novelNo is error" % (novelNo))
        return False

    # NOTE(review): success is reported even when the number was already
    # registered -- confirm this matches the intended nesting.
    registered = FileTools(systemCode.downloadNovelsInfoFile).readFile()
    if novelNo not in registered:
        FileTools(systemCode.downloadNovelsInfoFile).fileWriteAppend(novelNo + u'\r\n')
    Log.info("addOneNovel novelNo [ %s ] success " % (novelNo))
    return True
def getNovels(self):
    """Return every recorded novel summary as a list of dicts.

    ``systemCode.allNovelsNameInfoFile`` is read and split on CRLF. Each
    line containing ``#`` is split on ``systemCode.fileContentSplit``; lines
    with exactly seven fields become a ``ResponseNovel``. Other non-blank
    lines are logged and skipped.

    Returns the novels converted with ``ObjectJson.convert_to_dicts``
    (an empty collection when the file is empty).
    """
    infoPath = systemCode.baseFolder + u'/SourceUrlFile/' + systemCode.allNovelsNameInfoFile
    content = FileTools(infoPath).readFile()
    novels = []
    if content == "":
        # The placeholder used to be logged unfilled; name the file.
        Log.error("getNovels content %s NULL" % (infoPath))
        return ObjectJson.convert_to_dicts(novels)

    for raw in content.split('\r\n'):
        if raw == "":
            # A blank entry (e.g. after a trailing CRLF) is not an error.
            continue
        if '#' not in raw:
            Log.error("getNovels content %s is error" % (raw))
            continue
        novelInfoList = raw.split(systemCode.fileContentSplit)
        if len(novelInfoList) == 7:
            novels.append(ResponseNovel(*novelInfoList))
        else:
            Log.error("getNovels novel %s len is not 7" % (raw))
    return ObjectJson.convert_to_dicts(novels)
def downLoad(self, url):
    """Fetch ``url`` with ``UrlTools`` and return the page content.

    The header part of the ``getUrlContent`` result is discarded.
    """
    print("down load url ", url)
    Log.info('down load url ' + url)
    header, body = UrlTools(url).getUrlContent()
    return body
' error info ' + e.message) chapterPage = self.downLoad(url) self.analysisChapterInfo(no, title, url) fileTools = FileTools(fileName) # fileTools.writeNewFile(chapterPage) # chapter content contains others info #replace chapterPageList = chapterPage.decode('utf-8').split( '\r\n') # analysis the chapter content for index, raw in enumerate(chapterPageList): if '<br/>' in raw: # fileTools.writeNewFile(bytes(raw, encoding='utf-8')) python 3.5 raw = raw.decode('utf-8') fileTools.writeNewFile(bytes(str(raw))) else: print(fileName, "is already download!") Log.info(fileName + "is already download!") def parse(self): contentList = self.content.decode('utf-8').split('\r\n') for index, raw in enumerate(contentList): if '<dd>' in raw and u'月票' not in raw and u'推迟' not in raw and u'第' in raw: raw = raw.replace(u'掌', u'章') #修改错别字 raw = raw.replace(u':', u' ') #修改错别字 raw = raw.replace(u'?', u'') #修改错别字 ##<dd> <a style="" href="/27_27047/2325047.html">第七百七十五章 专打老天才</a></dd> to #/27_27047/2325047.html">第七百七十五章 专打老天才</a></dd> chapterList = raw.split('href="') ##/27_27047/2325047.html">第七百七十五章 专打老天才</a></dd> to #/27_27047/2325169.html # 第七百七十六章 都送上路</a></dd> tmpList = chapterList[1].split('">')