def singleBehavior(postID: int, savedFileName: str, seeLZ=True, imageBed=True):
    """Download one post and write it to ``savedFileName`` as Markdown.

    Args:
        postID: Numeric ID of the post to download.
        savedFileName: Destination path. If a file already exists there the
            post is treated as already downloaded and nothing is fetched.
        seeLZ: Forwarded to ``api`` as ``seeLZ``.
        imageBed: Forwarded to ``contentToMarkdown`` as ``useImageBed``.

    Returns:
        ``True`` if the file already existed or was written by this call,
        ``False`` if ``getInfo`` raised ``FileNotFoundError`` or
        ``SystemExit``.
    """
    Avalon.debug_info('创建任务,ID:%d' % postID)

    # Skip posts that were saved by an earlier run.
    if os.path.isfile(savedFileName):
        Avalon.time_info('帖子已经下载并保存,跳过[ID:%d]' % postID)
        return True

    post = api(postID=postID, seeLZ=seeLZ)
    try:
        postInfo = post.getInfo()
    except (FileNotFoundError, SystemExit):
        # Both failure modes are reported to the caller the same way.
        return False

    Avalon.time_info(
        '开始任务:"%s"(作者:%s)[ID:%d]'
        % (postInfo['Title'], postInfo['Author'], postID),
        highlight=True)
    Avalon.debug_info(postInfo)

    markdownPages = []
    for pageNumber in range(1, postInfo['TotalPage'] + 1):
        Avalon.time_info(
            '开始第%d页,共%d页' % (pageNumber, postInfo['TotalPage']))
        try:
            pageHTMLContent = post.getContent(pageNumber)
            pageMarkdownContent = post.contentToMarkdown(
                pageHTMLContent, useImageBed=imageBed)
        except KeyboardInterrupt:
            Avalon.critical('用户强制退出')
            quit(1)
        except SystemExit:
            # A page that aborted with SystemExit is left out of the output;
            # the remaining pages are still processed.
            pass
        else:
            markdownPages.append(pageMarkdownContent)

    post.saveToFile(savedFileName, ''.join(markdownPages))
    return True
def getPost(self, pageNumber: int, ajax: bool = True, useTemp: bool = True):
    """Return the HTML source of one page of the post.

    A copy already present in the temp store is returned without touching
    the network when ``useTemp`` is true. Otherwise the page is requested
    up to 10 times; every failed attempt is logged as a warning.

    Args:
        pageNumber: Page to fetch; appended to the post link.
        ajax: When ``False``, ``'ajax=1&'`` is removed from the link.
        useTemp: When true, look the page up in the temp store first and
            save a freshly downloaded page back into it.

    Returns:
        The page source. A fresh download is decoded with
        ``errors='ignore'``; a cached page is whatever
        ``readFileByID`` returns.

    Raises:
        SystemExit: via ``quit(1)`` after 10 failed attempts or when the
            user interrupts the download.
    """
    self.__workPageNumber = pageNumber
    link = self.__postLink + str(pageNumber)

    # Serve the page from the temp store when it has been fetched before.
    existTemp = self.__tempSave.getSameTemp()
    if existTemp.get('html') and useTemp:
        for i in existTemp['html']:
            if int(i[1]) == int(pageNumber):
                Avalon.debug_info('第%d页已经在临时文件中存在,跳过' % pageNumber)
                return self.__tempSave.readFileByID(i)

    if ajax is False:
        link = link.replace('ajax=1&', '')

    for tryTimes in range(1, 11):
        try:
            postRequest = request.Request(link)
            # Set request headers to look like a browser (the original
            # author was unsure whether this is actually necessary).
            postRequest.add_header(
                'User-Agent',
                random.choice(self.__userAgent).replace('\n', ''))
            postRequest.add_header('Referer', 'https://tieba.baidu.com')
            # The context manager closes the connection even if read() fails.
            with request.urlopen(postRequest, timeout=5) as response:
                postRead: bytes = response.read()
            if self.debug:
                Avalon.debug_info(
                    '链接:"%s"请求头:%s.' % (link, postRequest.headers))
        # Error handling: log the failure and fall through to the next try.
        except error.URLError as e:
            Avalon.warning("获取帖子正文失败!原因:%s(%s/10)" %
                           (str(e.reason), str(tryTimes)))
        except timeout as e:
            Avalon.warning("获取帖子正文失败!原因:%s(%s/10)" %
                           (str(e), str(tryTimes)))
        except KeyboardInterrupt:
            Avalon.critical("用户强制退出")
            quit(1)
        except Exception:
            # Anything else (including a failure while building the request
            # headers) counts as one failed attempt instead of being
            # swallowed silently.
            Avalon.warning("获取帖子正文失败!原因:未知错误(%s/10)" % tryTimes)
        # No error: stop retrying.
        else:
            if self.debug:
                Avalon.debug_info('Link %s Get Successed.' % link)
            break
    else:
        # Every attempt failed.
        Avalon.error('获取失败!')
        if self.debug:
            Avalon.debug('Link:%s' % link)
        quit(1)

    # Decode once and reuse the text for both the temp store and the caller.
    pageText = postRead.decode(errors='ignore')
    if useTemp is True:
        self.__tempSave.savePostRaw(pageText, pageNumber=pageNumber)
    return pageText
def proccessPost(self, raw: str, useTemp: bool = True):
    """Convert the HTML source of one page into the post-info dict.

    Every child ``div`` of the ``p_postlist`` container that carries a
    ``data-field`` attribute and a content node is turned into a dict with
    the keys ``'floor'``, ``'author'`` and ``'text'``.

    Args:
        raw: HTML source of the page.
        useTemp: When ``True``, the result is also written to the temp
            store under the page number recorded by the last ``getPost``.

    Returns:
        ``self.postInfo`` with its ``'Data'`` key set to the list of floor
        dicts for this page.

    Raises:
        SystemExit: via ``quit(1)`` when no floor element is found.
    """
    postEtree = etree.HTML(raw)
    theradList = postEtree.xpath('//div[@class="p_postlist"]/div[@class]')
    if not theradList:
        Avalon.critical('程序无法正确获得文章内容')
        quit(1)

    finalList = []
    if self.debug:
        Avalon.debug_info('帖子内容获取成功')

    for perFloor in theradList:
        # Evaluate the attribute lookup once and reuse it below.
        dataField = perFloor.xpath('./@data-field')
        if not dataField:
            if self.debug:
                Avalon.debug_info(
                    '因为不存在"data-field"属性,跳过对象"%s"' % str(perFloor))
            continue

        # The attribute holds a JSON document describing the floor.
        floorData = json.loads(dataField[0])
        floorNum = floorData['content']['post_no']
        author = floorData['author']['user_name']

        text = perFloor.xpath('.//cc//div[@id]')
        if not text:
            # Floors without a content node are skipped.
            continue
        final_text = html.unescape(etree.tostring(text[0]).decode())

        if self.debug:
            Avalon.debug_info('%s - %s' % (floorNum, author))
        finalList.append({
            'floor': int(floorNum),
            'author': author,
            'text': final_text,
        })

    # NOTE(review): this is the shared self.postInfo dict, not a copy, so a
    # dict returned by an earlier call sees its 'Data' replaced by this one.
    # Left as-is because callers may read self.postInfo['Data'] directly.
    postFullInfo = self.postInfo
    postFullInfo['Data'] = finalList
    if useTemp is True:
        self.__tempSave.saveJson(postFullInfo, self.__workPageNumber)
    return postFullInfo
def multiThreadGetMain(self, threadNumber: int = 8):
    """Fetch every page of the post with a pool of worker threads.

    Page 1 is fetched on the calling thread first; the total page count is
    read from its stored record, then every page number from 1 to that
    total is queued for the workers.

    Args:
        threadNumber: Number of worker threads, also used as the maximum
            size of the work queue.

    Raises:
        SystemExit: via ``quit(1)`` when no record for page 1 is available
            after fetching it.
    """
    workQueue = queue.Queue(threadNumber)
    stopEvent = threading.Event()
    threadList = []

    def mainFloorThread():
        # Worker loop: take page numbers until asked to stop.
        while not stopEvent.is_set():
            try:
                # Non-blocking get: check-and-take in one thread-safe step,
                # so a worker can never block forever on an empty queue.
                pageNumber = workQueue.get_nowait()
            except queue.Empty:
                # Idle for up to a second, waking early on shutdown.
                stopEvent.wait(1)
                continue
            self.__getPostBehavior(pageNumber)

    for i in range(threadNumber):
        newThread = threading.Thread(
            target=mainFloorThread, name='PostThread #%s' % i)
        newThread.start()
        threadList.append(newThread)

    try:
        self.__getPostBehavior(1)
        dbRead = self.__db.checkExistPage(1)[1]
        if not dbRead:
            Avalon.critical('Can\'t Get Page 1,Program Exit!')
            quit(1)
        totalPages = int(json.loads(dbRead)['page']['total_page'])
        for pageNumber in range(1, totalPages + 1):
            # Blocks while the queue is full; the workers drain it.
            workQueue.put(pageNumber)
        while not workQueue.empty():
            time.sleep(1)
    finally:
        # Always stop and join the workers. They are non-daemon threads, so
        # leaving them running after quit(1) or an exception would keep the
        # process alive forever.
        stopEvent.set()
        for worker in threadList:
            worker.join()
    Avalon.info('[%s]Get All Pages Success' % self.__tid)
def getPostInfo(self: None):
    """Fetch page 1 and extract the post's author, title and page count.

    The page is requested with ``ajax=False`` and ``useTemp=False``. The
    result is stored on ``self.postInfo`` and returned.

    Returns:
        A dict with the keys ``'Author'`` (str), ``'Title'`` (str) and
        ``'TotalPage'`` (int).

    Raises:
        FileNotFoundError: when the page body carries the ``page404`` class.
        SystemExit: via ``quit(1)`` when any of the three fields is missing.
    """
    firstPageSource = self.getPost(pageNumber=1, ajax=False, useTemp=False)
    tree = etree.HTML(firstPageSource)

    # A body with the "page404" class is reported as a deleted post.
    if tree.xpath('//body[@class="page404"]'):
        Avalon.error('此贴已经被删除!')
        raise FileNotFoundError

    titles = tree.xpath('//div[@class="wrap2"]//h3/@title')
    authors = tree.xpath(
        '//div[@class="p_postlist"]/div[@class][1]//div/@author')
    pageCounts = tree.xpath(
        '//div/div[@id]//div[@id]//li/span[@class="red"][last()]/text()')

    # Every field must have matched at least once.
    if not titles or not authors or not pageCounts:
        Avalon.critical('程序无法正确获得帖子信息')
        quit(1)

    finalInfo = {
        'Author': str(authors[0]),
        'Title': str(titles[0]),
        'TotalPage': int(pageCounts[0]),
    }
    self.postInfo = finalInfo
    if self.debug:
        Avalon.debug_info(self.postInfo)
    return finalInfo
def __bytes__(self):
    """Return the form as multipart bytes, to be POSTed via ``data``.

    Each entry of ``self.form_fields`` and ``self.files`` is written as one
    part introduced by ``--<boundary>``; the body ends with the closing
    ``--<boundary>--`` line.
    """
    out = io.BytesIO()
    delimiter = b'--' + self.boundary + b'\r\n'
    crlf = b'\r\n'

    # Plain form fields: the value is encoded with str.encode() defaults.
    for field_name, field_value in self.form_fields:
        out.writelines((
            delimiter,
            self._form_data(field_name),
            crlf,
            field_value.encode(),
            crlf,
        ))

    # Attached files: the body is written as-is.
    for field_name, file_name, content_type, payload in self.files:
        out.writelines((
            delimiter,
            self._attached_file(field_name, file_name),
            self._content_type(content_type),
            crlf,
            payload,
            crlf,
        ))

    # Closing boundary.
    out.write(b'--' + self.boundary + b'--\r\n')
    return out.getvalue()


# This module is not an entry point; refuse to run it directly.
if __name__ == "__main__":
    Avalon.critical('模块非法调用!请运行Main.py!')
    quit(1)
else: break Avalon.debug_info('程序已经启动...正在获取帖子信息') post = api(postID=postID, seeLZ=onlySeeLZ, debug=GENERAL_DEBUG_MODE) postInfo = post.getInfo() Avalon.time_info('开始任务:"%s"(作者:%s)' % (postInfo['Title'], postInfo['Author']), highlight=True) for i in range(1, postInfo['TotalPage'] + 1): Avalon.time_info('开始第%d页,共%d页' % (i, postInfo['TotalPage'])) try: pageHTMLContent = post.getContent(i) pageMarkdownContent = post.contentToMarkdown(pageHTMLContent, useImageBed=USE_IMAGE_BED) except KeyboardInterrupt: Avalon.critical('用户强制退出') quit(1) else: post.saveToFile(fileName, pageMarkdownContent) """ markdown = markdown() image = image(debug=GENERAL_DEBUG_MODE) fullBehavior(postID=postID,fileName=fileName,onlySeeLZ=onlySeeLZ) def fullBehavior(postID,fileName,onlySeeLZ=True): posts = spider(postID=int(postID), seeLZ=onlySeeLZ) info = posts.getPostInfo() Avalon.info('帖子标题:%s,帖子作者%s.' % (info['title'], info['author']))