import re
from bs4 import BeautifulSoup


def parseReplys(reply_area, comment, user_id):
    # Every reply carries an <abbr> timestamp; three levels up sits the
    # container whose id identifies the reply.
    abbrs = reply_area.findAll("abbr")
    for abbr in abbrs:
        tag_withID = abbr.parent.parent.parent
        reply = Entity()
        reply.parent = comment
        reply.postId = toMD5('{}_{}'.format(comment.postId, tag_withID.get("id")))
        reply.rid = comment.parent.postId
        h3_authorName = tag_withID.find("h3")
        reply.authorName = h3_authorName.getText()
        reply.content = h3_authorName.next_sibling.getText()
        if reply.content == "":
            reply.content = "圖片"  # image-only reply; "圖片" means "image"
        reply.articleDate = speculateArticlePostDate(
            tag_withID.find("abbr").getText())
        react_like_tag = tag_withID.find("abbr").parent.find(
            "span", {'id': re.compile('^like_.*')})
        ufi_url = react_like_tag.find(
            "a", {"href": re.compile('^/ufi/reaction/profile/browser/.*')})
        if ufi_url:
            setReplyAttr(reply, generateMFBUrl(ufi_url.get("href")))
        print('Saving reply: ', reply.toMap())
    # Follow the "more replies" link and recurse until the thread is exhausted.
    more = reply_area.find("div", {"id": re.compile("^comment_replies_more_1.*")})
    if more:
        more_url = generateMFBUrl(more.a.get("href"))
        resp = sendRequest(more_url)
        soup = BeautifulSoup(resp.text, features='lxml')
        main_comment = soup.find("div", id=user_id)
        reply_area = main_comment.next_sibling
        parseReplys(reply_area, comment, user_id)
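# Entity is referenced throughout but not defined in this section. A minimal
# sketch of the container the parsers appear to assume (plain attribute
# fields plus setAttr/getAttr/toMap); the real class may differ:
class Entity:
    def __init__(self):
        self.parent = None       # enclosing article/comment Entity
        self.postId = None       # MD5-based unique id
        self.rid = None          # postId of the thread this entry hangs under
        self.url = None
        self.title = None
        self.authorName = None
        self.content = None
        self.articleDate = None
        self._attrs = {}         # out-of-band values such as the parsed soup

    def setAttr(self, key, value):
        self._attrs[key] = value

    def getAttr(self, key):
        return self._attrs.get(key)

    def toMap(self):
        # Flatten the public fields for logging / persistence.
        return {k: v for k, v in vars(self).items() if k != '_attrs'}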
def parseArticle(url):
    resp = sendRequest(url)
    soup = BeautifulSoup(resp.text, features='lxml')
    article = Entity()
    # Normalize the article URL: keep everything up to "refid=17",
    # minus the final character.
    article.url = str(re.search("https://.*refid=17", url).group()[0:-1])
    article.postId = toMD5(url)
    article.rid = article.postId  # an article is its own thread root
    authorName = soup.findAll("table", {"role": "presentation"})[2].getText()
    article.articleDate = speculateArticlePostDate(soup.find("abbr").getText())
    if authorName:
        # Strip the "view edit history" label Facebook appends to edited posts.
        article.authorName = re.sub("查看編輯紀錄", "", authorName)
    else:
        article.authorName = "???"
    article.content = soup.find("title").getText()
    # Take the text up to the first sentence-ending punctuation as the title;
    # fall back to the first 20 characters.
    title = re.search("^.*[??!!。~~.]+", article.content)
    if title:
        article.title = title.group()
    else:
        article.title = article.content[0:20]
    replyBar = soup.find(
        "div", {"id": "add_comment_switcher_placeholder"}).next_sibling.a
    replyBarUrl = generateMFBUrl(replyBar.get("href"))
    setReplyAttr(article, replyBarUrl)
    print(article.toMap())
    article.setAttr("soup", soup)
    return article
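# toMD5, generateMFBUrl and speculateArticlePostDate are helpers that do not
# appear in this section. Plausible minimal sketches; the relative-date
# parsing in particular is an assumption about the formats Facebook emits:
import hashlib
from datetime import datetime, timedelta


def toMD5(text):
    # Derive a stable id from arbitrary text.
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def generateMFBUrl(href):
    # mbasic pages use relative hrefs; re-root them on the mobile host.
    return 'https://mbasic.facebook.com' + href if href.startswith('/') else href


def speculateArticlePostDate(abbr_text):
    # Facebook's <abbr> shows relative timestamps such as "3分鐘" (minutes
    # ago), "3小時" (hours ago), "昨天" (yesterday) or "5月12日" (month/day).
    now = datetime.now()
    m = re.search(r'(\d+)\s*分鐘', abbr_text)
    if m:
        return now - timedelta(minutes=int(m.group(1)))
    m = re.search(r'(\d+)\s*小時', abbr_text)
    if m:
        return now - timedelta(hours=int(m.group(1)))
    if '昨天' in abbr_text:
        return now - timedelta(days=1)
    m = re.search(r'(\d+)月(\d+)日', abbr_text)
    if m:
        return datetime(now.year, int(m.group(1)), int(m.group(2)))
    return None  # unrecognized format; callers treat None as "skip"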
def processCommentAndReply(user_id, replyUrl, comment):
    randomSleep()
    resp = sendRequest(replyUrl)
    soup = BeautifulSoup(resp.text, features='lxml')
    main_comment = soup.find("div", id=user_id)
    h3_authorName = main_comment.find("h3")
    comment.postId = toMD5('{}_{}'.format(comment.parent.url, user_id))
    comment.rid = comment.parent.postId
    comment.authorName = h3_authorName.getText()
    comment.content = h3_authorName.next_sibling.getText()
    if comment.content == "":
        comment.content = "圖片"  # image-only comment
    print("Comment time: ", main_comment.find("abbr").getText())
    comment.articleDate = speculateArticlePostDate(
        main_comment.find("abbr").getText())
    react_like_tag = main_comment.find("abbr").parent.find(
        "span", {'id': re.compile('^like_.*')})
    ufi_url = react_like_tag.find(
        "a", {"href": re.compile('^/ufi/reaction/profile/browser/.*')})
    if ufi_url:
        setReplyAttr(comment, generateMFBUrl(ufi_url.get("href")))
    print('Saving comment: ', comment.toMap())
    # The reply thread sits right after the comment's own <div>.
    reply_area = main_comment.next_sibling
    parseReplys(reply_area, comment, user_id)
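# sendRequest, randomSleep and setReplyAttr are likewise assumed helpers: a
# cookie-backed requests session, a jittered delay to avoid hammering the
# site, and a hook that records the reaction URL on the entity. Sketches
# only; setReplyAttr in particular is a guess from its call sites:
import random
import time
import requests
from http.cookiejar import LWPCookieJar

session = requests.Session()
session.cookies = LWPCookieJar('cookies.txt')  # assumed cookie store path


def randomSleep(low=2, high=6):
    # Sleep a random interval so request timing looks less mechanical.
    time.sleep(random.uniform(low, high))


def sendRequest(url):
    resp = session.get(url)
    session.cookies.save()  # mirrors the session handling in getCrawablePage
    resp.encoding = 'utf-8'
    return resp


def setReplyAttr(entity, reactUrl):
    # Stash the reaction/reply URL; extracting counts is left to later stages.
    entity.setAttr("reactUrl", reactUrl)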
def getCrawablePage(self, url) -> BeautifulSoup:
    print("frontPage url: ", url)
    randomSleep()
    resp = session.get(url)
    session.cookies.save()
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, features='lxml')
    pageBar = soup.find("a", text="顯示更多")  # "show more" paging link
    articles = soup.findAll("div", {"role": "article"})
    for a in articles:
        try:
            abbr = a.find("abbr").getText()
            postDate = speculateArticlePostDate(abbr)
            print("postDate = ", postDate)
            if postDate >= txDate:
                target = {
                    'url': generateMFBUrl(
                        a.find("a", text="完整動態").get("href"))  # "full story"
                }
                print(target)
            else:
                # Reached posts older than the cutoff: stop paging.
                pageBar = None
                break
        except Exception:
            continue
    return pageBar
# Revised version of getCrawablePage: pulls the paging bar via getNextPageBar,
# matches article nodes by id, skips unparseable dates, and collects the
# qualifying URLs into urlList.
def getCrawablePage(self, url) -> BeautifulSoup:
    print("frontPage url: ", url)
    randomSleep()
    resp = session.get(url)
    session.cookies.save()
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, features='lxml')
    pageBar = getNextPageBar(soup)
    articles = soup.findAll("article", id=re.compile("^u_0_[0-9]$"))
    for a in articles:
        try:
            abbr = a.find("abbr").getText()
            postDate = speculateArticlePostDate(abbr)
            if postDate is None:
                continue
            if postDate >= txDate:
                target = {
                    'url': generateMFBUrl(
                        a.find("a", text="完整動態").get("href"))
                }
                urlList.append(target)
                print("Got article postDate: ", postDate, " url: ", target)
            else:
                pageBar = None
                break
        except Exception:
            continue
    return pageBar
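# getNextPageBar replaces the inline lookup from the earlier version; a
# one-line sketch based on the "顯示更多" ("show more") anchor used above:
def getNextPageBar(soup):
    return soup.find("a", text="顯示更多")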
def parseComments(article):
    soup = article.getAttr("soup")
    # The comment list sits just before the "add comment" placeholder.
    comment_tag = soup.find(
        "div", {"id": "add_comment_link_placeholder"}).previous_sibling
    abbrs = comment_tag.findAll("abbr")
    for abbr in abbrs:
        user_comment = abbr.parent.parent.parent
        user_id = user_comment.get("id")
        replyUrl = generateMFBUrl(
            user_comment.find("a", text="回覆").get("href"))  # "reply" link
        comment = Entity()
        comment.parent = article
        processCommentAndReply(user_id, replyUrl, comment)
    # Page through "see next comments" links recursively.
    more = comment_tag.find("div", {"id": re.compile("^see_next_.*")})
    if more:
        more_url = generateMFBUrl(more.a.get("href"))
        randomSleep()
        resp = sendRequest(more_url)
        soup = BeautifulSoup(resp.text, features='lxml')
        article.setAttr("soup", soup)
        parseComments(article)
def getNextPage(self, pageBar) -> str:
    url = generateMFBUrl(pageBar.get("href"))
    return url
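# A sketch of how the pieces might be driven end to end: page through the
# feed collecting URLs newer than txDate, then parse each article and its
# comment tree. crawler, startUrl, txDate and urlList are assumed state not
# shown in this section:
urlList = []
pageBar = crawler.getCrawablePage(startUrl)
while pageBar is not None:
    nextUrl = crawler.getNextPage(pageBar)
    pageBar = crawler.getCrawablePage(nextUrl)

for target in urlList:
    article = parseArticle(target['url'])
    parseComments(article)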