def get_items(self, stories):
    """Parse each story's HTML and append its fields to ``self.document``.

    For every story the quotation is split off with
    ``tencent_util.depart_quotation``; the remaining markup is then searched
    for the author, the message body and the publication time.  Stories whose
    markup does not match the expected layout are skipped silently.

    Args:
        stories: Iterable of story markup.  Each item is passed to
            ``tencent_util.depart_quotation`` and the first value it returns
            is searched with a regular expression, so it is presumably an
            HTML string -- TODO confirm against the caller.

    Returns:
        None.  Results are written to ``self.document`` and echoed to stdout.
    """
    # Capture groups, in order: author (title attribute), message body
    # (inner HTML of the msgCnt div), publication time (link text).
    # re.S lets ``.`` cross line breaks inside the markup.
    pattern = re.compile(
        r'<div class="msgBox".*?<div class="userName".*?title="(.*?)" gender='
        r'.*?<div class="msgCnt">(.*?)</div>'
        r'.*?<div class="pubInfo.*?from="\d*">(.*?)</a>', re.S)

    for story in stories:
        story, quotation = tencent_util.depart_quotation(story)

        # Only the first match is ever used, so stop at it instead of
        # collecting every match with findall().
        match = pattern.search(story)
        if match is None:
            continue
        author, content, pub_time = match.groups()

        print('作者:', author)
        self.document.add_heading('', 0)
        tencent_util.add_author(self.document, author)

        print('内容:', content)
        # Drop every character below U+0020 (this includes tab and newline).
        # NOTE(review): presumably because the document writer rejects
        # control characters -- confirm.
        content_valid = ''.join(c for c in content if ord(c) >= 32)
        tencent_util.add_content(self.document, content_valid)
        tencent_util.add_quotation(self.document, quotation)
        tencent_util.add_picture(self.document, story)
        tencent_util.add_video(self.document, story)

        print('时间:', pub_time)
        tencent_util.add_time(self.document, pub_time)
        tencent_util.add_location(self.document, story)
def get_items(self):
    """Read every element in ``self.stories`` and append it to ``self.document``.

    For each story the quotation is split off with
    ``tencent_util.depart_quotation``; the author, message body and time are
    then read from the remaining element, echoed to stdout and written to the
    document together with the quotation, pictures, videos and location.

    Returns:
        None.  Results are written to ``self.document`` and echoed to stdout.
    """
    # NOTE(review): the ``find_element_by_*`` helpers look like the Selenium
    # WebElement API, which newer Selenium releases no longer provide --
    # confirm the pinned version before upgrading.
    for story in self.stories:
        story, quotation = tencent_util.depart_quotation(
            self.browser, story)

        author = story.find_element_by_class_name(
            'userName').find_element_by_tag_name('a').get_attribute('title')
        print('作者:', author)

        # Look the message container up once and reuse it for both the raw
        # HTML (stored in the document) and the visible text (printed).
        message = story.find_element_by_class_name('msgCnt')
        content_html = message.get_attribute('innerHTML')
        # Drop every character below U+0020 (this includes tab and newline).
        content_valid = ''.join(c for c in content_html if ord(c) >= 32)
        print('内容:', message.text)

        # Named ``pub_time`` so the local does not shadow the ``time`` module.
        pub_time = story.find_element_by_class_name('time').text
        print('时间:', pub_time)
        print(
            '----------------------------------------------------------------------------------'
        )

        self.document.add_heading('', 0)
        tencent_util.add_author(self.document, author)
        tencent_util.add_content(self.document, content_valid)
        tencent_util.add_quotation(self.document, quotation)
        tencent_util.add_picture(self.document, story)
        tencent_util.add_video(self.document, story)
        tencent_util.add_time(self.document, pub_time)
        tencent_util.add_location(self.document, story)