Exemple #1
0
    def get_items(self, stories):
        """Append one document entry for every story that matches the layout.

        Each story is first split by ``tencent_util.depart_quotation`` into
        the story itself and its quotation.  Author, content and publish
        time are then extracted from the story with a regular expression,
        echoed to stdout and written to ``self.document`` through the
        ``tencent_util`` helpers.  Stories the expression does not match are
        skipped without writing anything.

        Args:
            stories: Iterable of stories; each one is handed to
                ``tencent_util.depart_quotation`` (presumably raw HTML
                strings -- confirm against the caller).
        """
        # Capture groups:
        #   1 - ``title`` attribute that follows the "userName" div (author)
        #   2 - body of the "msgCnt" div (content)
        #   3 - text up to the closing ``</a>`` inside the "pubInfo" div (time)
        pattern = re.compile(
            r'<div class="msgBox".*?<div class="userName".*?title="(.*?)" gender='
            r'.*?<div class="msgCnt">(.*?)</div>'
            r'.*?<div class="pubInfo.*?from="\d*">(.*?)</a>', re.S)

        for story in stories:
            story, quotation = tencent_util.depart_quotation(story)

            # Only the first match is ever used, so search() is sufficient;
            # findall() would keep scanning the rest of the story for nothing.
            match = pattern.search(story)
            if match is None:
                continue

            author, content, time_text = match.groups()

            print('作者:', author)
            self.document.add_heading('', 0)
            tencent_util.add_author(self.document, author)

            print('内容:', content)
            # Drop every character below U+0020 (this includes tabs and
            # newlines) before the content goes into the document.
            content_valid = ''.join(c for c in content if ord(c) >= 32)
            tencent_util.add_content(self.document, content_valid)
            tencent_util.add_quotation(self.document, quotation)
            tencent_util.add_picture(self.document, story)
            tencent_util.add_video(self.document, story)

            print('时间:', time_text)
            tencent_util.add_time(self.document, time_text)
            tencent_util.add_location(self.document, story)
Exemple #2
0
    def get_items(self):
        """Append one document entry for every element in ``self.stories``.

        Each story is first passed, together with ``self.browser``, to
        ``tencent_util.depart_quotation``, which returns the story and its
        quotation.  Author, content and time are read from the story element
        (presumably a Selenium WebElement -- confirm against the caller),
        echoed to stdout and written to ``self.document`` through the
        ``tencent_util`` helpers.

        Note that the text printed to stdout is the element's ``text``,
        while the content stored in the document is the element's
        ``innerHTML`` with all characters below U+0020 removed.
        """
        for story in self.stories:
            story, quotation = tencent_util.depart_quotation(
                self.browser, story)

            author = story.find_element_by_class_name(
                'userName').find_element_by_tag_name('a').get_attribute(
                    'title')
            print('作者:', author)

            # Look the message node up once and reuse it for both reads
            # instead of querying the page for the same element twice.
            message = story.find_element_by_class_name('msgCnt')
            content_html = message.get_attribute('innerHTML')
            # Drop every character below U+0020 (this includes tabs and
            # newlines) before the markup goes into the document.
            content_valid = ''.join(c for c in content_html if ord(c) >= 32)
            print('内容:', message.text)

            # Named ``time_text`` so the local does not shadow the stdlib
            # ``time`` module name.
            time_text = story.find_element_by_class_name('time').text
            print('时间:', time_text)
            print(
                '----------------------------------------------------------------------------------'
            )

            self.document.add_heading('', 0)
            tencent_util.add_author(self.document, author)
            tencent_util.add_content(self.document, content_valid)
            tencent_util.add_quotation(self.document, quotation)
            tencent_util.add_picture(self.document, story)
            tencent_util.add_video(self.document, story)
            tencent_util.add_time(self.document, time_text)
            tencent_util.add_location(self.document, story)
 def get_items(self):
     """Append one document entry for every element in ``self.stories``.

     Author, content and time are read from each story element (presumably
     a Selenium WebElement -- confirm against the caller), echoed to stdout
     and written to ``self.document`` through the ``tencent_util`` helpers.

     Output uses the ``print()`` function, so the method requires Python 3
     (or ``from __future__ import print_function`` on Python 2).
     """
     for story in self.stories:
         author = story.find_element_by_class_name(
             'userName').find_element_by_tag_name('a').get_attribute(
                 'title')
         print('作者:', author)

         content = story.find_element_by_class_name('msgCnt').text
         print('内容:', content)

         # Named ``time_text`` so the local does not shadow the stdlib
         # ``time`` module name.
         time_text = story.find_element_by_class_name('time').text
         print('时间:', time_text)
         print('----------------------------------------------------------------------------------')

         self.document.add_heading('', 0)
         tencent_util.add_author(self.document, author)
         tencent_util.add_content(self.document, content)
         tencent_util.add_picture(self.document, story)
         tencent_util.add_video(self.document, story)
         tencent_util.add_time(self.document, time_text)
         tencent_util.add_location(self.document, story)