# Imports assumed by the parsers below; datetime/re/random/sleep are stdlib,
# while utils, settings, CommentParser, PageParser, Weibo, the exceptions and
# the curl helpers come from the surrounding project and are imported elsewhere.
import random
import re
from datetime import datetime, timedelta
from time import sleep


def get_publish_time(info):
    """Parse a weibo's publish time from its 'ct' span."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        # Everything before '来自' is the time; what follows is the client name
        publish_time = str_time.split(u'来自')[0]
        if u'刚刚' in publish_time:  # "just now"
            publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
        elif u'分钟' in publish_time:  # "N minutes ago"
            minute = publish_time[:publish_time.find(u'分钟')]
            minute = timedelta(minutes=int(minute))
            publish_time = (datetime.now() -
                            minute).strftime('%Y-%m-%d %H:%M')
        elif u'今天' in publish_time:  # "today HH:MM"
            today = datetime.now().strftime('%Y-%m-%d')
            time = publish_time[3:]
            publish_time = today + ' ' + time
            if len(publish_time) > 16:
                publish_time = publish_time[:16]
        elif u'月' in publish_time:  # "MM月DD日 HH:MM", current year implied
            year = datetime.now().strftime('%Y')
            month = publish_time[0:2]
            day = publish_time[3:5]
            time = publish_time[7:12]
            publish_time = year + '-' + month + '-' + day + ' ' + time
        else:  # absolute "YYYY-MM-DD HH:MM:SS"; keep minute precision
            publish_time = publish_time[:16]
        return publish_time
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
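# Usage sketch (hypothetical HTML fragment modeled on weibo.cn's mobile pages;
# assumes utils.handle_garbled(node) returns the node's whitespace-normalized
# text, as the project's helper does):
def _demo_get_publish_time():
    from lxml import etree
    html = (u"<div class='c'><div>"
            u"<span class='ct'>今天 08:30 来自iPhone客户端</span>"
            u"</div></div>")
    info = etree.HTML(html).xpath("//div[@class='c']")[0]
    # Expected: "<today's date> 08:30"
    return get_publish_time(info)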
def get_original_weibo(self, info, weibo_id):
    """Parse an original (non-retweeted) weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        # Drop the trailing footer, which starts at the last '赞' (like count)
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # The text is truncated ('全文' = "full text" link); fetch the
            # comment page and build a CommentParser to recover the long body
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_weibo()
            if wb_content:
                weibo_content = wb_content
        # Extract topics and @-mentioned users
        at_users, topics = PageParser.__get_atusers_and_topics(info)
        return weibo_content, topics, at_users
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
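# Call sketch (hypothetical; assumes this method is driven by the Tornado
# coroutine machinery that weibo_web_curl already uses, e.g. from another
# coroutine method of PageParser):
#
#     weibo_content, topics, at_users = \
#         yield self.get_original_weibo(info, weibo_id)
#
# The retry loop above gives up after settings.RETRY_TIME failed fetches and
# surfaces the failure as CurlError rather than returning a truncated body.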
def get_article_url(info):
    """Get the URL of a weibo headline article, if the post announces one."""
    article_url = ''
    text = utils.handle_garbled(info)
    if text.startswith(u'发布了头条文章'):  # "published a headline article"
        url = info.xpath('.//a/@href')
        if url and url[0].startswith('https://weibo.cn/sinaurl'):
            article_url = url[0]
    return article_url
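# Usage sketch (hypothetical markup; same utils.handle_garbled assumption as
# the demo above):
def _demo_get_article_url():
    from lxml import etree
    html = (u"<div class='c'>发布了头条文章 "
            u"<a href='https://weibo.cn/sinaurl?u=abc'>《标题》</a></div>")
    info = etree.HTML(html).xpath("//div[@class='c']")[0]
    # Expected: 'https://weibo.cn/sinaurl?u=abc'
    return get_article_url(info)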
def _parse_one_comment(node):
    comment = CommentParser.COMMENT_TEMPLATE.copy()
    span_nodes = node.xpath('./span')
    for span_node in span_nodes:
        klass = span_node.get('class')
        if klass == 'kt':  # "hot comment" marker
            comment['is_hot'] = True
        elif klass == 'ctt':  # comment body
            comment['content'] = ''.join(span_node.xpath('./text()'))
        elif klass == 'cc':  # like link, e.g. '赞[3]'
            text = ''.join(span_node.xpath('./a/text()'))
            pos = text.find(u'赞')
            if pos != -1:
                # Take the digits between '[' and ']'
                comment['like_num'] = text[pos + 2:-1]
        elif klass == 'ct':  # time + client span
            str_time = utils.handle_garbled(span_node)
            # Publishing tool (client), i.e. the part after '来自'
            if len(str_time.split(u'来自')) > 1:
                publish_tool = str_time.split(u'来自')[1]
            else:
                publish_tool = ''
            comment['publish_tool'] = publish_tool.strip()
            # Publish time; mirrors the normalization in get_publish_time
            publish_time = str_time.split(u'来自')[0]
            if u'刚刚' in publish_time:
                publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
            elif u'分钟' in publish_time:
                minute = publish_time[:publish_time.find(u'分钟')]
                minute = timedelta(minutes=int(minute))
                publish_time = (datetime.now() -
                                minute).strftime('%Y-%m-%d %H:%M')
            elif u'今天' in publish_time:
                today = datetime.now().strftime('%Y-%m-%d')
                time = publish_time[3:]
                publish_time = today + ' ' + time
                if len(publish_time) > 16:
                    publish_time = publish_time[:16]
            elif u'月' in publish_time:
                year = datetime.now().strftime('%Y')
                month = publish_time[0:2]
                day = publish_time[3:5]
                time = publish_time[7:12]
                publish_time = year + '-' + month + '-' + day + ' ' + time
            else:
                publish_time = publish_time[:16]
            comment['publish_time'] = publish_time
    # Commenter: the first <a> child holds the screen name and profile link
    user_node = node.xpath('./a')[0]
    comment['screen_name'] = user_node.xpath('./text()')[0]
    user_href = user_node.get('href')
    comment['user_id'] = user_href[user_href.rfind(r'/') + 1:]
    return comment
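# Shape sketch of the template the parser fills in (hypothetical field listing
# inferred from the assignments above; the real COMMENT_TEMPLATE lives on
# CommentParser):
#
#     COMMENT_TEMPLATE = {
#         'is_hot': False, 'content': '', 'like_num': '',
#         'publish_tool': '', 'publish_time': '',
#         'screen_name': '', 'user_id': '',
#     }
#
# Because .copy() is shallow, the template should hold only immutable values;
# a mutable default (e.g. a list) would be shared across all parsed comments.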
def get_publish_tool(info):
    """Get the client a weibo was published from."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        # The client name, if any, follows '来自' in the 'ct' span
        if len(str_time.split(u'来自')) > 1:
            publish_tool = str_time.split(u'来自')[1]
        else:
            publish_tool = ''
        return publish_tool
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
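# Example (hypothetical 'ct' text): '今天 08:30 来自iPhone客户端' yields
# 'iPhone客户端', while a bare timestamp with no '来自' marker yields ''.
# Note the value is returned unstripped here, unlike the comment parser's
# publish_tool handling above.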
def get_long_weibo(self):
    """Get the full text of a long original weibo from its detail page."""
    try:
        for i in range(5):
            if self.selector is not None:
                info = self.selector.xpath("//div[@id='M_']")[0]
                wb_content = utils.handle_garbled(info)
                wb_time = info.xpath("//span[@class='ct']/text()")[0]
                # The body sits between the first ':' (after the screen
                # name) and the trailing timestamp
                weibo_content = wb_content[wb_content.find(':') +
                                           1:wb_content.rfind(wb_time)]
                if weibo_content:  # retry when extraction comes up empty
                    return weibo_content
            sleep(random.randint(6, 10))
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
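# Note: the 6-10 s sleep between attempts is a crude politeness delay before
# re-reading the detail page; five empty extractions in a row fall through
# and return None, which callers can treat as "no long text available".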
def get_weibo_footer(info):
    """Get a weibo's like, repost, and comment counts."""
    try:
        footer = {}
        pattern = r'\d+'
        str_footer = info.xpath('div')[-1]
        str_footer = utils.handle_garbled(str_footer)
        # The footer starts at the last '赞', e.g. '赞[13] 转发[5] 评论[8]'
        str_footer = str_footer[str_footer.rfind(u'赞'):]
        weibo_footer = re.findall(pattern, str_footer, re.M)

        up_num = int(weibo_footer[0])
        footer['up_num'] = up_num

        retweet_num = int(weibo_footer[1])
        footer['retweet_num'] = retweet_num

        comment_num = int(weibo_footer[2])
        footer['comment_num'] = comment_num
        return footer
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
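# Quick check of the footer regex on a hypothetical footer string; the three
# bracketed numbers come out in like/repost/comment order:
def _demo_footer_counts():
    s = u'赞[13] 转发[5] 评论[8] 收藏'
    return re.findall(r'\d+', s)  # ['13', '5', '8']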
def get_publish_place(info):
    """Get a weibo's publish location, if a map link is attached."""
    try:
        div_first = info.xpath('div')[0]
        a_list = div_first.xpath('a')
        publish_place = ''
        for a in a_list:
            if ('place.weibo.com' in a.xpath('@href')[0]
                    and a.xpath('text()')[0] == u'显示地图'):  # "show map"
                weibo_a = div_first.xpath("span[@class='ctt']/a")
                if len(weibo_a) >= 1:
                    # The place link is normally the last <a> in the body...
                    publish_place = weibo_a[-1]
                    if (u'视频' == div_first.xpath(
                            "span[@class='ctt']/a/text()")[-1][-2:]):
                        # ...unless a video link trails it; then take the
                        # one before
                        if len(weibo_a) >= 2:
                            publish_place = weibo_a[-2]
                        else:
                            publish_place = ''
                    publish_place = utils.handle_garbled(publish_place)
                    break
        return publish_place
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
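# Markup sketch this targets (hypothetical, modeled on weibo.cn):
#
#     <span class="ctt">text <a href="...">#topic#</a>
#         <a href="https://place.weibo.com/...">北京·朝阳</a></span>
#     ... <a href="https://place.weibo.com/...">显示地图</a>
#
# i.e. a '显示地图' ("show map") anchor signals a location post, and the place
# name is the last (or second-to-last, if a video link trails) anchor inside
# the body span.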
def get_retweet(self, info, weibo_id, weibo: Weibo):
    """Parse a retweeted weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[weibo_content.find(':') +
                                      1:weibo_content.rfind(u'赞')]
        # Trim again: the first rfind removed the retweet's own footer,
        # this one removes the original post's footer
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        # Check whether the full weibo text is already present
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # Build a CommentParser to recover the long body
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_retweet(rev_type=dict)
            if wb_content:
                weibo_content = wb_content

        # Extract the repost reason
        if isinstance(weibo_content, dict):
            retweet_reason = weibo_content.get('retweet_reason')
            retweet_id = weibo_content.get('retweet_id')
            weibo_content = weibo_content.get('retweet')
        else:
            original_div = utils.handle_garbled(info.xpath('div')[-1])
            retweet_reason = original_div[original_div.find(':') +
                                          1:original_div.rindex(u'赞')]
            retweet_id = self.get_retweet_id(info)

        # Extract the original author
        original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
        original_user = ''.join(original_user_node.xpath("./text()"))
        original_user_id = original_user_node.get('href')
        if original_user_id is not None:
            original_user_id = original_user_id[
                original_user_id.rfind(r'/') + 1:]

        # Parse the original weibo's footer counts
        original_footer_div = info.xpath(r'./div')[-2]
        footer_nodes = original_footer_div.xpath(
            r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
        original_like_num = 0
        original_retweet_num = 0
        original_comment_num = 0
        for i, footer_node in enumerate(footer_nodes):
            num = ''.join(footer_node.xpath('./text()'))
            try:
                # Counts are embedded like '赞[13]'
                num = int(num[num.find('[') + 1:num.rfind(']')])
            except BaseException:
                pass
            if i == 0:
                original_like_num = num
            elif i == 1:
                original_retweet_num = num
            elif i == 2:
                original_comment_num = num

        # Extract topics and @-users for both layers
        original_div = info.xpath('./div')[0]
        retweet_div = info.xpath('./div')[-1]
        retweet_at_users, retweet_topics = PageParser.__get_atusers_and_topics(
            retweet_div)
        original_at_users, original_topics = PageParser.__get_atusers_and_topics(
            original_div)

        # Fill in the Weibo object: the retweeted (original) post...
        weibo.retweet['weibo_id'] = retweet_id
        weibo.retweet['user_id'] = original_user_id
        weibo.retweet['screen_name'] = original_user
        weibo.retweet['text'] = weibo_content
        weibo.retweet['topics'] = original_topics
        weibo.retweet['at_users'] = original_at_users
        weibo.retweet['attitudes_count'] = original_like_num
        weibo.retweet['comments_count'] = original_comment_num
        weibo.retweet['reposts_count'] = original_retweet_num
        # ...and the reposter's own layer
        weibo.topics = retweet_topics
        weibo.at_users = retweet_at_users
        weibo.text = retweet_reason
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
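# Shape sketch (hypothetical, inferred from the .get() calls above): when the
# long text has to be fetched, get_long_retweet(rev_type=dict) returns
#
#     {'retweet': <original post text>,
#      'retweet_reason': <reposter's comment>,
#      'retweet_id': <original weibo id>}
#
# so the dict branch and the plain-string fallback converge on the same three
# values before the Weibo object is filled in.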