Example #1
 def get_publish_time(info):
     """获取微博发布时间"""
     try:
         str_time = info.xpath("div/span[@class='ct']")
         str_time = utils.handle_garbled(str_time[0])
         publish_time = str_time.split(u'来自')[0]
         if u'刚刚' in publish_time:
             publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
         elif u'分钟' in publish_time:
             minute = publish_time[:publish_time.find(u'分钟')]
             minute = timedelta(minutes=int(minute))
             publish_time = (datetime.now() -
                             minute).strftime('%Y-%m-%d %H:%M')
         elif u'今天' in publish_time:
             today = datetime.now().strftime('%Y-%m-%d')
             time = publish_time[3:]
             publish_time = today + ' ' + time
             if len(publish_time) > 16:
                 publish_time = publish_time[:16]
         elif u'月' in publish_time:
             year = datetime.now().strftime('%Y')
             month = publish_time[0:2]
             day = publish_time[3:5]
             time = publish_time[7:12]
             publish_time = year + '-' + month + '-' + day + ' ' + time
         else:
             publish_time = publish_time[:16]
         return publish_time
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
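The conversion above relies on the handful of relative-time formats weibo.cn renders. The following is a minimal standalone sketch (not the project's code) that applies the same slicing rules to made-up sample strings:

from datetime import datetime, timedelta

def normalize_publish_time(raw):
    """Map a weibo.cn time string to 'YYYY-MM-DD HH:MM'."""
    now = datetime.now()
    if u'刚刚' in raw:                          # "just now"
        return now.strftime('%Y-%m-%d %H:%M')
    if u'分钟' in raw:                          # "N minutes ago"
        minutes = int(raw[:raw.find(u'分钟')])
        return (now - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if u'今天' in raw:                          # "today HH:MM"
        return (now.strftime('%Y-%m-%d') + ' ' + raw[3:])[:16]
    if u'月' in raw:                            # "MM月DD日 HH:MM"
        return '{}-{}-{} {}'.format(now.strftime('%Y'), raw[0:2], raw[3:5], raw[7:12])
    return raw[:16]                             # already an absolute timestamp

print(normalize_publish_time(u'5分钟前 '))        # e.g. 2024-05-01 10:25
print(normalize_publish_time(u'01月15日 12:30'))  # e.g. 2024-01-15 12:30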
Example #2
    def get_picture_urls(info, is_original, pic_filter=False, weibo_id=None):
        """获取微博原始图片url"""
        try:
            if weibo_id is None:
                weibo_id = info.xpath('@id')[0][2:]
            picture_urls = {}
            if is_original:
                original_pictures = yield PageParser.extract_picture_urls(
                    info, weibo_id)
                picture_urls['original_pictures'] = original_pictures
                if not pic_filter:
                    picture_urls['retweet_pictures'] = list()
            else:
                retweet_id = PageParser.get_retweet_id(info)

                retweet_pictures = yield PageParser.extract_picture_urls(
                    info, retweet_id)
                picture_urls['retweet_pictures'] = retweet_pictures
                a_list = info.xpath('div[last()]/a/@href')
                original_picture = ''
                for a in a_list:
                    if a.endswith(('.gif', '.jpeg', '.jpg', '.png')):
                        original_picture = a
                        break
                picture_urls['original_pictures'] = original_picture
            return picture_urls
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
Example #3
 def get_one_page(self):
     """获取第page页的全部微博"""
     # 获取微博总页数
     if not self.selector.xpath("//input[@name='mp']"):
         max_page = 1
     else:
         max_page = int(
             self.selector.xpath("//input[@name='mp']")[0].attrib['value'])
     weibo_id_list = list()  # stores the ids of the weibos
     weibos = list()  # stores the info of all weibos
     try:
         all_weibo_info = self.selector.xpath("//div[@class='c']")
         is_exist = all_weibo_info[0].xpath("div/span[@class='ctt']")
         if is_exist:
             for i in range(0, len(all_weibo_info) - 2):
                 try:
                     weibo = yield self.get_one_weibo(all_weibo_info[i])
                 except HTMLParseException:
                     continue
                 if weibo:
                     if weibo.weibo_id in weibo_id_list:
                         continue
                     weibos.append(weibo)
                     weibo_id_list.append(weibo.weibo_id)
         return weibos, max_page
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
Example #4
    def get_original_weibo(self, info, weibo_id):
        """获取原创微博"""
        try:
            weibo_content = utils.handle_garbled(info)
            weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
            a_text = info.xpath('div//a/text()')
            if u'全文' in a_text:
                # build a CommentParser
                comment_resp = None
                for i in range(settings.RETRY_TIME):
                    comment_curl_result = yield weibo_web_curl(
                        SpiderAim.weibo_comment, weibo_id=weibo_id)
                    if not comment_curl_result['error_code']:
                        comment_resp = comment_curl_result['response']
                        break
                    if i == settings.RETRY_TIME - 1:
                        raise CurlError

                commentParser = CommentParser(weibo_id, comment_resp)
                wb_content = commentParser.get_long_weibo()
                if wb_content:
                    weibo_content = wb_content
            # get topics and at_users
            at_users, topics = PageParser.__get_atusers_and_topics(info)

            return weibo_content, topics, at_users
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
Example #5
 def parse_page(self):
     """解析网页"""
     try:
         user_list = self._get_all_user()
         return user_list
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
Example #6
    def get(self):
        args_dict = self.args2dict()
        user_id = args_dict.get('user_id')
        if user_id is None:  # the URL is missing the query parameter
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)
            return

        try:
            # result of crawling the index page
            idx_curl_result = yield weibo_web_curl(SpiderAim.users_show,
                                                   user_id=user_id)
            if not idx_curl_result['error_code']:
                idxParser = IndexParser(
                    user_id, idx_curl_result.get('response'))  # build an index-page parser

                try:
                    user_id = idxParser.get_user_id()  # get the real user_id
                    max_page_num = idxParser.get_page_num()  # get the number of weibo pages
                except CookieInvalidException:
                    self.write(WeiboCurlError.COOKIE_INVALID)
                    return

                # result of crawling the info page
                info_curl_result = yield weibo_web_curl(SpiderAim.users_info,
                                                        user_id=user_id)
                if not info_curl_result['error_code']:
                    infoParser = InfoParser(
                        info_curl_result.get('response'))  # info-page parser
                    user_info = infoParser.extract_user_info()
                    user = idxParser.get_user(user_info)
                    user['max_page'] = max_page_num  # maximum number of weibo pages
                    # print(user)

                    success = settings.SUCCESS.copy()
                    try:
                        success['data'] = {'result': user, 'cursor': ''}
                    except AttributeError:  # user has no __dict__ attribute, i.e. no user was crawled
                        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)  # report an argument error
                        return
                    self.write(success)
                    return
                else:
                    error_res = curl_result_to_api_result(info_curl_result)
                    self.write(error_res)
                    return
            else:
                error_res = curl_result_to_api_result(idx_curl_result)
                self.write(error_res)
                return

        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return
        except Exception as e:
            report_log(e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
            return
Example #7
 def get_user(self, user_info):
     """获取用户信息、微博数、关注数、粉丝数"""
     self.user = user_info
     try:
         user_info = self.selector.xpath("//div[@class='tip2']/*/text()")
         self.user['id'] = self.user_id
         self.user['weibo_num'] = int(user_info[0][3:-1])
         self.user['following'] = int(user_info[1][3:-1])
         self.user['followers'] = int(user_info[2][3:-1])
         return self.user
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
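The [3:-1] slices above assume the tip2 strip renders as strings like 微博[330], 关注[16], 粉丝[1024]. Below is a small self-contained sketch of an equivalent, slightly more defensive bracket-based extraction, using invented sample strings:

tip2_texts = [u'微博[330]', u'关注[16]', u'粉丝[1024]']

def count_between_brackets(text):
    """Pull the integer between the square brackets."""
    return int(text[text.find('[') + 1:text.rfind(']')])

weibo_num, following, followers = [count_between_brackets(t) for t in tip2_texts]
print(weibo_num, following, followers)  # 330 16 1024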
Example #8
 def get_publish_tool(info):
     """获取微博发布工具"""
     try:
         str_time = info.xpath("div/span[@class='ct']")
         str_time = utils.handle_garbled(str_time[0])
         if len(str_time.split(u'来自')) > 1:
             publish_tool = str_time.split(u'来自')[1]
         else:
             publish_tool = ''
         return publish_tool
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
Example #9
 def get_page_num(self):
     """获取微博总页数"""
     try:
         if not self.selector.xpath("//input[@name='mp']"):
             page_num = 1
         else:
             page_num = int(
                 self.selector.xpath("//input[@name='mp']")
                 [0].attrib['value'])
         return page_num
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
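The page count comes from the hidden mp input that weibo.cn puts on paginated lists. A minimal sketch of the same lookup (requires lxml), run against a hand-written HTML fragment rather than a real page:

from lxml import etree

html = '<html><body><input type="hidden" name="mp" value="42"/></body></html>'
selector = etree.HTML(html)
mp_inputs = selector.xpath("//input[@name='mp']")
page_num = int(mp_inputs[0].attrib['value']) if mp_inputs else 1
print(page_num)  # 42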
Example #10
    def extract_user_info(self):
        """提取用户信息"""
        user = USER_TEMPLATE.copy()
        nickname = self.selector.xpath('//title/text()')[0]
        nickname = nickname[:-3]
        # check the cookie
        if nickname == u'登录 - 新' or nickname == u'新浪':
            LOGGING.warning(u'cookie错误或已过期')
            raise CookieInvalidException()

        user['nickname'] = nickname
        # get the avatar
        try:
            user['head'] = self.selector.xpath(
                '//div[@class="c"]/img[@alt="头像"]')[0].get('src')
        except:
            user['head'] = ''
        # get basic information
        try:
            basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
            zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
            en_list = [
                'gender', 'location', 'birthday', 'description',
                'verified_reason', 'talent'
            ]
            for i in basic_info:
                if i.split(':', 1)[0] in zh_list:
                    user[en_list[zh_list.index(i.split(':', 1)[0])]] = i.split(
                        ':', 1)[1].replace('\u3000', '')

            if self.selector.xpath(
                    "//div[@class='tip'][2]/text()")[0] == u'学习经历':
                user['education'] = self.selector.xpath(
                    "//div[@class='c'][4]/text()")[0][1:].replace(
                        u'\xa0', u' ')
                if self.selector.xpath(
                        "//div[@class='tip'][3]/text()")[0] == u'工作经历':
                    user['work'] = self.selector.xpath(
                        "//div[@class='c'][5]/text()")[0][1:].replace(
                            u'\xa0', u' ')
            elif self.selector.xpath(
                    "//div[@class='tip'][2]/text()")[0] == u'工作经历':
                user['work'] = self.selector.xpath(
                    "//div[@class='c'][4]/text()")[0][1:].replace(
                        u'\xa0', u' ')
            return user
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
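The basic-info block is parsed by mapping the Chinese field labels in zh_list to the English keys in en_list. A standalone sketch with invented sample lines (the labels and the ASCII colon follow the parser above):

zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
en_list = ['gender', 'location', 'birthday', 'description',
           'verified_reason', 'talent']

user = {}
for line in [u'性别:男', u'地区:北京 海淀区', u'简介:\u3000一句话介绍']:
    key, _, value = line.partition(':')
    if key in zh_list:
        user[en_list[zh_list.index(key)]] = value.replace(u'\u3000', '')
print(user)  # {'gender': '男', 'location': '北京 海淀区', 'description': '一句话介绍'}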
Example #11
 def get_all_comment(self):
     """获取评论"""
     comment_list = list()
     all_div = self.selector.xpath('/html/body/div[@class="c"]')
     for div in all_div:
         id_value = div.get('id')
         if id_value is not None and id_value.find('C_') != -1:
             try:
                 comment = CommentParser._parse_one_comment(div)
             except Exception as e:
                 utils.report_log(e)
                 comment = None
             if comment is not None:
                 comment_list.append(comment)
     return comment_list
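Comments are recognized purely by their div id starting with C_. A toy sketch of that filter (requires lxml; the HTML fragment is invented, not real weibo.cn markup):

from lxml import etree

html = ('<html><body>'
        '<div class="c" id="C_123">a comment</div>'
        '<div class="c">not a comment</div>'
        '<div class="c" id="M_456">the weibo itself</div>'
        '</body></html>')
selector = etree.HTML(html)
comment_divs = [div for div in selector.xpath('/html/body/div[@class="c"]')
                if div.get('id') is not None and div.get('id').find('C_') != -1]
print([d.get('id') for d in comment_divs])  # ['C_123']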
Example #12
 def parse_page(self):
     """
     解析网页
     """
     # 检查页面是否为空
     check_empty = self.selector.xpath(
         '//div[@class="card card-no-result s-pt20b40"]')
     if len(check_empty) != 0:
         return None
     try:
         weibo_list = self._get_all_weibo()
         return weibo_list
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
Example #13
 def get(self):
     # get the query parameters
     args_dict = self.args2dict()
     user_id, cursor = args_dict.get('user_id'), args_dict.get(
         'cursor', '1')
     if user_id is None:
         self.write(WeiboCurlError.REQUEST_LACK_ARGS)
         return
     try:
         cursor = 1 if not cursor else int(cursor)
     except ValueError:
         self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
         return
     # do the crawling
     follow_curl_result = yield weibo_web_curl(SpiderAim.follow,
                                               user_id=user_id,
                                               page_num=cursor)
     if not follow_curl_result['error_code']:
         self.response = follow_curl_result['response']
     else:
         error_res = curl_result_to_api_result(follow_curl_result)
         self.write(error_res)
         return
     # build the parser
     followParser = FollowParser(self.response)
     # extract the relevant info and return the result
     try:
         follow_list = followParser.get_follows()  # list of followed users
         max_page_num = followParser.get_max_page_num()  # total number of pages
         if cursor < max_page_num:
             cursor = str(cursor + 1)
         success = settings.SUCCESS.copy()
         success['data'] = {
             'result': {
                 'friend_list': follow_list,
                 'max_page_num': max_page_num
             },
             'cursor': cursor
         }
         # print(success)
         self.write(success)
         return
     except HTMLParseException:
         self.write(WeiboCurlError.HTML_PARSE_ERROR)
         return
     except Exception as e:
         report_log(e)
         self.write(WeiboCurlError.UNKNOWN_ERROR)
Example #14
    def extract_picture_urls(info, weibo_id):
        """提取微博原始图片url"""
        try:
            first_pic = '/mblog/pic/' + weibo_id
            all_pic = '/mblog/picAll/' + weibo_id
            picture_urls = list()
            a_list = info.xpath('div/a/@href')
            all_href = ''.join(a_list)
            if first_pic in all_href:  # check whether there is a single thumbnail
                if all_pic in all_href:  # check whether the weibo has multiple pictures
                    mblog_picall_curl_result = yield weibo_web_curl(
                        SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                    # only parse the picAll page when it was fetched successfully;
                    # otherwise picture_urls stays an empty list
                    if not mblog_picall_curl_result['error_code']:
                        mblogPicAllParser = MblogPicAllParser(
                            mblog_picall_curl_result['response'])
                        preview_picture_list = mblogPicAllParser.extract_preview_picture_list()
                        picture_urls = [
                            p.replace('/thumb180/', '/large/')
                            for p in preview_picture_list
                        ]
                else:
                    if info.xpath('.//img/@src'):
                        for link in info.xpath('div/a'):
                            if len(link.xpath('@href')) > 0:
                                if first_pic in link.xpath('@href')[0]:
                                    if len(link.xpath('img/@src')) > 0:
                                        preview_picture = link.xpath(
                                            'img/@src')[0]
                                        picture_urls = [
                                            preview_picture.replace(
                                                '/wap180/', '/large/')
                                        ]
                                        break
                    else:
                        LOGGING.warning(
                            u'爬虫微博可能被设置成了"不显示图片",请前往'
                            u'"https://weibo.cn/account/customize/pic",修改为"显示"'
                        )
                        sys.exit()
            return picture_urls
        except Exception as e:
            utils.report_log(e)
            return u'无'
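The thumbnail URLs are turned into full-size ones by swapping the /thumb180/ (or, for single previews, /wap180/) path segment for /large/. A trivial standalone illustration with fabricated URLs:

preview = 'https://wx1.sinaimg.cn/thumb180/example.jpg'   # multi-picture page
single = 'https://wx1.sinaimg.cn/wap180/example.jpg'      # single-picture preview
print(preview.replace('/thumb180/', '/large/'))   # https://wx1.sinaimg.cn/large/example.jpg
print(single.replace('/wap180/', '/large/'))      # https://wx1.sinaimg.cn/large/example.jpg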
Example #15
    def get_long_retweet(self, rev_type=str):
        """获取长转发微博"""
        try:
            wb_content = self.get_long_weibo()
            retweet_content = wb_content[:wb_content.find(u'原文转发')]  # original text of the retweeted weibo
            retweet_reason = wb_content[wb_content.find(u'转发理由:') + 5:]  # reason for retweeting

            if rev_type is dict:
                return {
                    'retweet': retweet_content,
                    'retweet_reason': retweet_reason,
                    'retweet_id': PageParser.get_retweet_id(self.info_node)
                }
            return '转发原文:{}\n转发理由:{}'.format(retweet_content, retweet_reason)

        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
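The two find()-based slices rely on the long-weibo text containing an 原文转发 marker followed by 转发理由: (5 characters, hence the +5 offset). A standalone sketch with a made-up wb_content string:

wb_content = u'转发的微博正文在这里 原文转发[10] 转发理由:说得好 赞[2] 转发[1] 评论[0]'

retweet_content = wb_content[:wb_content.find(u'原文转发')]
retweet_reason = wb_content[wb_content.find(u'转发理由:') + 5:]  # 5 == len(u'转发理由:')
print(retweet_content)  # the retweeted text (everything before 原文转发)
print(retweet_reason)   # 说得好 赞[2] 转发[1] 评论[0]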
Example #16
    def get_long_weibo(self):
        """获取长原创微博"""

        try:
            for i in range(5):

                if self.selector is not None:
                    info = self.selector.xpath("//div[@id='M_']")[0]
                    wb_content = utils.handle_garbled(info)
                    wb_time = info.xpath("//span[@class='ct']/text()")[0]
                    weibo_content = wb_content[wb_content.find(':') +
                                               1:wb_content.rfind(wb_time)]
                    if weibo_content is not None:
                        return weibo_content
                sleep(random.randint(6, 10))
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
Example #17
    def get_weibo_footer(info):
        """获取微博点赞数、转发数、评论数"""
        try:
            footer = {}
            pattern = r'\d+'
            str_footer = info.xpath('div')[-1]
            str_footer = utils.handle_garbled(str_footer)
            str_footer = str_footer[str_footer.rfind(u'赞'):]
            weibo_footer = re.findall(pattern, str_footer, re.M)

            up_num = int(weibo_footer[0])
            footer['up_num'] = up_num

            retweet_num = int(weibo_footer[1])
            footer['retweet_num'] = retweet_num

            comment_num = int(weibo_footer[2])
            footer['comment_num'] = comment_num
            return footer
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
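The footer of a weibo ends with a 赞[..] 转发[..] 评论[..] tail, so pulling three integers in order is enough. A standalone sketch with a fabricated footer string:

import re

str_footer = u'赞[12] 转发[3] 评论[45]'
nums = re.findall(r'\d+', str_footer)
footer = {'up_num': int(nums[0]),
          'retweet_num': int(nums[1]),
          'comment_num': int(nums[2])}
print(footer)  # {'up_num': 12, 'retweet_num': 3, 'comment_num': 45}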
Example #18
 def get_publish_place(info):
     """获取微博发布位置"""
     try:
         div_first = info.xpath('div')[0]
         a_list = div_first.xpath('a')
         publish_place = ''
         for a in a_list:
             if ('place.weibo.com' in a.xpath('@href')[0]
                     and a.xpath('text()')[0] == u'显示地图'):
                 weibo_a = div_first.xpath("span[@class='ctt']/a")
                 if len(weibo_a) >= 1:
                     publish_place = weibo_a[-1]
                     if (u'视频' == div_first.xpath(
                             "span[@class='ctt']/a/text()")[-1][-2:]):
                         if len(weibo_a) >= 2:
                             publish_place = weibo_a[-2]
                         else:
                             publish_place = ''
                     publish_place = utils.handle_garbled(publish_place)
                     break
         return publish_place
     except Exception as e:
         utils.report_log(e)
         raise HTMLParseException
Example #19
    def get(self):
        # get the parameters
        args_dict = self.args2dict()
        weibo_id = args_dict.get('weibo_id')
        if weibo_id is None:
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)
            return
        hot = args_dict.get('hot', False)  # whether to fetch hot comments
        cursor = args_dict.get('cursor', '1')
        try:
            cursor = 1 if not cursor else int(cursor)
        except ValueError:
            self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
            return
        if cursor > SEARCH_LIMIT_PAGES:
            results = settings.SUCCESS.copy()
            results['data'] = {'result': [], 'cursor': '0'}
            self.write(results)
            return
        # do the crawling
        comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                                   weibo_id=weibo_id,
                                                   page_num=cursor)
        if not comment_curl_result['error_code']:
            self.response = comment_curl_result['response']
        else:
            error_res = curl_result_to_api_result(comment_curl_result)
            self.write(error_res)
            return
        # build the parser
        try:
            commonParser = CommentParser(weibo_id, response=self.response)
        except CookieInvalidException:
            self.write(WeiboCurlError.COOKIE_INVALID)
            return

        try:
            weibo_detail = yield commonParser.parse_one_weibo()
        except HTMLParseException as e:
            report_log(e)
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return
        except Exception as e:
            report_log(e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
            return

        # decide how to fetch comment_list based on the hot parameter
        if not hot:
            comment_list = commonParser.get_all_comment()
        else:
            hot_comment_curl_result = yield weibo_web_curl(
                SpiderAim.hot_comment, weibo_id=weibo_id, page_num=cursor)
            if not hot_comment_curl_result['error_code']:
                self.hot_comment_response = hot_comment_curl_result['response']
            else:
                error_res = curl_result_to_api_result(hot_comment_curl_result)
                self.write(error_res)
                return

            try:
                comment_list = HotCommentParser(
                    weibo_id, self.hot_comment_response).get_all_comment()
            except HTMLParseException:
                self.write(WeiboCurlError.HTML_PARSE_ERROR)
                return
            except Exception as e:
                report_log(
                    (__class__.__name__, StatusesShowHandler.get.__name__), e)
                self.write(WeiboCurlError.UNKNOWN_ERROR)
                return
        # return the result on success
        weibo_detail['weibo_id'] = weibo_id
        weibo_detail['comments'] = comment_list
        success = settings.SUCCESS.copy()
        success['data'] = {
            'result': weibo_detail,
            'cursor':
            str(cursor + 1) if cursor < weibo_detail['max_page'] else '0'
        }
        # print(success)
        self.write(success)
        return
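The cursor convention used by these handlers is illustrated below with a hypothetical helper (not part of the project): the cursor advances as a string while pages remain and becomes '0' once the last page is reached.

def next_cursor(cursor, max_page):
    """Return the next page cursor as a string, or '0' when exhausted."""
    return str(cursor + 1) if cursor < max_page else '0'

print(next_cursor(1, 3))  # '2'
print(next_cursor(3, 3))  # '0'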
Example #20
    def get_retweet(self, info, weibo_id, weibo: Weibo):
        """获取转发微博"""
        try:
            weibo_content = utils.handle_garbled(info)
            weibo_content = weibo_content[weibo_content.find(':') +
                                          1:weibo_content.rfind(u'赞')]
            weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
            # check whether this is already the full weibo content
            a_text = info.xpath('div//a/text()')
            if u'全文' in a_text:
                # build a CommentParser
                comment_resp = None
                for i in range(settings.RETRY_TIME):
                    comment_curl_result = yield weibo_web_curl(
                        SpiderAim.weibo_comment, weibo_id=weibo_id)
                    if not comment_curl_result['error_code']:
                        comment_resp = comment_curl_result['response']
                        break
                    if i == settings.RETRY_TIME - 1:
                        raise CurlError

                commentParser = CommentParser(weibo_id, comment_resp)
                wb_content = commentParser.get_long_retweet(rev_type=dict)
                if wb_content:
                    weibo_content = wb_content

            # extract the retweet reason
            if isinstance(weibo_content, dict):
                retweet_reason = weibo_content.get('retweet_reason')
                retweet_id = weibo_content.get('retweet_id')
                weibo_content = weibo_content.get('retweet')
            else:
                original_div = utils.handle_garbled(info.xpath('div')[-1])
                retweet_reason = original_div[original_div.find(':') +
                                              1:original_div.rindex(u'赞')]
                retweet_id = self.get_retweet_id(info)

            # extract the original user
            original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
            original_user = ''.join(original_user_node.xpath("./text()"))
            original_user_id = original_user_node.get('href')
            if original_user_id is not None:
                original_user_id = original_user_id[original_user_id.
                                                    rfind(r'/') + 1:]
            # get the footers of the original weibo
            original_footer_div = info.xpath(r'./div')[-2]

            footer_nodes = original_footer_div.xpath(
                r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
            original_like_num = 0
            original_retweet_num = 0
            original_comment_num = 0
            for i, footer_node in enumerate(footer_nodes):
                num = ''.join(footer_node.xpath('./text()'))
                try:
                    num = int(num[num.find('[') + 1:num.rfind(']')])
                except BaseException:
                    pass
                if i == 0:
                    original_like_num = num
                elif i == 1:
                    original_retweet_num = num
                elif i == 2:
                    original_comment_num = num

            # get the topics
            original_div = info.xpath('./div')[0]
            retweet_div = info.xpath('./div')[-1]
            retweet_at_users, retweet_topics = PageParser.__get_atusers_and_topics(
                retweet_div)
            original_at_users, original_topics = PageParser.__get_atusers_and_topics(
                original_div)

            weibo.retweet['weibo_id'] = retweet_id
            weibo.retweet['user_id'] = original_user_id
            weibo.retweet['screen_name'] = original_user
            weibo.retweet['text'] = weibo_content
            weibo.retweet['topics'] = original_topics
            weibo.retweet['at_users'] = original_at_users
            weibo.retweet['attitudes_count'] = original_like_num
            weibo.retweet['comments_count'] = original_comment_num
            weibo.retweet['reposts_count'] = original_retweet_num
            weibo.topics = retweet_topics
            weibo.at_users = retweet_at_users
            weibo.text = retweet_reason

        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException