Example #1
    def get_one_weibo(self, info):
        """Get all the information of one weibo"""
        try:
            weibo = Weibo()
            weibo.user_id = self.user_id
            is_original = self.is_original(info)
            weibo.original = is_original  # whether the weibo is an original post
            if (not self.filter) or is_original:
                weibo.weibo_id = info.xpath('@id')[0][2:]
                yield self.get_weibo_content(info, is_original, weibo)  # weibo text

                weibo.article_url = self.get_article_url(info)  # headline article url
                picture_urls = yield self.get_picture_urls(
                    info, is_original, self.filter)
                weibo.pics = picture_urls['original_pictures']  # original picture urls
                if not self.filter and not is_original:
                    # picture urls of the reposted weibo
                    weibo.retweet['pics'] = picture_urls['retweet_pictures']

                weibo.video_url = self.get_video_url(info,
                                                     is_original)  # video url
                weibo.location = self.get_publish_place(info)  # publish location
                weibo.created_at = self.get_publish_time(info)  # publish time
                weibo.source = self.get_publish_tool(info)  # publishing tool/client
                footer = self.get_weibo_footer(info)
                weibo.attitudes_count = footer['up_num']  # number of likes
                weibo.reposts_count = footer['retweet_num']  # number of reposts
                weibo.comments_count = footer['comment_num']  # number of comments
            else:
                weibo = None
                LOGGING.info(u'Filtering out a reposted weibo')
            return weibo
        except Exception as e:
            utils.report_log(e)  # record the original error before re-raising
            raise HTMLParseException
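
The Weibo object populated above is defined elsewhere in the repository; as a rough sketch, using only field names that appear in the assignments of this method (everything here is an assumption about the real class):

class Weibo:
    """Minimal sketch of the weibo container filled by get_one_weibo."""
    def __init__(self):
        self.user_id = ''
        self.weibo_id = ''
        self.original = True        # False when the weibo is a repost
        self.article_url = ''
        self.pics = ''              # original picture urls
        self.video_url = ''
        self.location = ''
        self.created_at = ''
        self.source = ''
        self.attitudes_count = 0    # likes
        self.reposts_count = 0
        self.comments_count = 0
        self.retweet = {}           # details of the reposted weibo, e.g. 'pics'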
Example #2
import logging
import traceback
from datetime import datetime

LOGGING = logging.getLogger(__name__)  # module-level logger used below


def report_log(exception: Exception):
    """
    Report an error to the log.
    :param exception: the exception that was raised
    """
    LOGGING.warning('{} raised an exception {}:\n{}\n==========\n{}'.format(
        datetime.now(), exception.__class__.__name__, exception.args,
        traceback.format_exc()))
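
A quick usage sketch: report_log is meant to be called from inside an except block, where traceback.format_exc() can still see the active exception:

try:
    1 / 0
except Exception as e:
    report_log(e)  # logs the class name (ZeroDivisionError), args and traceback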
Example #3
import sys


def handle_garbled(info):
    """Clean up garbled text extracted from a page node"""
    try:
        # join all text nodes, strip zero-width spaces, then round-trip
        # through the console encoding to drop unprintable characters
        text = ' '.join(info.xpath('.//text()')).replace(u'\u200b', '')
        return text.encode(sys.stdout.encoding,
                           'ignore').decode(sys.stdout.encoding)
    except Exception as e:
        LOGGING.exception(e)
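
For illustration, handle_garbled expects a parsed node exposing .xpath(), such as an lxml element (the \u200b zero-width space is typical of weibo markup); note that sys.stdout.encoding can be None when stdout is redirected, in which case the except branch logs the error and the function returns None:

from lxml import etree

node = etree.HTML(u'<div><span>hello\u200bworld</span></div>')
print(handle_garbled(node))  # -> 'helloworld' once the zero-width space is stripped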
Example #4
    def extract_user_info(self):
        """Extract the user's profile information"""
        user = USER_TEMPLATE.copy()
        nickname = self.selector.xpath('//title/text()')[0]
        nickname = nickname[:-3]  # drop the page-title suffix
        # check the cookie: a truncated login-page title means it is wrong or expired
        if nickname == u'登录 - 新' or nickname == u'新浪':
            LOGGING.warning(u'The cookie is invalid or has expired')
            raise CookieInvalidException()

        user['nickname'] = nickname
        # extract the avatar ("头像" is the avatar image's alt text on the page)
        try:
            user['head'] = self.selector.xpath(
                '//div[@class="c"]/img[@alt="头像"]')[0].get('src')
        except Exception:
            user['head'] = ''
        # extract the basic profile fields
        try:
            basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
            # Chinese field labels on the page, mapped to English keys
            zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
            en_list = [
                'gender', 'location', 'birthday', 'description',
                'verified_reason', 'talent'
            ]
            for i in basic_info:
                if i.split(':', 1)[0] in zh_list:
                    user[en_list[zh_list.index(i.split(':', 1)[0])]] = i.split(
                        ':', 1)[1].replace('\u3000', '')

            # u'学习经历' is the "education" section, u'工作经历' the "work" section
            if self.selector.xpath(
                    "//div[@class='tip'][2]/text()")[0] == u'学习经历':
                user['education'] = self.selector.xpath(
                    "//div[@class='c'][4]/text()")[0][1:].replace(
                        u'\xa0', u' ')
                if self.selector.xpath(
                        "//div[@class='tip'][3]/text()")[0] == u'工作经历':
                    user['work'] = self.selector.xpath(
                        "//div[@class='c'][5]/text()")[0][1:].replace(
                            u'\xa0', u' ')
            elif self.selector.xpath(
                    "//div[@class='tip'][2]/text()")[0] == u'工作经历':
                user['work'] = self.selector.xpath(
                    "//div[@class='c'][4]/text()")[0][1:].replace(
                        u'\xa0', u' ')
            return user
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
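
USER_TEMPLATE is defined elsewhere in the repository; a plausible shape, assuming only the keys this method actually writes:

USER_TEMPLATE = {
    'nickname': '',
    'head': '',
    'gender': '',
    'location': '',
    'birthday': '',
    'description': '',
    'verified_reason': '',
    'talent': '',
    'education': '',
    'work': '',
}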
Example #5
    def extract_picture_urls(info, weibo_id):
        """Extract the original picture urls of a weibo"""
        try:
            first_pic = '/mblog/pic/' + weibo_id
            all_pic = '/mblog/picAll/' + weibo_id
            picture_urls = list()
            a_list = info.xpath('div/a/@href')
            all_href = ''.join(a_list)
            if first_pic in all_href:  # the weibo has at least one thumbnail
                if all_pic in all_href:  # the weibo has multiple pictures
                    mblog_picall_curl_result = yield weibo_web_curl(
                        SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                    # only parse when the request succeeded; calling the parser
                    # on a failed request would operate on a None object
                    if not mblog_picall_curl_result['error_code']:
                        mblogPicAllParser = MblogPicAllParser(
                            mblog_picall_curl_result['response'])
                        preview_picture_list = \
                            mblogPicAllParser.extract_preview_picture_list()
                        picture_urls = [
                            p.replace('/thumb180/', '/large/')
                            for p in preview_picture_list
                        ]
                else:
                    if info.xpath('.//img/@src'):
                        # single picture: find the link that wraps the thumbnail
                        for link in info.xpath('div/a'):
                            href = link.xpath('@href')
                            src = link.xpath('img/@src')
                            if href and first_pic in href[0] and src:
                                picture_urls = [
                                    src[0].replace('/wap180/', '/large/')
                                ]
                                break
                    else:
                        LOGGING.warning(
                            u'The crawler account may have pictures set to '
                            u'"hidden"; please visit '
                            u'"https://weibo.cn/account/customize/pic" and '
                            u'switch the option to "show"')
                        sys.exit()
            return picture_urls
        except Exception as e:
            utils.report_log(e)
            return u'无'  # u'无' ("none") signals that extraction failed
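
Both branches above upgrade thumbnails by swapping the size segment of the image path ('/thumb180/' or '/wap180/') for '/large/'. A standalone sketch of that rewrite (the sample URL is illustrative):

def to_large(url):
    """Rewrite a weibo thumbnail url to its full-size variant."""
    for segment in ('/thumb180/', '/wap180/'):
        if segment in url:
            return url.replace(segment, '/large/')
    return url

print(to_large('https://wx1.sinaimg.cn/wap180/abc.jpg'))
# -> https://wx1.sinaimg.cn/large/abc.jpg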
Example #6
    def delete_one_proxy(self, seq_num):
        try:
            del self.accounts[seq_num]
        except IndexError:
            LOGGING.warning('delete failed: seq_num {} is out of range for the {} stored accounts.'
                            .format(seq_num, len(self.accounts)))
Example #7
    def update_one_proxy(self, seq_num, new_proxy):
        try:
            self.accounts[seq_num].proxy = new_proxy
        except IndexError:
            LOGGING.warning('update failed: seq_num {} is out of range for the {} stored accounts.'
                            .format(seq_num, len(self.accounts)))
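
Both methods assume an accounts list on the owning object. A minimal, hypothetical pool showing the IndexError handling in action (Account and AccountPool are illustrative names, not from the repository):

import logging

logging.basicConfig(level=logging.WARNING)
LOGGING = logging.getLogger(__name__)


class Account:
    """Hypothetical account record carrying a proxy address."""
    def __init__(self, proxy=None):
        self.proxy = proxy


class AccountPool:
    """Hypothetical owner of update_one_proxy/delete_one_proxy."""
    def __init__(self, accounts):
        self.accounts = list(accounts)

    def update_one_proxy(self, seq_num, new_proxy):
        try:
            self.accounts[seq_num].proxy = new_proxy
        except IndexError:
            LOGGING.warning('update failed: seq_num {} is out of range for the {} stored accounts.'
                            .format(seq_num, len(self.accounts)))


pool = AccountPool([Account('http://127.0.0.1:8080')])
pool.update_one_proxy(0, 'http://127.0.0.1:9090')  # succeeds silently
pool.update_one_proxy(5, 'http://127.0.0.1:9090')  # out of range: logs a warning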