def get_one_weibo(self, info):
    """Parse one weibo entry node into a populated Weibo object.

    Coroutine (tornado-style generator): yields sub-coroutines for the
    content and picture fetches.

    :param info: lxml node of a single weibo entry on the list page
    :return: a filled ``Weibo`` object, or ``None`` when ``self.filter``
        is set and the entry is a repost (filtered out)
    :raises HTMLParseException: if any extraction step fails; the
        original exception is attached as ``__cause__``
    """
    try:
        weibo = Weibo()
        weibo.user_id = self.user_id
        is_original = self.is_original(info)
        weibo.original = is_original  # whether this entry is an original post (not a repost)
        if (not self.filter) or is_original:
            weibo.weibo_id = info.xpath('@id')[0][2:]  # node id carries an 'M_' prefix; strip it
            # NOTE(review): result is discarded — presumably fills weibo's
            # text fields in place; confirm against get_weibo_content.
            yield self.get_weibo_content(info, is_original, weibo)
            weibo.article_url = self.get_article_url(info)  # headline-article url, if any
            picture_urls = yield self.get_picture_urls(
                info, is_original, self.filter)
            weibo.pics = picture_urls['original_pictures']  # original picture urls
            if not self.filter and not is_original:
                # repost: also keep the reposted weibo's picture urls
                weibo.retweet['pics'] = picture_urls['retweet_pictures']
            weibo.video_url = self.get_video_url(info, is_original)  # video url
            weibo.location = self.get_publish_place(info)  # publish location
            weibo.created_at = self.get_publish_time(info)  # publish time
            weibo.source = self.get_publish_tool(info)  # publish tool/client
            footer = self.get_weibo_footer(info)
            weibo.attitudes_count = footer['up_num']  # like count
            weibo.reposts_count = footer['retweet_num']  # repost count
            weibo.comments_count = footer['comment_num']  # comment count
        else:
            weibo = None
            LOGGING.info(u'正在过滤转发微博')
        return weibo
    except Exception as e:
        # Chain the cause so the original traceback is not lost
        # (the previous code re-raised without `from e`).
        raise HTMLParseException from e
def report_log(exception: Exception):
    """Log a warning report for *exception*, including its traceback.

    :param exception: the exception instance that was caught
    """
    exc_name = exception.__class__.__name__
    stack_trace = traceback.format_exc()
    report = '{} occur a exception {}:\n{}\n==========\n{}'.format(
        datetime.now(), exc_name, exception.args, stack_trace)
    LOGGING.warning(report)
def handle_garbled(info):
    """Clean garbled/undisplayable characters from a node's text.

    Joins all text under *info*, drops zero-width spaces, then round-trips
    the string through the terminal encoding with ``errors='ignore'`` so
    characters the console cannot represent are silently removed.

    :param info: an lxml node supporting ``.xpath('.//text()')``
    :return: the cleaned text, or ``None`` if extraction fails
    """
    try:
        # sys.stdout.encoding is None when stdout is piped/redirected,
        # which would make encode() raise TypeError — fall back to UTF-8.
        encoding = sys.stdout.encoding or 'utf-8'
        _info = (' '.join(info.xpath('.//text()')).replace(
            u'\u200b', '').encode(encoding, 'ignore').decode(encoding))
        return _info
    except Exception as e:
        LOGGING.exception(e)
def extract_user_info(self):
    """Extract the user's profile fields from the parsed profile page.

    :return: a copy of ``USER_TEMPLATE`` filled with nickname, avatar,
        basic info (gender, location, birthday, description, verified
        reason, talent), and education/work history when present
    :raises CookieInvalidException: when the page title is a login page,
        i.e. the cookie is wrong or has expired
    :raises HTMLParseException: when parsing the basic-info section fails
    """
    user = USER_TEMPLATE.copy()
    nickname = self.selector.xpath('//title/text()')[0]
    nickname = nickname[:-3]  # title ends with a 3-char suffix; strip it
    # Cookie check: a login-page title means the cookie is invalid/expired.
    if nickname == u'登录 - 新' or nickname == u'新浪':
        LOGGING.warning(u'cookie错误或已过期')
        raise CookieInvalidException()
    user['nickname'] = nickname
    # Avatar url; a missing avatar is not fatal.
    try:
        user['head'] = self.selector.xpath(
            '//div[@class="c"]/img[@alt="头像"]')[0].get('src')
    except Exception:  # was a bare except:; never swallow SystemExit/KeyboardInterrupt
        user['head'] = ''
    # Basic profile information.
    try:
        basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
        zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
        en_list = [
            'gender', 'location', 'birthday', 'description',
            'verified_reason', 'talent'
        ]
        # Each line looks like '<label>:<value>'; map known Chinese labels
        # to their English keys in the user dict.
        for i in basic_info:
            if i.split(':', 1)[0] in zh_list:
                user[en_list[zh_list.index(i.split(':', 1)[0])]] = i.split(
                    ':', 1)[1].replace('\u3000', '')
        # Education/work sections: their position shifts depending on
        # whether the education block exists, hence the index juggling.
        if self.selector.xpath(
                "//div[@class='tip'][2]/text()")[0] == u'学习经历':
            user['education'] = self.selector.xpath(
                "//div[@class='c'][4]/text()")[0][1:].replace(
                    u'\xa0', u' ')
            if self.selector.xpath(
                    "//div[@class='tip'][3]/text()")[0] == u'工作经历':
                user['work'] = self.selector.xpath(
                    "//div[@class='c'][5]/text()")[0][1:].replace(
                        u'\xa0', u' ')
        elif self.selector.xpath(
                "//div[@class='tip'][2]/text()")[0] == u'工作经历':
            user['work'] = self.selector.xpath(
                "//div[@class='c'][4]/text()")[0][1:].replace(
                    u'\xa0', u' ')
        return user
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def extract_picture_urls(info, weibo_id):
    """Extract the original (full-size) picture urls of one weibo.

    Coroutine: yields ``weibo_web_curl`` to fetch the multi-picture page
    when the weibo has more than one image.

    :param info: lxml node of the weibo entry
    :param weibo_id: id string of the weibo
    :return: list of full-size picture urls (possibly empty); on any
        parsing error returns the string ``u'无'`` ("none") instead —
        NOTE(review): inconsistent return type; callers must handle both.
    """
    try:
        first_pic = '/mblog/pic/' + weibo_id
        all_pic = '/mblog/picAll/' + weibo_id
        picture_urls = list()
        a_list = info.xpath('div/a/@href')
        all_href = ''.join(a_list)
        if first_pic in all_href:  # the entry has a single-picture thumbnail link
            if all_pic in all_href:  # the weibo has multiple pictures
                mblog_picall_curl_result = yield weibo_web_curl(
                    SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                mblogPicAllParser = None
                if not mblog_picall_curl_result['error_code']:
                    mblogPicAllParser = MblogPicAllParser(
                        mblog_picall_curl_result['response'])
                # NOTE(review): if error_code was set, parser is still None
                # and the next call raises, falling into the except below.
                preview_picture_list = mblogPicAllParser.extract_preview_picture_list(
                )
                # Swap thumbnail path segment for the full-size one.
                picture_urls = [
                    p.replace('/thumb180/', '/large/')
                    for p in preview_picture_list
                ]
            else:
                # Single picture: find the anchor wrapping the thumbnail img.
                if info.xpath('.//img/@src'):
                    for link in info.xpath('div/a'):
                        if len(link.xpath('@href')) > 0:
                            if first_pic in link.xpath('@href')[0]:
                                if len(link.xpath('img/@src')) > 0:
                                    preview_picture = link.xpath(
                                        'img/@src')[0]
                                    picture_urls = [
                                        preview_picture.replace(
                                            '/wap180/', '/large/')
                                    ]
                                    break
                else:
                    # Account is configured to hide pictures; cannot continue.
                    LOGGING.warning(
                        u'爬虫微博可能被设置成了"不显示图片",请前往'
                        u'"https://weibo.cn/account/customize/pic",修改为"显示"'
                    )
                    sys.exit()
        return picture_urls
    except Exception as e:
        utils.report_log(e)
        return u'无'
def delete_one_proxy(self, seq_num):
    """Remove the account at position *seq_num* from the account pool.

    Logs a warning instead of raising when *seq_num* is out of range.
    """
    try:
        self.accounts.pop(seq_num)
    except IndexError:
        LOGGING.warning("delete fail because seq_num {} over the max account number {}."
                        .format(seq_num, len(self.accounts)))
def update_one_proxy(self, seq_num, new_proxy):
    """Replace the proxy of the account at position *seq_num*.

    Logs a warning instead of raising when *seq_num* is out of range.
    """
    try:
        account = self.accounts[seq_num]
    except IndexError:
        LOGGING.warning("update fail because seq_num {} over the max account number {}."
                        .format(seq_num, len(self.accounts)))
    else:
        account.proxy = new_proxy