def get_publish_time(info):
    """Get the publish time of a weibo."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        # The 'ct' span holds "<time> 来自<tool>"; keep only the time part
        publish_time = str_time.split(u'来自')[0]
        if u'刚刚' in publish_time:  # "just now"
            publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
        elif u'分钟' in publish_time:  # "N minutes ago"
            minute = publish_time[:publish_time.find(u'分钟')]
            minute = timedelta(minutes=int(minute))
            publish_time = (datetime.now() - minute).strftime('%Y-%m-%d %H:%M')
        elif u'今天' in publish_time:  # "today HH:MM"
            today = datetime.now().strftime('%Y-%m-%d')
            time = publish_time[3:]
            publish_time = today + ' ' + time
            if len(publish_time) > 16:
                publish_time = publish_time[:16]
        elif u'月' in publish_time:  # "MM月DD日 HH:MM" within the current year
            year = datetime.now().strftime('%Y')
            month = publish_time[0:2]
            day = publish_time[3:5]
            time = publish_time[7:12]
            publish_time = year + '-' + month + '-' + day + ' ' + time
        else:
            publish_time = publish_time[:16]
        return publish_time
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_picture_urls(info, is_original, pic_filter=False, weibo_id=None):
    """Get the original picture URLs of a weibo."""
    try:
        if weibo_id is None:
            weibo_id = info.xpath('@id')[0][2:]
        picture_urls = {}
        if is_original:
            original_pictures = yield PageParser.extract_picture_urls(
                info, weibo_id)
            picture_urls['original_pictures'] = original_pictures
            if not pic_filter:
                picture_urls['retweet_pictures'] = list()
        else:
            retweet_id = PageParser.get_retweet_id(info)
            retweet_pictures = yield PageParser.extract_picture_urls(
                info, retweet_id)
            picture_urls['retweet_pictures'] = retweet_pictures
            a_list = info.xpath('div[last()]/a/@href')
            original_picture = ''
            for a in a_list:
                if a.endswith(('.gif', '.jpeg', '.jpg', '.png')):
                    original_picture = a
                    break
            picture_urls['original_pictures'] = original_picture
        return picture_urls
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_one_page(self):
    """Get all weibos on the current page."""
    # Total number of weibo pages
    if not self.selector.xpath("//input[@name='mp']"):
        max_page = 1
    else:
        max_page = int(
            self.selector.xpath("//input[@name='mp']")[0].attrib['value'])
    weibo_id_list = list()  # ids of weibos collected so far
    weibos = list()  # parsed weibo objects
    try:
        all_weibo_info = self.selector.xpath("//div[@class='c']")
        is_exist = all_weibo_info[0].xpath("div/span[@class='ctt']")
        if is_exist:
            for i in range(0, len(all_weibo_info) - 2):
                try:
                    weibo = yield self.get_one_weibo(all_weibo_info[i])
                except HTMLParseException:
                    continue
                if weibo:
                    if weibo.weibo_id in weibo_id_list:
                        continue
                    weibos.append(weibo)
                    weibo_id_list.append(weibo.weibo_id)
        return weibos, max_page
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_original_weibo(self, info, weibo_id):
    """Get an original (non-retweet) weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # The text is truncated; build a CommentParser to fetch the full weibo
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_weibo()
            if wb_content:
                weibo_content = wb_content
        # Extract topics and at_users
        at_users, topics = PageParser.__get_atusers_and_topics(info)
        return weibo_content, topics, at_users
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def parse_page(self):
    """Parse the page."""
    try:
        user_list = self._get_all_user()
        return user_list
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get(self):
    args_dict = self.args2dict()
    user_id = args_dict.get('user_id')
    if user_id is None:  # the URL lacks a required query argument
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        # Crawl the user's index page
        idx_curl_result = yield weibo_web_curl(SpiderAim.users_show,
                                               user_id=user_id)
        if not idx_curl_result['error_code']:
            idxParser = IndexParser(
                user_id, idx_curl_result.get('response'))  # index-page parser
            try:
                user_id = idxParser.get_user_id()  # the real user_id
                max_page_num = idxParser.get_page_num()  # number of weibo pages
            except CookieInvalidException:
                self.write(WeiboCurlError.COOKIE_INVALID)
                return
            # Crawl the user's info page
            info_curl_result = yield weibo_web_curl(SpiderAim.users_info,
                                                    user_id=user_id)
            if not info_curl_result['error_code']:
                infoParser = InfoParser(
                    info_curl_result.get('response'))  # info-page parser
                user_info = infoParser.extract_user_info()
                user = idxParser.get_user(user_info)
                user['max_page'] = max_page_num  # max number of weibo pages
                success = settings.SUCCESS.copy()
                try:
                    success['data'] = {'result': user, 'cursor': ''}
                except AttributeError:
                    # user was not actually crawled
                    self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
                    return
                self.write(success)
                return
            else:
                error_res = curl_result_to_api_result(info_curl_result)
                self.write(error_res)
                return
        else:
            error_res = curl_result_to_api_result(idx_curl_result)
            self.write(error_res)
            return
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
        return
def get_user(self, user_info):
    """Get user info plus weibo, following and follower counts."""
    self.user = user_info
    try:
        user_info = self.selector.xpath("//div[@class='tip2']/*/text()")
        self.user['id'] = self.user_id
        self.user['weibo_num'] = int(user_info[0][3:-1])
        self.user['following'] = int(user_info[1][3:-1])
        self.user['followers'] = int(user_info[2][3:-1])
        return self.user
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_publish_tool(info):
    """Get the client/tool a weibo was published from."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        if len(str_time.split(u'来自')) > 1:
            publish_tool = str_time.split(u'来自')[1]
        else:
            publish_tool = ''
        return publish_tool
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_page_num(self):
    """Get the total number of weibo pages."""
    try:
        if not self.selector.xpath("//input[@name='mp']"):
            page_num = 1
        else:
            page_num = int(
                self.selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def extract_user_info(self):
    """Extract the user's profile information."""
    user = USER_TEMPLATE.copy()
    nickname = self.selector.xpath('//title/text()')[0]
    nickname = nickname[:-3]
    # Check whether the cookie is still valid
    if nickname == u'登录 - 新' or nickname == u'新浪':
        LOGGING.warning(u'cookie错误或已过期')
        raise CookieInvalidException()
    user['nickname'] = nickname
    # Avatar
    try:
        user['head'] = self.selector.xpath(
            '//div[@class="c"]/img[@alt="头像"]')[0].get('src')
    except Exception:
        user['head'] = ''
    # Basic profile fields
    try:
        basic_info = self.selector.xpath("//div[@class='c'][3]/text()")
        zh_list = [u'性别', u'地区', u'生日', u'简介', u'认证', u'达人']
        en_list = [
            'gender', 'location', 'birthday', 'description',
            'verified_reason', 'talent'
        ]
        for i in basic_info:
            if i.split(':', 1)[0] in zh_list:
                user[en_list[zh_list.index(i.split(':', 1)[0])]] = i.split(
                    ':', 1)[1].replace('\u3000', '')
        if self.selector.xpath(
                "//div[@class='tip'][2]/text()")[0] == u'学习经历':
            user['education'] = self.selector.xpath(
                "//div[@class='c'][4]/text()")[0][1:].replace(u'\xa0', u' ')
            if self.selector.xpath(
                    "//div[@class='tip'][3]/text()")[0] == u'工作经历':
                user['work'] = self.selector.xpath(
                    "//div[@class='c'][5]/text()")[0][1:].replace(
                        u'\xa0', u' ')
        elif self.selector.xpath(
                "//div[@class='tip'][2]/text()")[0] == u'工作经历':
            user['work'] = self.selector.xpath(
                "//div[@class='c'][4]/text()")[0][1:].replace(u'\xa0', u' ')
        return user
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_all_comment(self):
    """Get all comments."""
    comment_list = list()
    all_div = self.selector.xpath('/html/body/div[@class="c"]')
    for div in all_div:
        id_value = div.get('id')
        # Comment nodes have ids of the form "C_xxxxxxx"
        if id_value is not None and id_value.find('C_') != -1:
            try:
                comment = CommentParser._parse_one_comment(div)
            except Exception as e:
                utils.report_log(e)
                comment = None
            if comment is not None:
                comment_list.append(comment)
    return comment_list
def parse_page(self):
    """Parse the page."""
    # Check whether the result page is empty
    check_empty = self.selector.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    if len(check_empty) != 0:
        return None
    try:
        weibo_list = self._get_all_weibo()
        return weibo_list
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get(self):
    # Parse query arguments
    args_dict = self.args2dict()
    user_id, cursor = args_dict.get('user_id'), args_dict.get('cursor', '1')
    if user_id is None:
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    # Crawl the follow page
    follow_curl_result = yield weibo_web_curl(SpiderAim.follow,
                                              user_id=user_id,
                                              page_num=cursor)
    if not follow_curl_result['error_code']:
        self.response = follow_curl_result['response']
    else:
        error_res = curl_result_to_api_result(follow_curl_result)
        self.write(error_res)
        return
    # Build the parser
    followParser = FollowParser(self.response)
    # Extract the data and return the result
    try:
        follow_list = followParser.get_follows()  # list of followed users
        max_page_num = followParser.get_max_page_num()  # total number of pages
        if cursor < max_page_num:
            cursor = str(cursor + 1)
        success = settings.SUCCESS.copy()
        success['data'] = {
            'result': {
                'friend_list': follow_list,
                'max_page_num': max_page_num
            },
            'cursor': cursor
        }
        self.write(success)
        return
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
def extract_picture_urls(info, weibo_id):
    """Extract the original picture URLs of a weibo."""
    try:
        first_pic = '/mblog/pic/' + weibo_id
        all_pic = '/mblog/picAll/' + weibo_id
        picture_urls = list()
        a_list = info.xpath('div/a/@href')
        all_href = ''.join(a_list)
        if first_pic in all_href:  # the weibo has at least one thumbnail
            if all_pic in all_href:  # the weibo has multiple pictures
                mblog_picall_curl_result = yield weibo_web_curl(
                    SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                mblogPicAllParser = None
                if not mblog_picall_curl_result['error_code']:
                    mblogPicAllParser = MblogPicAllParser(
                        mblog_picall_curl_result['response'])
                preview_picture_list = \
                    mblogPicAllParser.extract_preview_picture_list()
                picture_urls = [
                    p.replace('/thumb180/', '/large/')
                    for p in preview_picture_list
                ]
            else:
                if info.xpath('.//img/@src'):
                    for link in info.xpath('div/a'):
                        if len(link.xpath('@href')) > 0:
                            if first_pic in link.xpath('@href')[0]:
                                if len(link.xpath('img/@src')) > 0:
                                    preview_picture = link.xpath(
                                        'img/@src')[0]
                                    picture_urls = [
                                        preview_picture.replace(
                                            '/wap180/', '/large/')
                                    ]
                                    break
                else:
                    LOGGING.warning(
                        u'爬虫微博可能被设置成了"不显示图片",请前往'
                        u'"https://weibo.cn/account/customize/pic",修改为"显示"')
                    sys.exit()
        return picture_urls
    except Exception as e:
        utils.report_log(e)
        return u'无'
def get_long_retweet(self, rev_type=str):
    """Get a long retweeted weibo."""
    try:
        wb_content = self.get_long_weibo()
        retweet_content = wb_content[:wb_content.find(u'原文转发')]  # retweeted original text
        retweet_reason = wb_content[wb_content.find(u'转发理由:') + 5:]  # repost reason
        if rev_type is dict:
            return {
                'retweet': retweet_content,
                'retweet_reason': retweet_reason,
                'retweet_id': PageParser.get_retweet_id(self.info_node)
            }
        return '转发原文:{}\n转发理由:{}'.format(retweet_content, retweet_reason)
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_long_weibo(self):
    """Get a long original weibo."""
    try:
        for i in range(5):
            if self.selector is not None:
                info = self.selector.xpath("//div[@id='M_']")[0]
                wb_content = utils.handle_garbled(info)
                wb_time = info.xpath("//span[@class='ct']/text()")[0]
                weibo_content = wb_content[wb_content.find(':') +
                                           1:wb_content.rfind(wb_time)]
                if weibo_content is not None:
                    return weibo_content
            sleep(random.randint(6, 10))
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_weibo_footer(info):
    """Get the like, repost and comment counts of a weibo."""
    try:
        footer = {}
        pattern = r'\d+'
        str_footer = info.xpath('div')[-1]
        str_footer = utils.handle_garbled(str_footer)
        str_footer = str_footer[str_footer.rfind(u'赞'):]
        weibo_footer = re.findall(pattern, str_footer, re.M)
        up_num = int(weibo_footer[0])
        footer['up_num'] = up_num
        retweet_num = int(weibo_footer[1])
        footer['retweet_num'] = retweet_num
        comment_num = int(weibo_footer[2])
        footer['comment_num'] = comment_num
        return footer
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get_publish_place(info):
    """Get the publish location of a weibo."""
    try:
        div_first = info.xpath('div')[0]
        a_list = div_first.xpath('a')
        publish_place = ''
        for a in a_list:
            if ('place.weibo.com' in a.xpath('@href')[0]
                    and a.xpath('text()')[0] == u'显示地图'):
                weibo_a = div_first.xpath("span[@class='ctt']/a")
                if len(weibo_a) >= 1:
                    publish_place = weibo_a[-1]
                    if (u'视频' == div_first.xpath(
                            "span[@class='ctt']/a/text()")[-1][-2:]):
                        if len(weibo_a) >= 2:
                            publish_place = weibo_a[-2]
                        else:
                            publish_place = ''
                    publish_place = utils.handle_garbled(publish_place)
                    break
        return publish_place
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
def get(self):
    # Parse query arguments
    args_dict = self.args2dict()
    weibo_id = args_dict.get('weibo_id')
    if weibo_id is None:
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    hot = args_dict.get('hot', False)  # whether to fetch hot comments
    cursor = args_dict.get('cursor', '1')
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    if cursor > SEARCH_LIMIT_PAGES:
        results = settings.SUCCESS.copy()
        results['data'] = {'result': [], 'cursor': '0'}
        self.write(results)
        return
    # Crawl the comment page
    comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                               weibo_id=weibo_id,
                                               page_num=cursor)
    if not comment_curl_result['error_code']:
        self.response = comment_curl_result['response']
    else:
        error_res = curl_result_to_api_result(comment_curl_result)
        self.write(error_res)
        return
    # Build the parser
    try:
        commonParser = CommentParser(weibo_id, response=self.response)
    except CookieInvalidException:
        self.write(WeiboCurlError.COOKIE_INVALID)
        return
    try:
        weibo_detail = yield commonParser.parse_one_weibo()
    except HTMLParseException as e:
        report_log(e)
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
        return
    # Decide how to fetch comment_list according to the hot flag
    if not hot:
        comment_list = commonParser.get_all_comment()
    else:
        hot_comment_curl_result = yield weibo_web_curl(
            SpiderAim.hot_comment, weibo_id=weibo_id, page_num=cursor)
        if not hot_comment_curl_result['error_code']:
            self.hot_comment_response = hot_comment_curl_result['response']
        else:
            error_res = curl_result_to_api_result(hot_comment_curl_result)
            self.write(error_res)
            return
        try:
            comment_list = HotCommentParser(
                weibo_id, self.hot_comment_response).get_all_comment()
        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return
        except Exception as e:
            report_log(
                (__class__.__name__, StatusesShowHandler.get.__name__), e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
            return
    # Return the result on success
    weibo_detail['weibo_id'] = weibo_id
    weibo_detail['comments'] = comment_list
    success = settings.SUCCESS.copy()
    success['data'] = {
        'result': weibo_detail,
        'cursor': str(cursor + 1) if cursor < weibo_detail['max_page'] else '0'
    }
    self.write(success)
    return
def get_retweet(self, info, weibo_id, weibo: Weibo):
    """Get a retweeted weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[weibo_content.find(':') +
                                      1:weibo_content.rfind(u'赞')]
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        # Check whether the full weibo text is already present
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # The text is truncated; build a CommentParser to fetch the full weibo
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_retweet(rev_type=dict)
            if wb_content:
                weibo_content = wb_content
        # Extract the repost reason
        if isinstance(weibo_content, dict):
            retweet_reason = weibo_content.get('retweet_reason')
            retweet_id = weibo_content.get('retweet_id')
            weibo_content = weibo_content.get('retweet')
        else:
            original_div = utils.handle_garbled(info.xpath('div')[-1])
            retweet_reason = original_div[original_div.find(':') +
                                          1:original_div.rindex(u'赞')]
            retweet_id = self.get_retweet_id(info)
        # Extract the original author
        original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
        original_user = ''.join(original_user_node.xpath("./text()"))
        original_user_id = original_user_node.get('href')
        if original_user_id is not None:
            original_user_id = original_user_id[original_user_id.rfind('/') +
                                                1:]
        # Get the original weibo's footer counts
        original_footer_div = info.xpath(r'./div')[-2]
        footer_nodes = original_footer_div.xpath(
            r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
        original_like_num = 0
        original_retweet_num = 0
        original_comment_num = 0
        for i, footer_node in enumerate(footer_nodes):
            num = ''.join(footer_node.xpath('./text()'))
            try:
                num = int(num[num.find('[') + 1:num.rfind(']')])
            except BaseException:
                pass
            if i == 0:
                original_like_num = num
            elif i == 1:
                original_retweet_num = num
            elif i == 2:
                original_comment_num = num
        # Extract topics and at_users for both the original and the retweet
        original_div = info.xpath('./div')[0]
        retweet_div = info.xpath('./div')[-1]
        retweet_at_users, retweet_topics = \
            PageParser.__get_atusers_and_topics(retweet_div)
        original_at_users, original_topics = \
            PageParser.__get_atusers_and_topics(original_div)
        weibo.retweet['weibo_id'] = retweet_id
        weibo.retweet['user_id'] = original_user_id
        weibo.retweet['screen_name'] = original_user
        weibo.retweet['text'] = weibo_content
        weibo.retweet['topics'] = original_topics
        weibo.retweet['at_users'] = original_at_users
        weibo.retweet['attitudes_count'] = original_like_num
        weibo.retweet['comments_count'] = original_comment_num
        weibo.retweet['reposts_count'] = original_retweet_num
        weibo.topics = retweet_topics
        weibo.at_users = retweet_at_users
        weibo.text = retweet_reason
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException