# Imports assumed by the parsers below; datetime/re/random/sleep are stdlib,
# while utils, settings, CommentParser, PageParser, Weibo, the exceptions and
# the curl helpers come from the surrounding project and are imported elsewhere.
import random
import re
from datetime import datetime, timedelta
from time import sleep


def get_publish_time(info):
    """Parse a weibo's publish time from its 'ct' span."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        # Everything before '来自' is the time; what follows is the client name
        publish_time = str_time.split(u'来自')[0]
        if u'刚刚' in publish_time:  # "just now"
            publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
        elif u'分钟' in publish_time:  # "N minutes ago"
            minute = publish_time[:publish_time.find(u'分钟')]
            minute = timedelta(minutes=int(minute))
            publish_time = (datetime.now() -
                            minute).strftime('%Y-%m-%d %H:%M')
        elif u'今天' in publish_time:  # "today HH:MM"
            today = datetime.now().strftime('%Y-%m-%d')
            time = publish_time[3:]
            publish_time = today + ' ' + time
            if len(publish_time) > 16:
                publish_time = publish_time[:16]
        elif u'月' in publish_time:  # "MM月DD日 HH:MM", current year implied
            year = datetime.now().strftime('%Y')
            month = publish_time[0:2]
            day = publish_time[3:5]
            time = publish_time[7:12]
            publish_time = year + '-' + month + '-' + day + ' ' + time
        else:  # absolute "YYYY-MM-DD HH:MM:SS"; keep minute precision
            publish_time = publish_time[:16]
        return publish_time
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
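# Usage sketch (hypothetical HTML fragment modeled on weibo.cn's mobile pages;
# assumes utils.handle_garbled(node) returns the node's whitespace-normalized
# text, as the project's helper does):
def _demo_get_publish_time():
    from lxml import etree
    html = (u"<div class='c'><div>"
            u"<span class='ct'>今天 08:30 来自iPhone客户端</span>"
            u"</div></div>")
    info = etree.HTML(html).xpath("//div[@class='c']")[0]
    # Expected: "<today's date> 08:30"
    return get_publish_time(info)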
def get_original_weibo(self, info, weibo_id):
    """Parse an original (non-retweeted) weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        # Drop the trailing footer, which starts at the last '赞' (like count)
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # The text is truncated ('全文' = "full text" link); fetch the
            # comment page and build a CommentParser to recover the long body
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_weibo()
            if wb_content:
                weibo_content = wb_content
        # Extract topics and @-mentioned users
        at_users, topics = PageParser.__get_atusers_and_topics(info)
        return weibo_content, topics, at_users
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
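# Call sketch (hypothetical; assumes this method is driven by the Tornado
# coroutine machinery that weibo_web_curl already uses, e.g. from another
# coroutine method of PageParser):
#
#     weibo_content, topics, at_users = \
#         yield self.get_original_weibo(info, weibo_id)
#
# The retry loop above gives up after settings.RETRY_TIME failed fetches and
# surfaces the failure as CurlError rather than returning a truncated body.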
def get_article_url(info):
    """Get the URL of a weibo headline article, if the post announces one."""
    article_url = ''
    text = utils.handle_garbled(info)
    if text.startswith(u'发布了头条文章'):  # "published a headline article"
        url = info.xpath('.//a/@href')
        if url and url[0].startswith('https://weibo.cn/sinaurl'):
            article_url = url[0]
    return article_url
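# Usage sketch (hypothetical markup; same utils.handle_garbled assumption as
# the demo above):
def _demo_get_article_url():
    from lxml import etree
    html = (u"<div class='c'>发布了头条文章 "
            u"<a href='https://weibo.cn/sinaurl?u=abc'>《标题》</a></div>")
    info = etree.HTML(html).xpath("//div[@class='c']")[0]
    # Expected: 'https://weibo.cn/sinaurl?u=abc'
    return get_article_url(info)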
def _parse_one_comment(node):
    comment = CommentParser.COMMENT_TEMPLATE.copy()
    span_nodes = node.xpath('./span')
    for span_node in span_nodes:
        klass = span_node.get('class')
        if klass == 'kt':  # "hot comment" marker
            comment['is_hot'] = True
        elif klass == 'ctt':  # comment body
            comment['content'] = ''.join(span_node.xpath('./text()'))
        elif klass == 'cc':  # like link, e.g. '赞[3]'
            text = ''.join(span_node.xpath('./a/text()'))
            pos = text.find(u'赞')
            if pos != -1:
                # Take the digits between '[' and ']'
                comment['like_num'] = text[pos + 2:-1]
        elif klass == 'ct':  # time + client span
            str_time = utils.handle_garbled(span_node)
            # Publishing tool (client), i.e. the part after '来自'
            if len(str_time.split(u'来自')) > 1:
                publish_tool = str_time.split(u'来自')[1]
            else:
                publish_tool = ''
            comment['publish_tool'] = publish_tool.strip()
            # Publish time; mirrors the normalization in get_publish_time
            publish_time = str_time.split(u'来自')[0]
            if u'刚刚' in publish_time:
                publish_time = datetime.now().strftime('%Y-%m-%d %H:%M')
            elif u'分钟' in publish_time:
                minute = publish_time[:publish_time.find(u'分钟')]
                minute = timedelta(minutes=int(minute))
                publish_time = (datetime.now() -
                                minute).strftime('%Y-%m-%d %H:%M')
            elif u'今天' in publish_time:
                today = datetime.now().strftime('%Y-%m-%d')
                time = publish_time[3:]
                publish_time = today + ' ' + time
                if len(publish_time) > 16:
                    publish_time = publish_time[:16]
            elif u'月' in publish_time:
                year = datetime.now().strftime('%Y')
                month = publish_time[0:2]
                day = publish_time[3:5]
                time = publish_time[7:12]
                publish_time = year + '-' + month + '-' + day + ' ' + time
            else:
                publish_time = publish_time[:16]
            comment['publish_time'] = publish_time
    # Commenter: the first <a> child holds the screen name and profile link
    user_node = node.xpath('./a')[0]
    comment['screen_name'] = user_node.xpath('./text()')[0]
    user_href = user_node.get('href')
    comment['user_id'] = user_href[user_href.rfind(r'/') + 1:]
    return comment
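# Shape sketch of the template the parser fills in (hypothetical field listing
# inferred from the assignments above; the real COMMENT_TEMPLATE lives on
# CommentParser):
#
#     COMMENT_TEMPLATE = {
#         'is_hot': False, 'content': '', 'like_num': '',
#         'publish_tool': '', 'publish_time': '',
#         'screen_name': '', 'user_id': '',
#     }
#
# Because .copy() is shallow, the template should hold only immutable values;
# a mutable default (e.g. a list) would be shared across all parsed comments.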
def get_publish_tool(info):
    """Get the client a weibo was published from."""
    try:
        str_time = info.xpath("div/span[@class='ct']")
        str_time = utils.handle_garbled(str_time[0])
        # The client name, if any, follows '来自' in the 'ct' span
        if len(str_time.split(u'来自')) > 1:
            publish_tool = str_time.split(u'来自')[1]
        else:
            publish_tool = ''
        return publish_tool
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
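# Example (hypothetical 'ct' text): '今天 08:30 来自iPhone客户端' yields
# 'iPhone客户端', while a bare timestamp with no '来自' marker yields ''.
# Note the value is returned unstripped here, unlike the comment parser's
# publish_tool handling above.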
def get_long_weibo(self):
    """Get the full text of a long original weibo from its detail page."""
    try:
        for i in range(5):
            if self.selector is not None:
                info = self.selector.xpath("//div[@id='M_']")[0]
                wb_content = utils.handle_garbled(info)
                wb_time = info.xpath("//span[@class='ct']/text()")[0]
                # The body sits between the first ':' (after the screen
                # name) and the trailing timestamp
                weibo_content = wb_content[wb_content.find(':') +
                                           1:wb_content.rfind(wb_time)]
                if weibo_content:  # retry when extraction comes up empty
                    return weibo_content
            sleep(random.randint(6, 10))
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
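# Note: the 6-10 s sleep between attempts is a crude politeness delay before
# re-reading the detail page; five empty extractions in a row fall through
# and return None, which callers can treat as "no long text available".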
def get_weibo_footer(info):
    """Get a weibo's like, repost, and comment counts."""
    try:
        footer = {}
        pattern = r'\d+'
        str_footer = info.xpath('div')[-1]
        str_footer = utils.handle_garbled(str_footer)
        # The footer starts at the last '赞', e.g. '赞[13] 转发[5] 评论[8]'
        str_footer = str_footer[str_footer.rfind(u'赞'):]
        weibo_footer = re.findall(pattern, str_footer, re.M)

        up_num = int(weibo_footer[0])
        footer['up_num'] = up_num

        retweet_num = int(weibo_footer[1])
        footer['retweet_num'] = retweet_num

        comment_num = int(weibo_footer[2])
        footer['comment_num'] = comment_num
        return footer
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
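# Quick check of the footer regex on a hypothetical footer string; the three
# bracketed numbers come out in like/repost/comment order:
def _demo_footer_counts():
    s = u'赞[13] 转发[5] 评论[8] 收藏'
    return re.findall(r'\d+', s)  # ['13', '5', '8']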
def get_publish_place(info):
    """Get a weibo's publish location, if a map link is attached."""
    try:
        div_first = info.xpath('div')[0]
        a_list = div_first.xpath('a')
        publish_place = ''
        for a in a_list:
            if ('place.weibo.com' in a.xpath('@href')[0]
                    and a.xpath('text()')[0] == u'显示地图'):  # "show map"
                weibo_a = div_first.xpath("span[@class='ctt']/a")
                if len(weibo_a) >= 1:
                    # The place link is normally the last <a> in the body...
                    publish_place = weibo_a[-1]
                    if (u'视频' == div_first.xpath(
                            "span[@class='ctt']/a/text()")[-1][-2:]):
                        # ...unless a video link trails it; then take the
                        # one before
                        if len(weibo_a) >= 2:
                            publish_place = weibo_a[-2]
                        else:
                            publish_place = ''
                    publish_place = utils.handle_garbled(publish_place)
                    break
        return publish_place
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
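# Markup sketch this targets (hypothetical, modeled on weibo.cn):
#
#     <span class="ctt">text <a href="...">#topic#</a>
#         <a href="https://place.weibo.com/...">北京·朝阳</a></span>
#     ... <a href="https://place.weibo.com/...">显示地图</a>
#
# i.e. a '显示地图' ("show map") anchor signals a location post, and the place
# name is the last (or second-to-last, if a video link trails) anchor inside
# the body span.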
def get_retweet(self, info, weibo_id, weibo: Weibo):
    """Parse a retweeted weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[weibo_content.find(':') +
                                      1:weibo_content.rfind(u'赞')]
        # Trim again: the first rfind removed the retweet's own footer,
        # this one removes the original post's footer
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        # Check whether the full weibo text is already present
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # Build a CommentParser to recover the long body
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_retweet(rev_type=dict)
            if wb_content:
                weibo_content = wb_content

        # Extract the repost reason
        if isinstance(weibo_content, dict):
            retweet_reason = weibo_content.get('retweet_reason')
            retweet_id = weibo_content.get('retweet_id')
            weibo_content = weibo_content.get('retweet')
        else:
            original_div = utils.handle_garbled(info.xpath('div')[-1])
            retweet_reason = original_div[original_div.find(':') +
                                          1:original_div.rindex(u'赞')]
            retweet_id = self.get_retweet_id(info)

        # Extract the original author
        original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
        original_user = ''.join(original_user_node.xpath("./text()"))
        original_user_id = original_user_node.get('href')
        if original_user_id is not None:
            original_user_id = original_user_id[
                original_user_id.rfind(r'/') + 1:]

        # Parse the original weibo's footer counts
        original_footer_div = info.xpath(r'./div')[-2]
        footer_nodes = original_footer_div.xpath(
            r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
        original_like_num = 0
        original_retweet_num = 0
        original_comment_num = 0
        for i, footer_node in enumerate(footer_nodes):
            num = ''.join(footer_node.xpath('./text()'))
            try:
                # Counts are embedded like '赞[13]'
                num = int(num[num.find('[') + 1:num.rfind(']')])
            except BaseException:
                pass
            if i == 0:
                original_like_num = num
            elif i == 1:
                original_retweet_num = num
            elif i == 2:
                original_comment_num = num

        # Extract topics and @-users for both layers
        original_div = info.xpath('./div')[0]
        retweet_div = info.xpath('./div')[-1]
        retweet_at_users, retweet_topics = PageParser.__get_atusers_and_topics(
            retweet_div)
        original_at_users, original_topics = PageParser.__get_atusers_and_topics(
            original_div)

        # Fill in the Weibo object: the retweeted (original) post...
        weibo.retweet['weibo_id'] = retweet_id
        weibo.retweet['user_id'] = original_user_id
        weibo.retweet['screen_name'] = original_user
        weibo.retweet['text'] = weibo_content
        weibo.retweet['topics'] = original_topics
        weibo.retweet['at_users'] = original_at_users
        weibo.retweet['attitudes_count'] = original_like_num
        weibo.retweet['comments_count'] = original_comment_num
        weibo.retweet['reposts_count'] = original_retweet_num
        # ...and the reposter's own layer
        weibo.topics = retweet_topics
        weibo.at_users = retweet_at_users
        weibo.text = retweet_reason
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
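# Shape sketch (hypothetical, inferred from the .get() calls above): when the
# long text has to be fetched, get_long_retweet(rev_type=dict) returns
#
#     {'retweet': <original post text>,
#      'retweet_reason': <reposter's comment>,
#      'retweet_id': <original weibo id>}
#
# so the dict branch and the plain-string fallback converge on the same three
# values before the Weibo object is filled in.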