def parse_all_content(self, response):
    """Parse the stand-alone full-text page of a truncated tweet."""
    tree_node = etree.HTML(response.body)
    tweet_item = response.meta['item']
    content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
    tweet_html = etree.tostring(content_node, encoding='unicode')
    tweet_item['content'] = extract_weibo_content(tweet_html)
    yield tweet_item
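# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what the extract_weibo_content helper
# used above might do. This is an illustrative assumption, not the project's
# actual implementation (which lives elsewhere and likely handles more edge
# cases, such as emoji images and inline links).
def extract_weibo_content_sketch(tweet_html):
    import re
    from lxml import etree
    # Flatten the tweet's HTML into plain text.
    text = etree.HTML(tweet_html).xpath('string(.)')
    # Drop trailing interaction counters such as 赞[12] 转发[3] 评论[4].
    text = re.sub(r'(赞|转发|评论|收藏)\[\d*\]', '', text)
    return text.strip()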
def parse_tweet(self, response):
    """Parse a user's tweet list page."""
    if response.url.endswith('page=1'):
        # On page 1, discover the total page count and queue all later pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield scrapy.Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)

    # Extract the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.cn/{}/{}'.format(user_tweet_id.group(2),
                                                                      user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))

            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())

            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())

            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images[0]

            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos[0]

            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info

            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]

            # Check for a "全文" (full text) link: truncated tweets need an extra request.
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield scrapy.Request(all_content_url,
                                     callback=self.parse_all_content,
                                     meta={'item': tweet_item},
                                     priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item

            # # Crawl this tweet's comments:
            # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            # yield scrapy.Request(url=comment_url,
            #                      callback=self.parse_comment,
            #                      meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
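# ---------------------------------------------------------------------------
# A hedged sketch of the time_fix helper referenced above, assuming its job is
# to normalize weibo.cn's relative timestamps into absolute ones. The real
# helper is defined elsewhere in the project and may cover more formats.
def time_fix_sketch(time_string):
    import datetime
    import re
    now = datetime.datetime.now()
    m = re.match(r'(\d+)分钟前', time_string)
    if m:  # e.g. "5分钟前" -> now minus 5 minutes
        fixed = now - datetime.timedelta(minutes=int(m.group(1)))
        return fixed.strftime('%Y-%m-%d %H:%M')
    if time_string.startswith('今天'):  # e.g. "今天 12:30" -> today's date + time
        return now.strftime('%Y-%m-%d') + ' ' + time_string[2:].strip()
    return time_string  # already absolute, e.g. "2019-05-01 08:00:00"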
def parse(self, response):
    if response.url.endswith('page=1'):
        # On page 1, discover the total page count and queue all later pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)

        # For the search endpoint, a day-granularity query capped at 100 pages means
        # results were truncated, so split that day further into hour-granularity queries.
        if 'search/mblog' in response.url and all_page == 100 and '-' not in response.url:
            start_time_string = re.search(r'starttime=(\d+)&',
                                          unquote(response.url, "utf-8")).group(1)
            keyword = re.search(r'keyword=(.*?)&', unquote(response.url, "utf-8")).group(1)
            self.logger.info(
                f'split by hour, {start_time_string}, {keyword}, {unquote(response.url, "utf-8")}')
            date_start = datetime.datetime.strptime(start_time_string, "%Y%m%d")
            time_spread = datetime.timedelta(days=1)
            url_format_by_hour = "https://weibo.cn/search/mblog?hideSearchFrame=&keyword={}&advancedfilter=1&starttime={}&endtime={}&sort=time&atten=1&page=1"

            one_day_back = date_start - time_spread
            # From today's 7:00-8:00 through 23:00-24:00.
            for hour in range(7, 24):
                # Start-time rule: start date 8:00 plus an offset of 16.
                begin_hour = one_day_back.strftime("%Y%m%d") + "-" + str(hour + 16)
                # End-time rule: (end date + 1 day) 8:00 plus an offset of -7.
                end_hour = one_day_back.strftime("%Y%m%d") + "-" + str(hour - 7)
                page_url = url_format_by_hour.format(keyword, begin_hour, end_hour)
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)

            two_day_back = one_day_back - time_spread
            # From today's 0:00-1:00 through 6:00-7:00; the offsets change because the
            # base date is now two days back.
            for hour in range(0, 7):
                begin_hour = two_day_back.strftime("%Y%m%d") + "-" + str(hour + 40)
                end_hour = two_day_back.strftime("%Y%m%d") + "-" + str(hour + 17)
                page_url = url_format_by_hour.format(keyword, begin_hour, end_hour)
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)

    # Extract the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                                       user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = user_tweet_id.group(1)

            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())

            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())

            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images

            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos

            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info

            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]

            # Check for a "全文" (full text) link: truncated tweets need an extra request.
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url,
                              callback=self.parse_all_content,
                              meta={'item': tweet_item},
                              priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
        except Exception as e:
            self.logger.error(e)