def parse_all_content(self, response):
    """Callback for a tweet's dedicated full-text page (the "阅读全文" case).

    Replaces the truncated content on the pending item (passed via
    ``response.meta['item']``) with the complete text, then emits the item.

    NOTE(review): a later definition of the same name appears further down
    in this file and shadows this one — confirm which revision is intended.
    """
    pending_item = response.meta['item']
    page_tree = etree.HTML(response.body)
    # Full tweet body lives in the first div under the element with id "M_".
    full_text_node = page_tree.xpath('//*[@id="M_"]/div[1]')[0]
    raw_html = etree.tostring(full_text_node, encoding='unicode')
    pending_item['content'] = extract_weibo_content(raw_html)
    yield pending_item
def parse_all_content(self, response):
    """Callback for a tweet's dedicated full-text page (the "阅读全文" case).

    Fetches the complete content for a truncated tweet, optionally refines
    the tweet's location, and re-emits the item passed in
    ``response.meta['item']``.
    """
    tree_node = etree.HTML(response.body)
    tweet_item = response.meta['item']
    # Full tweet body lives in the first div under the element with id "M_".
    content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
    tweet_html = etree.tostring(content_node, encoding='unicode')
    tweet_item['content'] = extract_weibo_content(tweet_html)
    if 'location' in tweet_item:
        # Guard against pages where the location anchor is absent: the
        # previously unguarded [0] raised IndexError and dropped the item.
        location_texts = content_node.xpath(
            './/span[@class="ctt"]/a[last()]/text()')
        if location_texts:
            tweet_item['location'] = location_texts[0]
    yield tweet_item
def parse_tweet(self, response):
    """Parse one list page of a user's tweets.

    Emits a ``TweetsItem`` per tweet plus follow-up requests: on page 1,
    requests for every remaining page; for truncated tweets, a full-text
    request handled by ``parse_all_content``; for every tweet, a comment
    page request handled by ``parse_comment``.

    NOTE(review): a later definition of the same name appears further down
    in this file and shadows this one — confirm which revision is intended.
    """
    if response.url.endswith('page=1'):
        # On page 1, read the total page count and enqueue pages 2..N at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            # The repost link encodes both the tweet id and the author uid.
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            # The last span.ct holds "<created time> 来自 <client tool>".
            create_time_info_node = tweet_node.xpath(
                './/span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            # Counts are embedded as "赞[N]" / "转发[N]" / "评论[N]".
            # Raw strings fix the previous invalid '\d' escape sequences.
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
            )[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images[0]
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
            )
            if videos:
                tweet_item['video_url'] = videos[0]
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node = map_node[0]
                map_node_url = map_node.xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            repost_node = tweet_node.xpath(
                './/a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]
            # Truncated tweets carry a "全文" (full text) link; fetch that page
            # instead of emitting the item directly.
            all_content_link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url,
                              callback=self.parse_all_content,
                              meta={'item': tweet_item},
                              priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item[
                'weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url,
                          callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            # Best-effort per tweet: log and continue with the next node.
            self.logger.error(e)
def parse_tweet(self, response):
    """Parse one list page of a user's tweets, paging sequentially.

    Emits a ``TweetsItem`` per tweet plus follow-up requests: multi-image
    pages (``parse_multi_images``), full-text pages (``parse_all_content``),
    comment pages (``parse_comment``) and repost pages (``parse_repost``).
    Keeps requesting the next list page until ``self.time_stop_flag`` is set
    by ``time_flag_compare`` or the last page is reached.

    NOTE(review): paging state (``current_page``/``all_page_num``/
    ``time_stop_flag``) lives on the spider instance, which assumes pages are
    processed strictly in order — confirm against the spider's concurrency
    settings.
    """
    if response.url.endswith('page=1'):
        # On page 1, record the total page count for the sequential pager below.
        self.current_page = 1
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            self.all_page_num = all_page
    # Use the spider logger (as the except-branch already does) over print.
    self.logger.info("[INFO] Crawling Tweets Page: " + str(self.current_page))
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            # UTC timestamp of when this tweet was crawled.
            tweet_item['crawl_time_utc'] = dt.utcnow()
            # The repost link encodes both the tweet id and the author uid.
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            # The last span.ct holds "<created time> 来自 <client tool>".
            create_time_info_node = tweet_node.xpath(
                './/span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
                # Compare against the configured time range; may set the stop flag.
                self.time_stop_flag = self.time_flag_compare(
                    tweet_item['created_at'])
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
                self.time_stop_flag = self.time_flag_compare(
                    tweet_item['created_at'])
                tweet_item['tool'] = ""
            # Counts are embedded as "赞[N]" / "转发[N]" / "评论[N]".
            # Raw strings fix the previous invalid '\d' escape sequences.
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()'
            )[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            # Grab all images: follow the "组图" (multi-image gallery) link when
            # present; otherwise fall back to the single inline image.
            multi_img_link = tweet_node.xpath(
                './/a[contains(text(),"组图")]/@href')
            if multi_img_link:
                tweet_item['multi_imgs'] = True
                yield Request(url=multi_img_link[-1],
                              callback=self.parse_multi_images,
                              meta={'_id': tweet_item['_id']},
                              priority=1)
            else:
                tweet_item['multi_imgs'] = False
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            tweet_item['image_url'] = images[0] if images else "NA"
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href'
            )
            tweet_item['video_url'] = videos[0] if videos else "NA"
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node = map_node[0]
                map_node_url = map_node.xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            else:
                tweet_item['location_map_info'] = "NA"
            repost_node = tweet_node.xpath(
                './/a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['retweet'] = True
                tweet_item['origin_weibo'] = repost_node[0]
            else:
                tweet_item['retweet'] = False
                tweet_item['origin_weibo'] = "NA"
            # Truncated tweets carry a "全文" (full text) link; fetch that page
            # instead of emitting the item directly.
            all_content_link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url,
                              callback=self.parse_all_content,
                              meta={'item': tweet_item},
                              priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item[
                'weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url,
                          callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']},
                          priority=2)
            # Crawl this tweet's reposts.
            repost_url = self.base_url + '/repost/' + tweet_item[
                'weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=repost_url,
                          callback=self.parse_repost,
                          meta={'weibo_url': tweet_item['weibo_url']},
                          priority=2)
        except Exception as e:
            # Best-effort per tweet: log and continue with the next node.
            self.logger.error(e)
    # Keep requesting the next page until a tweet outside the configured time
    # range sets the stop flag or the last known page is reached.
    self.current_page = self.current_page + 1
    if self.time_stop_flag == 0 and self.current_page < (
            self.all_page_num + 1) and self.current_page >= 2:
        next_page = self.current_page
        current_page_str = "page=" + str(next_page - 1)
        page_url = response.url.replace(current_page_str,
                                        'page={}'.format(next_page))
        yield Request(page_url,
                      self.parse_tweet,
                      dont_filter=True,
                      meta=response.meta,
                      priority=1)