def parse(self, response):
    # If this is page 1, schedule requests for all remaining pages in one pass
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
    tree_node = etree.HTML(response.body)
    comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href')
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_id'] = response.url.split('/')[-1].split('?')[0]
        comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url[0]).group(1)
        comment_item['content'] = extract_comment_content(etree.tostring(comment_node, encoding='unicode'))
        comment_item['_id'] = comment_node.xpath('./@id')[0]
        created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
        like_num = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
        comment_item['like_num'] = int(re.search(r'\d+', like_num).group())
        comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
        yield comment_item
def parse_comment(self, response):
    # If this is page 1, schedule requests for all remaining pages in one pass
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url).group(1)
        comment_item['content'] = comment_node.xpath('.//span[@class="ctt"]').xpath('string(.)').extract_first()
        comment_item['_id'] = comment_node.xpath('./@id').extract_first()
        created_at = comment_node.xpath('.//span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        yield comment_item
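# `time_fix` is imported from the project's utils and not shown in this section. Below is a
# minimal sketch (an assumption, not the project's implementation) of what such a helper
# might do: normalize weibo.cn's relative timestamps ("刚刚", "N分钟前", "今天 HH:MM") into
# absolute "YYYY-MM-DD HH:MM" strings.
import re
import datetime


def time_fix(time_string):
    # Hypothetical sketch only; the real helper may handle more formats.
    now = datetime.datetime.now()
    if time_string == '刚刚':
        return now.strftime('%Y-%m-%d %H:%M')
    if '分钟前' in time_string:
        minutes = int(re.search(r'(\d+)分钟前', time_string).group(1))
        return (now - datetime.timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if '今天' in time_string:
        return now.strftime('%Y-%m-%d') + ' ' + time_string.replace('今天', '').strip()
    # Otherwise assume the page already shows an absolute date, e.g. "01月02日 12:30".
    return time_string.strip()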
def parse(self, response):
    # If this is page 1, schedule requests for all remaining pages (capped at 50)
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            all_page = all_page if all_page <= 50 else 50
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
    tree_node = etree.HTML(response.body)
    repo_nodes = tree_node.xpath('//div[@class="c" and not(contains(@id,"M_"))]')
    for repo_node in repo_nodes:
        repo_user_url = repo_node.xpath('.//a[contains(@href,"/u/")]/@href')
        if not repo_user_url:
            continue
        repo_item = RepostItem()
        # repo_item['_id'] = ''
        repo_item['crawl_time'] = int(time.time())
        repo_item['weibo_id'] = response.url.split('/')[-1].split('?')[0]
        repo_item['user_id'] = re.search(r'/u/(\d+)', repo_user_url[0]).group(1)
        content = extract_repost_content(etree.tostring(repo_node, encoding='unicode'))
        repo_item['content'] = content.split(':', maxsplit=1)[1]
        created_at_info = repo_node.xpath('.//span[@class="ct"]/text()')[0].split('\xa0')
        repo_item['created_at'] = time_fix(created_at_info[0] + created_at_info[1])
        yield repo_item
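# extract_repost_content (and its siblings extract_weibo_content / extract_comment_content)
# are imported from the project's utils and not shown here. A minimal sketch, assuming they
# reduce the serialized node HTML to its visible text with lxml; the real helpers likely do
# more cleanup (emoji alt text, "赞[x] 转发[x]" toolbars, etc.).
from lxml import etree


def extract_repost_content(html):
    # Hypothetical sketch only.
    node = etree.HTML(html)
    text = node.xpath('string(.)')
    # Drop the trailing "赞[x] 转发[x] ..." toolbar and collapse whitespace.
    text = text.split('赞[')[0]
    return ' '.join(text.split()).replace('\u200b', '')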
def parse_tweet(self, response):
    """Parse a user's tweet list pages."""
    if response.url.endswith('page=1'):
        # If this is page 1, schedule requests for all remaining pages in one pass
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                print(page_url)
                yield scrapy.Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on this page
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.cn/{}/{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath('.//a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images[0]
            videos = tweet_node.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos[0]
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node = map_node[0]
                map_node_url = map_node.xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]
            # Check whether there is a "read full text" link
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield scrapy.Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # # Crawl the comments of this tweet
            # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            # yield scrapy.Request(url=comment_url,
            #                      callback=self.parse_comment,
            #                      meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
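# parse_all_content, used as the callback above, is defined elsewhere in the project. A
# minimal sketch under the assumption that the ckAll=1 page renders the untruncated tweet
# in a div whose id starts with "M_" (the selector is an assumption, not taken from the
# project).
def parse_all_content(self, response):
    # Hypothetical sketch only: fill in the full text of a tweet that was truncated
    # behind a "全文" link, then emit the item carried in request.meta.
    tweet_item = response.meta['item']
    tree_node = etree.HTML(response.body)
    content_node = tree_node.xpath('//div[starts-with(@id,"M_")]')[0]
    tweet_html = etree.tostring(content_node, encoding='unicode')
    tweet_item['content'] = extract_weibo_content(tweet_html)
    yield tweet_item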
def parse(self, response):
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
            # For the search endpoint, a single day that already hits the 100-page cap is
            # split further, by hour
            if 'search/mblog' in response.url and all_page == 100 and '-' not in response.url:
                start_time_string = re.search(r'starttime=(\d+)&', unquote(response.url, "utf-8")).group(1)
                keyword = re.search(r'keyword=(.*?)&', unquote(response.url, "utf-8")).group(1)
                self.logger.info(
                    f'split by hour,{start_time_string},{keyword}, {unquote(response.url, "utf-8")}'
                )
                date_start = datetime.datetime.strptime(start_time_string, "%Y%m%d")
                time_spread = datetime.timedelta(days=1)
                url_format_by_hour = "https://weibo.cn/search/mblog?hideSearchFrame=&keyword={}&advancedfilter=1&starttime={}&endtime={}&sort=time&atten=1&page=1"
                one_day_back = date_start - time_spread
                # from today's 7:00-8:00am slice to the 23:00-24:00 slice
                for hour in range(7, 24):
                    # calculation rule of starting time: start_date 8:00am + offset:16
                    begin_hour = one_day_back.strftime("%Y%m%d") + "-" + str(hour + 16)
                    # calculation rule of ending time: (end_date+1) 8:00am + offset:-7
                    end_hour = one_day_back.strftime("%Y%m%d") + "-" + str(hour - 7)
                    page_url = url_format_by_hour.format(keyword, begin_hour, end_hour)
                    yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
                two_day_back = one_day_back - time_spread
                # from today's 0:00-1:00am slice to the 6:00-7:00am slice
                for hour in range(0, 7):
                    # note the offset change because we are two days back now
                    begin_hour = two_day_back.strftime("%Y%m%d") + "-" + str(hour + 40)
                    end_hour = two_day_back.strftime("%Y%m%d") + "-" + str(hour + 17)
                    page_url = url_format_by_hour.format(keyword, begin_hour, end_hour)
                    yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = user_tweet_id.group(1)
            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath('.//a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images
            videos = tweet_node.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node = map_node[0]
                map_node_url = map_node.xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
        except Exception as e:
            self.logger.error(e)
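# Worked example of the hour-slice parameters built in parse() above. The values follow
# directly from the arithmetic in the two loops; the concrete date is only an illustration,
# not taken from the project.
import datetime

date_start = datetime.datetime.strptime("20200102", "%Y%m%d")
one_day_back = date_start - datetime.timedelta(days=1)    # 2020-01-01
two_day_back = one_day_back - datetime.timedelta(days=1)  # 2019-12-31

# First loop, hour = 7 (the 7:00-8:00 slice of 2020-01-02 under the offset rules noted above):
begin_hour = one_day_back.strftime("%Y%m%d") + "-" + str(7 + 16)  # "20200101-23"
end_hour = one_day_back.strftime("%Y%m%d") + "-" + str(7 - 7)     # "20200101-0"

# Second loop, hour = 0 (the 0:00-1:00 slice of 2020-01-02):
begin_hour = two_day_back.strftime("%Y%m%d") + "-" + str(0 + 40)  # "20191231-40"
end_hour = two_day_back.strftime("%Y%m%d") + "-" + str(0 + 17)    # "20191231-17"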
def parse(self, response):
    tweet_item = TweetItem()
    tweet_item['crawl_time'] = int(time.time())
    selector = Selector(response)
    # todo: complete the crawling of a single weibo page; see tweet.py and fan.py for reference
    # Page layout reference: https://weibo.cn/comment/JltR1rvSK
    '''
    tweet_item['crawl_time'] = ''
    tweet_repost_url = ''
    user_tweet_id = ''
    tweet_item['weibo_url'] = ''
    tweet_item['user_id'] = ''
    tweet_item['_id'] = ''
    tweet_item['created_at'] = ''
    tweet_item['tool'] = ''
    tweet_item['created_at'] = ''
    tweet_item['like_num'] = ''
    tweet_item['repost_num'] = ''
    tweet_item['comment_num'] = ''
    tweet_item['image_url'] = ''
    tweet_item['video_url'] = ''
    '''
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)

    tweet_item = TweetItem()
    tweet_item['crawl_time'] = int(time.time())
    tweet_repost_url = selector.xpath('.//a[contains(text(),"转发[")]/@href').extract_first()
    user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
    tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
    tweet_item['user_id'] = user_tweet_id.group(2)
    tweet_item['_id'] = user_tweet_id.group(1)
    create_time_info_node = selector.xpath('.//span[@class="ct"]')[-1]
    create_time_info = create_time_info_node.xpath('string(.)').extract_first()
    if "来自" in create_time_info:
        tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
        tweet_item['tool'] = create_time_info.split('来自')[1].strip()
    else:
        tweet_item['created_at'] = time_fix(create_time_info.strip())
    like_num = selector.xpath('.//a[contains(text(),"赞[")]/text()').extract()[-1]
    tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
    repost_num = selector.xpath('.//a[contains(text(),"转发[")]/text()').extract()[-1]
    tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
    comment_num = selector.xpath('.//a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()').extract()[-1]
    tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
    images = selector.xpath('.//img[@alt="图片"]/@src').extract()
    if images:
        tweet_item['image_url'] = images
    videos = selector.xpath('.//a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href').extract()
    if videos:
        tweet_item['video_url'] = videos
    map_node = selector.xpath('.//a[contains(text(),"显示地图")]')
    if map_node:
        map_node = map_node[0]
        map_node_url = map_node.xpath('./@href').extract_first()
        map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
        tweet_item['location_map_info'] = map_info
    repost_node = selector.xpath('.//a[contains(text(),"原文评论[")]/@href').extract()
    if repost_node:
        tweet_item['origin_weibo'] = repost_node[0]
    all_content_link = selector.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
    if all_content_link:
        all_content_url = self.base_url + all_content_link[0].xpath('./@href').extract_first()
        yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item}, priority=1)
    else:
        # When the content is processed further in another callback, carry the data collected
        # so far along in request.meta['item']
        request_meta = response.meta
        request_meta['item'] = tweet_item
        yield tweet_item
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule requests for all remaining pages in one pass
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on this page
    all_page = re.search(r'/> (\d+)/(\d+)页</div>', response.text)
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info = tweet_node.xpath('.//span[@class="ct" and contains(text(),"来自")]/text()')[0]
            tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath('.//a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            tweet_repost_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            tweet_original_node = tweet_node.xpath('.//span[@class="cmt"]')
            # Check whether there is a "read full text" link
            all_content_link = tweet_repost_node.xpath('.//a[text()="全文"]')
            # For an original tweet there is only a ctt span.
            # For a repost, ctt holds the reposted tweet's content and cmt holds the repost remark.
            if tweet_original_node:
                repost_content = tweet_original_node[0].xpath('string(.)').strip().replace(u"\xa0", " ") + \
                    tweet_repost_node.xpath('string(.)').replace('\u200b', '').replace('\u2028', '').strip()
                content = re.findall(
                    '(.*?)//',
                    re.findall(r'转发理由:(.*?)赞', tweet_node.xpath('string(.)'))[0].replace('\u2028', '').strip())
                if content:
                    original_content = "转发理由:" + content[0]
                else:
                    original_content = "转发理由:" + \
                        re.findall(r'转发理由:(.*?)赞', tweet_node.xpath('string(.)'))[0] \
                        .replace('\u2028', '').strip()
                tweet_item['content'] = repost_content + "+" + original_content
                yield tweet_item
            else:
                if all_content_link:
                    all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                    yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item}, priority=1)
                else:
                    all_content = tweet_repost_node.xpath('string(.)').replace('\u200b', '') \
                        .replace('\u2028', '').strip()
                    tweet_item['content'] = all_content
                    yield tweet_item

            # tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether there is a "read full text" link:
            # all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            # if all_content_link:
            #     all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
            #     yield Request(all_content_url, callback=self.parse_all_content, meta={'item': tweet_item},
            #                   priority=1)
            # else:
            #     all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
            #     tweet_item['content'] = all_content
            #     yield tweet_item
            # print(tweet_item)

            # Crawl the comments of this tweet
            # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            # yield Request(url=comment_url, callback=self.parse_comment,
            #               meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)

    # Once the last page has been parsed, pull the next user id to crawl from MongoDB
    if all_page:
        if response.url.endswith(all_page.group(2)):
            while True:
                user = self.collection.find_one({'tweet_flag': 'false'})
                if user is not None:
                    uid = user['_id']
                    yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information)
                    self.collection.update_one({'_id': uid}, {'$set': {'tweet_flag': 'true'}})
                    break
                else:
                    print("No uncrawled ids available at the moment")
                    time.sleep(10)
    else:
        while True:
            user = self.collection.find_one({'tweet_flag': 'false'})
            if user is not None:
                uid = user['_id']
                yield Request(url="https://weibo.cn/%s/info" % uid, callback=self.parse_information)
                self.collection.update_one({'_id': uid}, {'$set': {'tweet_flag': 'true'}})
                break
            else:
                print("No uncrawled ids available at the moment")
                time.sleep(10)
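# parse_tweet above pulls the next user to crawl from self.collection, a MongoDB collection
# of user documents whose tweet_flag field records whether the user has already been
# scheduled. A minimal setup sketch; the connection string, db/collection names and the
# seeded user id are assumptions, not taken from the project.
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')  # assumed connection string
collection = client['weibo']['user']                        # assumed db/collection names

# Seed a user id so the spider has something to pick up; the spider flips tweet_flag to
# 'true' via update_one() once that user's info page has been scheduled.
collection.update_one({'_id': '1234567890'},
                      {'$set': {'tweet_flag': 'false'}},
                      upsert=True)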