def parse_tweet(self, response):
    page_url = response.url
    tweet_item = TweetsItem()
    tree_node = etree.HTML(response.body)
    tweet_content_node = tree_node.xpath('.//span[@class="ctt"]')[0]
    all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
    tweet_item['content'] = all_content
    tweet_item['crawl_time'] = int(time.time())
    user_tweet_id = re.search(r'https://weibo.cn/(\d+)/(.*)', page_url)
    tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
        user_tweet_id.group(1), user_tweet_id.group(2))
    tweet_item['user_id'] = user_tweet_id.group(1)
    tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                       user_tweet_id.group(1))
    create_time_info = tree_node.xpath(
        './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
    tweet_item['created_at'] = time_fix(
        create_time_info.split('来自')[0].strip())
    like_num = tree_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
    tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
    repost_num = tree_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
    tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
    comment_num = tree_node.xpath(
        './/span[@class="pms" and contains(text(),"评论[")]/text()')[0]
    tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
    yield tweet_item

    # Crawl this tweet's comments, starting from page 1.
    comment_url = page_url + '?page=1'
    yield Request(url=comment_url, callback=self.parse_comment,
                  meta={'weibo_url': page_url})
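# `time_fix` is called throughout this section but never defined here. Below is
# a minimal sketch of what such a helper plausibly does, assuming weibo.cn's
# usual timestamp formats ("X分钟前", "今天 HH:MM", "MM月DD日 HH:MM", or a full
# date); the project's real implementation may differ.
import datetime
import re

def time_fix(time_string):
    now = datetime.datetime.now()
    if '分钟前' in time_string:  # "N minutes ago"
        minutes = int(re.search(r'(\d+)分钟前', time_string).group(1))
        then = now - datetime.timedelta(minutes=minutes)
        return then.strftime('%Y-%m-%d %H:%M')
    if time_string.startswith('今天'):  # "today HH:MM"
        return now.strftime('%Y-%m-%d') + ' ' + time_string[2:].strip()
    match = re.match(r'(\d{2})月(\d{2})日 (\d{2}:\d{2})', time_string)
    if match:  # same-year dates omit the year on weibo.cn
        return '{}-{}-{} {}'.format(now.year, match.group(1),
                                    match.group(2), match.group(3))
    return time_string  # assume already a full 'YYYY-MM-DD HH:MM' string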
def parse_comment(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True,
                              meta=response.meta)
    tree_node = etree.HTML(response.body)
    comment_nodes = tree_node.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath(
            './/a[contains(@href,"/u/")]/@href')
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(
            r'/u/(\d+)', comment_user_url[0]).group(1)
        comment_item['content'] = extract_comment_content(
            etree.tostring(comment_node, encoding='unicode'))
        comment_item['_id'] = comment_node.xpath('./@id')[0]
        created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
        like_num = comment_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
        comment_item['like_num'] = int(re.search(r'\d+', like_num).group())
        comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
        yield comment_item
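# `extract_comment_content` is imported from elsewhere in the project. A
# minimal sketch, assuming it reduces the comment node's HTML to the visible
# text of its span.ctt element; the real helper likely also strips reply
# prefixes and embedded links.
from lxml import etree

def extract_comment_content(comment_html):
    node = etree.HTML(comment_html)
    content_nodes = node.xpath('//span[@class="ctt"]')
    if not content_nodes:
        return ''
    return content_nodes[0].xpath('string(.)').replace('\u200b', '').strip()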
def parse_comment(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True,
                              meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath(
            './/a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(
            r'/u/(\d+)', comment_user_url).group(1)
        comment_item['content'] = comment_node.xpath(
            './/span[@class="ctt"]').xpath('string(.)').extract_first()
        comment_item['_id'] = comment_node.xpath('./@id').extract_first()
        created_at = comment_node.xpath(
            './/span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        yield comment_item
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
            tweet_item['created_at'] = time_fix(
                create_time_info.split('来自')[0].strip())
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[0]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[0]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[0]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath(
                    'string(.)').strip('\u200b')
                tweet_item['content'] = all_content
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
def parse_rep(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_rep, dont_filter=True,
                              meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c"]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath(
            './/a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = RepostItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        uid = re.search(r'/u/(\d+)', comment_user_url).group(1)
        comment_item['rep_user_id'] = uid
        # Also fetch the reposting user's profile.
        yield Request(url="https://weibo.cn/%s/info" % uid,
                      callback=self.parse_rep_information)
        content = comment_node.xpath('./text()').extract_first()
        # at_list = comment_node.xpath('.//span[@class="ctt"]').xpath('a').xpath('string(.)').extract()
        # for a in at_list:
        #     content = content.replace(a, '@*' + a + '*@')
        comment_item['content'] = content
        # comment_item['_id'] = comment_node.xpath('./@id').extract_first()
        # e.g. created_at == '\xa012月20日 22:28\xa0来自红米Note7 4800万相机'
        created_at = comment_node.xpath(
            './/span[@class="ct"]/text()').extract_first()
        parts = created_at.split('\xa0') if created_at else []
        if len(parts) > 1:
            comment_item['created_at'] = time_fix(parts[1])
        if len(parts) > 2:  # the source device is optional
            comment_item['created_device'] = parts[2]
        yield comment_item
def parse_comment(self, response):
    # If this is page 1, schedule all of the remaining pages at once,
    # capped at MAX_COMMENT_PAGES.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            if all_page > self.MAX_COMMENT_PAGES:
                all_page = self.MAX_COMMENT_PAGES
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True,
                              meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath(
            './/a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(
            r'/u/(\d+)', comment_user_url).group(1)
        comment_item['content'] = comment_node.xpath(
            './/span[@class="ctt"]').xpath('string(.)').extract_first()
        created_at = comment_node.xpath(
            './/span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        # Sentiment score via SnowNLP, scaled to 0-10; default to neutral 5.0.
        try:
            s = SnowNLP(comment_item['content'])
            comment_item['sentiments'] = str(s.sentiments * 10)[0:8]
        except Exception:
            comment_item['sentiments'] = '5.0'
        try:
            sql = ("INSERT INTO `sbhdb`.`weibo_comment`(`comment_user_id`, "
                   "`content`, `weibo_url`, `created_at`, `crawl_time`, "
                   "`sentiments`) VALUES ('%s', '%s', '%s', '%s', %s, %s)" % (
                       comment_item['comment_user_id'],
                       comment_item['content'],
                       comment_item['weibo_url'],
                       comment_item['created_at'],
                       comment_item['crawl_time'],
                       comment_item['sentiments']))
            self.cursor.execute(sql)
            self.db.commit()
        except Exception:
            pass  # duplicate row
        yield comment_item
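# The INSERT above interpolates values with % string formatting, which breaks
# on quotes inside `content` and is open to SQL injection. A sketch of the
# safer parameterized form (PyMySQL/MySQLdb-style %s placeholders), assuming
# the same table layout; `cursor` and `db` stand in for the spider's existing
# connection objects.
def insert_comment(cursor, db, item):
    sql = ("INSERT INTO `sbhdb`.`weibo_comment`"
           "(`comment_user_id`, `content`, `weibo_url`, `created_at`, "
           "`crawl_time`, `sentiments`) VALUES (%s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, (item['comment_user_id'], item['content'],
                         item['weibo_url'], item['created_at'],
                         item['crawl_time'], item['sentiments']))
    db.commit()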
def parse_repost(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_repost, dont_filter=True,
                              meta=response.meta, priority=2)
    tree_node = etree.HTML(response.body)
    repost_nodes = tree_node.xpath(
        '//div[@class="c" and not(@id="M_") and .//span[contains(@class,"cc")]]')
    for repost_node in repost_nodes:
        repost_user_url = repost_node.xpath('.//a[contains(@href,"/")]/@href')
        if not repost_user_url:
            continue
        repost_item = RepostItem()
        repost_item['crawl_time_utc'] = dt.utcnow()
        repost_item['weibo_url'] = response.meta['weibo_url']
        # The profile link is either /u/<uid> or a /<nickname> vanity path.
        repost_item['repost_user_id'] = re.search(
            r'(/u/(\d+))|(/(\w+))', repost_user_url[0]).group(0)
        repost_item['content'] = repost_node.xpath('.//text()')[0]
        like_num = repost_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
        repost_item['like_num'] = int(re.search(r'\d+', like_num).group())
        created_at_info = repost_node.xpath('.//span[@class="ct"]/text()')[0]
        repost_item['created_at'] = time_fix(
            created_at_info.strip('\xa0').split('\xa0')[0])
        repost_item['_id'] = repost_item['repost_user_id'] + \
            repost_item['weibo_url']
        # Hot reposts are flagged with a span.kt node.
        hot_repost = repost_node.xpath('.//span[@class="kt"]/text()')
        repost_item['hot_repost'] = bool(hot_repost)
        yield repost_item
def parse_repost(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_repost, dont_filter=True,
                              meta=response.meta)
    selector = Selector(response)
    repost_nodes = selector.xpath('//div[@class="c"]')
    for repost_node in repost_nodes:
        repost_user_url = repost_node.xpath(
            './/a[contains(@href,"/u/")]/@href').extract_first()
        if not repost_user_url:
            continue
        repost_item = RepostItem()
        repost_item['crawl_time'] = int(time.time())
        repost_item['weibo_url'] = response.meta['weibo_url']
        repost_item['repost_user_id'] = re.search(
            r'/u/(\d+)', repost_user_url).group(1)
        # Likes on the repost itself.
        attitude_text = repost_node.xpath(
            './/span[@class="cc"]/a[contains(@href,"/attitude/")]//text()'
        ).extract_first()
        attitude = re.findall(r'赞\[(\d+)\]', str(attitude_text))
        if attitude:
            repost_item['attitude'] = int(attitude[0])
        # The reposter's comment text, if any.
        text1 = ";".join(repost_node.xpath('.//text()').extract())
        comment = re.findall(';:(.*?);', text1)
        if comment and comment[0]:
            repost_item["comment"] = comment[0].replace(u"\xa0", "")
        created_at = repost_node.xpath(
            './/span[@class="ct"]/text()').extract_first()
        if created_at is not None:
            repost_item['created_at'] = time_fix(
                created_at.split('\xa0')[1])  # repost time
            repost_item['device'] = created_at.split('\xa0')[2]  # source device
        repost_item['weibo_user_id'] = re.search(
            r'.*?com/(\d+)/', repost_item['weibo_url']).group(1)
        repost_item['_id'] = repost_item['repost_user_id'] + '-' + \
            repost_item['weibo_user_id']
        # Also fetch the reposting user's profile.
        yield Request(url="https://weibo.cn/{}/info".format(
            repost_item['repost_user_id']),
            callback=self.parse_information, priority=1)
        yield repost_item
def parse_like(self, response):
    # If this is page 1, schedule all of the remaining pages at once.
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_like, dont_filter=True,
                              meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c"]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath(
            './/a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = LikeItem()
        comment_item['crawl_time'] = int(time.time())
        comment_item['weibo_url'] = response.meta['weibo_url']
        uid = re.search(r'/u/(\d+)', comment_user_url).group(1)
        comment_item['like_user_id'] = uid
        # Also fetch the liking user's profile.
        yield Request(url="https://weibo.cn/%s/info" % uid,
                      callback=self.parse_like_information)
        created_at = comment_node.xpath(
            './/span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        yield comment_item
def parse(self, response):
    current_page = int(response.url.split("page=")[-1])
    print("[INFO] Crawling Tweets Page: " + str(current_page))
    print("[INFO] Crawling URL: " + response.url)
    selector = Selector(response)
    tweetpage_item = TimelinePageRaw()
    tweetpage_item['user_id'] = re.findall(r"(\d+)\?page", response.url)[0]
    tweetpage_item['page_url'] = re.sub("https://.*?/fireprox",
                                        self.weibo_baseurl, response.url)
    tweetpage_item['page_raw'] = selector.extract()  # raw page content
    tweetpage_item['crawl_time_utc'] = dt.utcnow()
    yield tweetpage_item

    time_stop_flag = 0  # stop crawling once we hit the specified start time
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    if len(tweet_nodes) < 1:
        # No tweets on this page: mark the timeline crawl job complete.
        update = ProfileUpdateItem()
        update["timelineCrawlJob_current_complete"] = True
        update["timelineCrawlJob_current_page"] = current_page
        update["timelineCrawlJob_run_history"] = tweetpage_item['crawl_time_utc']
        update["uid"] = tweetpage_item['user_id']
        yield update
        return
    for tweet_node in tweet_nodes:
        try:
            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            print("[INFO] create_time_info raw: " + create_time_info)
            if "来自" in create_time_info:
                created_at = time_fix(create_time_info.split('来自')[0].strip())
            else:
                created_at = time_fix(create_time_info.strip())
            # Compare against the configured start time to trigger the stop flag.
            time_stop_flag = self.time_flag_compare(created_at)
        except Exception as e:
            self.logger.error(e)
    # Keep paging until we hit the page at the time range limit.
    print("[DEBUG] timeflag: " + str(time_stop_flag))
    update = ProfileUpdateItem()
    if time_stop_flag == 0:
        next_page = current_page + 1
        page_url = self.get_base_url() + '/{}?page={}'.format(
            tweetpage_item['user_id'], next_page)
        update["timelineCrawlJob_current_page"] = current_page
        update["timelineCrawlJob_current_complete"] = False
        update["uid"] = tweetpage_item['user_id']
        yield update
        yield Request(page_url, self.parse, dont_filter=True,
                      meta=response.meta, priority=1)
    else:
        update["timelineCrawlJob_current_complete"] = True
        update["timelineCrawlJob_run_history"] = tweetpage_item['crawl_time_utc']
        update["timelineCrawlJob_current_page"] = current_page
        update["uid"] = tweetpage_item['user_id']
        yield update
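# `time_flag_compare` drives the stop condition in the timeline spiders above
# but is not defined in this section. A minimal sketch, assuming `created_at`
# strings look like '2020-01-01 12:00' and that the spider carries a
# `crawl_until` datetime as its configured start time; the real method may
# compare differently.
import datetime

def time_flag_compare(created_at, crawl_until):
    # Return 1 (stop paging) once tweets are older than the start time.
    created = datetime.datetime.strptime(created_at, '%Y-%m-%d %H:%M')
    return 1 if created < crawl_until else 0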
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath(
                    'string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info_node = tweet_node.xpath(
                './/span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images[0]
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos[0]
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            repost_node = tweet_node.xpath(
                './/a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
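# `extract_weibo_content` is imported from the project's utilities. A minimal
# sketch mirroring the comment helper above, assuming it reduces a tweet node's
# HTML to the plain text of its span.ctt content; the leading-colon strip is an
# assumption based on the `all_content[1:]` slicing used by other spiders in
# this section.
from lxml import etree

def extract_weibo_content(tweet_html):
    node = etree.HTML(tweet_html)
    content_nodes = node.xpath('//span[@class="ctt"]')
    if not content_nodes:
        return ''
    text = content_nodes[0].xpath('string(.)').replace('\u200b', '').strip()
    return text.lstrip(':')  # weibo.cn prefixes reposted content with ':'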
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            self.total_scrap_num += 1  # running count of scraped tweets
            tweet_item['dataset_id'] = self.dataset_id
            tweet_item['blogger_id'] = self.blogger_id
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            # Stop condition: crawl at most MAX_INTERVAL days back and at most
            # MAX_SCRAP_NUM tweets in total.
            time_now = datetime.datetime.now()
            created_time = datetime.datetime.strptime(
                tweet_item['created_at'], "%Y-%m-%d %H:%M")
            if (time_now - created_time).days > MAX_INTERVAL \
                    and self.total_scrap_num > MAX_SCRAP_NUM:
                # TODO delete this tweet record
                mongodb_operation.delete_twitter_rec(
                    weibo_url=tweet_item['weibo_url'],
                    dataset_id=self.dataset_id)
                return
            # TODO if the weibo already exists, delete its previous tweet
            # record and its comment records.
            mongodb_operation.delete_previous_twitter_rec(
                weibo_url=tweet_item['weibo_url'],
                current_dataset_id=self.dataset_id)
            mongodb_operation.delete_previous_comment_under_twitter(
                weibo_url=tweet_item['weibo_url'],
                current_dataset_id=self.dataset_id)
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath(
                    'string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
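# `mongodb_operation` is a project-local module whose delete helpers are only
# called above, never defined in this section. A minimal sketch of what they
# plausibly look like with pymongo; the client address, database and collection
# names, and the filter fields are all guesses inferred from the call sites.
import pymongo

_client = pymongo.MongoClient('localhost', 27017)
_db = _client['weibo']

def delete_twitter_rec(weibo_url, dataset_id):
    # Drop the tweet record that belongs to this dataset run.
    _db['tweet'].delete_many({'weibo_url': weibo_url,
                              'dataset_id': dataset_id})

def delete_previous_twitter_rec(weibo_url, current_dataset_id):
    # Drop tweet records for this weibo left over from earlier dataset runs.
    _db['tweet'].delete_many({'weibo_url': weibo_url,
                              'dataset_id': {'$ne': current_dataset_id}})

def delete_previous_comment_under_twitter(weibo_url, current_dataset_id):
    # Drop this weibo's comments left over from earlier dataset runs.
    _db['comment'].delete_many({'weibo_url': weibo_url,
                                'dataset_id': {'$ne': current_dataset_id}})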
def parse(self, response):
    """Parse the search results page."""
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Resolve the asker's user_id. The @-link here uses the nickname
            # as the URL; following the redirect returns the numeric id.
            # (This cannot simply be moved after the answerer's request,
            # because asker_name_url does not point at an /info page.)
            asker_name_urltxt = tweet_node.xpath(
                './/a[contains(text(),"@")]/text()')[0]
            asker_name_url = self.base_url + tweet_node.xpath(
                './/a[contains(text(),"@")]/@href')[0]
            tweet_item['asker_name'] = asker_name_urltxt.split('@')[-1]
            # Blocking request just to follow the nickname redirect; see
            # https://blog.csdn.net/rgc_520_zyl/article/details/78946974
            response_nickname = requests.get(asker_name_url)
            response_url = response_nickname.url
            if 'weibo.cn/u/' in response_url:
                nickname_id = response_url.split('weibo.cn/u/')[-1]
            else:
                nickname_id = response_url.split('uid=')[-1]
            # Fetch the asker's profile; the asker table's _id should be the
            # tweet's _id.
            yield Request(
                url="https://weibo.cn/{}/info".format(nickname_id),
                callback=self.parse_information, priority=3,
                meta={'asker_from': tweet_item['weibo_url']})
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath(
                    'string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content[1:]  # drop the leading ':'
                yield tweet_item
            # Fetch the answerer's profile.
            yield Request(url="https://weibo.cn/{}/info".format(
                tweet_item['user_id']),
                callback=self.parse_information, priority=2)
            # If the tweet has comments, crawl them too.
            if tweet_item['comment_num'] > 0:
                comment_url = self.base_url + '/comment/' + \
                    tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']},
                              priority=5)
        except Exception as e:
            self.logger.error(e)
def parse(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once.
        # The space after "/>" in the regex is the HTML &nbsp; placeholder.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page: every div with class "c" and an id attribute.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            # Repost and comment URLs.
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            tweet_cmt_url = tweet_node.xpath(
                './/a[contains(text(),"评论[")]/@href')[0]
            # The posting user's id.
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            # _id doubles as the weibo's id.
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            # Strip the trailing source part, e.g. 来自新浪微博 / 来自iPhone.
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath(
                    'string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content[1:]  # drop the leading ':'
                yield tweet_item
            # Crawl commenters' profiles and comment content.
            if tweet_item['comment_num'] > 0:
                yield Request(url=tweet_cmt_url, callback=self.parse_cmt_info,
                              meta={'weibo_id': tweet_item['_id']}, priority=3)
            # # Crawl the posting user's profile.
            # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
            #               callback=self.parse_information, priority=1)
            # TODO: crawl reposting users' info.
            # TODO: crawl reposter ids, comments and like counts; together with
            # the weibo id these can be matched up.
        except Exception as e:
            self.logger.error(e)
def parse_cmt_info(self, response):
    if 'page' not in response.url:
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url + '&page=' + str(page_num)
                # TODO: is meta=response.meta needed here?
                yield Request(page_url, self.parse_cmt_info,
                              meta=response.meta, priority=2)
    # Parse the comment page.
    cmt_tree_node = etree.HTML(response.body)
    cmt_nodes = cmt_tree_node.xpath('//div[@class="c" and @id]')
    weibo_id = response.meta['weibo_id']
    for cmt_node in cmt_nodes:
        try:
            cmt = CommentItem()
            cmt['weibo_id'] = weibo_id
            cmt['_id'] = cmt_node.xpath('./@id')[0]
            user_url = cmt_node.xpath('.//a/@href')[0]
            cmt['comment_user_id'] = re.search(r'\d+', user_url).group()
            create_time_info = cmt_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            # Strip the trailing source part, e.g. 来自新浪微博 / 来自iPhone.
            if "来自" in create_time_info:
                cmt['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                cmt['created_at'] = time_fix(create_time_info.strip())
            isreply = cmt_node.xpath('.//span[@class="ctt"]/text()')
            content_node = cmt_node.xpath('.//span[@class="ctt"]')[0]
            all_content = content_node.xpath('string(.)').replace(
                '\u200b', '').strip()
            # TODO: handle replies, e.g.
            # if '回复' in isreply[0]:
            #     cmt['reply_id'] = content_node.xpath('.//a/@href')[0]
            cmt['content'] = all_content
            like_num = cmt_node.xpath(
                './/span[@class="cc"]/a[contains(text(),"赞[")]/text()')[0]
            cmt['like_num'] = int(re.search(r'\d+', like_num).group())
            cmt['crawl_time'] = int(time.time())
            # Fetch the commenting user's profile.
            yield Request(url="https://weibo.cn/{}/info".format(
                cmt['comment_user_id']),
                callback=self.parse_info, priority=3,
                meta={'cmt_item': cmt})
        except Exception as e:
            print('error happened at comment parsing')
            self.logger.error(e)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, schedule all of the remaining pages at once,
        # capped at MAX_WEIBO_PAGES.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            if all_page > self.MAX_WEIBO_PAGES:
                all_page = self.MAX_WEIBO_PAGES
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1',
                                                'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True,
                              meta=response.meta)
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            create_time_info = tweet_node.xpath(
                './/span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            # Skip tweets older than the minimum date.
            if tweet_item['created_at'] < self.MIN_WEIBO_DATE:
                continue
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content_text = tweet_node.xpath('string(.)')
                if '转发理由:' in all_content_text:
                    all_content_text = all_content_text.split('转发理由:')[1]
                all_content_text = all_content_text.split('\xa0',
                                                          maxsplit=1)[0]
                tweet_item['content'] = all_content_text.strip()
                # Sentiment score via SnowNLP, scaled to 0-10; default to
                # neutral 5.0.
                try:
                    s = SnowNLP(tweet_item['content'])
                    tweet_item['sentiments'] = str(s.sentiments * 10)[0:8]
                except Exception:
                    tweet_item['sentiments'] = '5.0'
                try:
                    sql = ("INSERT INTO `sbhdb`.`weibo_info`(`weibo_url`, "
                           "`user_id`, `content`, `created_at`, `repost_num`, "
                           "`comment_num`, `like_num`, `crawl_time`, "
                           "`sentiments`) VALUES ('%s', '%s', '%s', '%s', %s, "
                           "%s, %s, %s, %s)" % (
                               tweet_item['weibo_url'], tweet_item['user_id'],
                               tweet_item['content'],
                               tweet_item['created_at'],
                               tweet_item['repost_num'],
                               tweet_item['comment_num'],
                               tweet_item['like_num'],
                               tweet_item['crawl_time'],
                               tweet_item['sentiments']))
                    self.cursor.execute(sql)
                    self.db.commit()
                except Exception:
                    continue  # duplicate row
                yield tweet_item
                # Crawl this tweet's comments.
                comment_url = self.base_url + '/comment/' + \
                    tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, record the total page count.
        self.current_page = 1
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            self.all_page_num = int(all_page.group(1))
    print("[INFO] Crawling Tweets Page: " + str(self.current_page))
    # Parse the tweets on this page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time_utc'] = dt.utcnow()  # UTC timestamp
            tweet_repost_url = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)',
                                      tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                               user_tweet_id.group(1))
            create_time_info_node = tweet_node.xpath(
                './/span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(
                    create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
                tweet_item['tool'] = ""
            # Compare against the configured start time to trigger the stop flag.
            self.time_stop_flag = self.time_flag_compare(
                tweet_item['created_at'])
            like_num = tweet_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath(
                './/a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(
                re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(
                re.search(r'\d+', comment_num).group())
            # Grab all images: check for a multi-image ("组图") link first,
            # otherwise fall back to the single inline image.
            multi_img_link = tweet_node.xpath(
                './/a[contains(text(),"组图")]/@href')
            if multi_img_link:
                tweet_item['multi_imgs'] = True
                yield Request(url=multi_img_link[-1],
                              callback=self.parse_multi_images,
                              meta={'_id': tweet_item['_id']}, priority=1)
            else:
                tweet_item['multi_imgs'] = False
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            tweet_item['image_url'] = images[0] if images else "NA"
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            tweet_item['video_url'] = videos[0] if videos else "NA"
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            else:
                tweet_item['location_map_info'] = "NA"
            repost_node = tweet_node.xpath(
                './/a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['retweet'] = True
                tweet_item['origin_weibo'] = repost_node[0]
                # crawl original weibo
                # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=repost_url, callback=self.parse_repost,
                #               meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
            else:
                tweet_item['retweet'] = False
                tweet_item['origin_weibo'] = "NA"
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath(
                './/a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath(
                    './@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']},
                          priority=2)
            # Crawl this tweet's reposts.
            repost_url = self.base_url + '/repost/' + \
                tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=repost_url, callback=self.parse_repost,
                          meta={'weibo_url': tweet_item['weibo_url']},
                          priority=2)
        except Exception as e:
            self.logger.error(e)
    # Keep paging until we hit the page at the time range limit.
    self.current_page = self.current_page + 1
    if self.time_stop_flag == 0 and \
            2 <= self.current_page <= self.all_page_num:
        page_url = response.url.replace(
            'page=' + str(self.current_page - 1),
            'page={}'.format(self.current_page))
        yield Request(page_url, self.parse_tweet, dont_filter=True,
                      meta=response.meta, priority=1)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, record the total page count.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            self.all_page_num = int(all_page.group(1))
    current_page = int(response.url.split("page=")[-1])
    print("[INFO] Crawling Tweets Page: " + str(current_page))
    print("[INFO] Crawling URL: " + response.url)
    # Store the raw page alongside the crawl metadata.
    selector = Selector(response)
    tweetpage_item = TweetPageItem()
    tweetpage_item['user_id'] = response.meta["user_id"]
    tweetpage_item['page_url'] = response.url.replace(self.base_url,
                                                      self.weibo_baseurl)
    tweetpage_item['page_raw'] = selector.extract()  # raw page content
    tweetpage_item['crawl_time_utc'] = dt.utcnow()
    yield tweetpage_item

    time_stop_flag = 0  # stop crawling once we hit the specified start time
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    if len(tweet_nodes) < 1:
        return
    for tweet_node in tweet_nodes:
        try:
            create_time_info_node = tweet_node.xpath(
                './/span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            print("[INFO] create_time_info raw: " + create_time_info)
            if "来自" in create_time_info:
                created_at = time_fix(create_time_info.split('来自')[0].strip())
            else:
                created_at = time_fix(create_time_info.strip())
            # Compare against the configured start time to trigger the stop flag.
            time_stop_flag = self.time_flag_compare(created_at)
            # Check whether the tweet is truncated with a "全文" link:
            # all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            # if all_content_link:
            #     all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
            #     yield Request(all_content_url, callback=self.parse_all_content,
            #                   meta={'user_id': response.meta["user_id"]}, priority=1)
            # Crawl this tweet's comments:
            # comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            # yield Request(url=comment_url, callback=self.parse_comment,
            #               meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
            # Crawl this tweet's reposts:
            # repost_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            # yield Request(url=repost_url, callback=self.parse_repost,
            #               meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
        except Exception as e:
            self.logger.error(e)
    # Keep paging until we hit the page at the time range limit.
    print("[DEBUG] timeflag: " + str(time_stop_flag))
    if time_stop_flag == 0:
        next_page = current_page + 1
        page_url = response.url.replace('page=' + str(current_page),
                                        'page={}'.format(next_page))
        yield Request(page_url, self.parse_tweet, dont_filter=True,
                      meta=response.meta, priority=1)