def parse_tweet(self, response):
    """Parse a single tweet detail page."""
    page_url = response.url
    tweet_item = TweetsItem()
    tree_node = etree.HTML(response.body)
    tweet_content_node = tree_node.xpath('.//span[@class="ctt"]')[0]
    all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
    tweet_item['content'] = all_content
    tweet_item['crawl_time'] = int(time.time())
    user_tweet_id = re.search(r'https://weibo.cn/(\d+)/(.*)', page_url)
    tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
        user_tweet_id.group(1), user_tweet_id.group(2))
    tweet_item['user_id'] = user_tweet_id.group(1)
    tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                       user_tweet_id.group(1))
    create_time_info = tree_node.xpath(
        './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
    tweet_item['created_at'] = time_fix(
        create_time_info.split('来自')[0].strip())
    like_num = tree_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
    tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
    repost_num = tree_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
    tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
    comment_num = tree_node.xpath(
        './/span[@class="pms" and contains(text(),"评论[")]/text()')[0]
    tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
    yield tweet_item
    # Crawl this tweet's comments.
    comment_url = page_url + '?page=1'
    yield Request(url=comment_url, callback=self.parse_comment,
                  meta={'weibo_url': page_url})
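# NOTE: parse_tweet above (and most of the spiders below) call a time_fix
# helper that is not shown in this section. The sketch below is a hypothetical
# reconstruction, assuming weibo.cn's relative timestamp formats and the
# "%Y-%m-%d %H:%M" output that the date comparison in a later spider expects;
# the real helper may differ.
import re
from datetime import datetime, timedelta

def time_fix(time_string):
    """Normalize a weibo.cn timestamp to 'YYYY-MM-DD HH:MM' (sketch)."""
    now = datetime.now()
    minutes_ago = re.match(r'(\d+)分钟前', time_string)  # "N minutes ago"
    if minutes_ago:
        fixed = now - timedelta(minutes=int(minutes_ago.group(1)))
        return fixed.strftime('%Y-%m-%d %H:%M')
    if time_string.startswith('今天'):  # "today HH:MM"
        return '{} {}'.format(now.strftime('%Y-%m-%d'), time_string[2:].strip())
    month_day = re.match(r'(\d{2})月(\d{2})日 (\d{2}:\d{2})', time_string)  # "MM月DD日 HH:MM"
    if month_day:
        return '{}-{}-{} {}'.format(now.year, month_day.group(1),
                                    month_day.group(2), month_day.group(3))
    return time_string  # already an absolute timestamp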
def parse_tweets(self, response):
    """Crawl tweet data."""
    selector = Selector(response)
    ID = re.findall(r'(\d+)/profile', response.url)[0]
    divs = selector.xpath('body/div[@class="c" and @id]')
    for div in divs:
        try:
            tweetsItems = TweetsItem()
            id = div.xpath('@id').extract_first()  # tweet ID
            real_id = id.split('_')[1]
            content = div.xpath(
                'div/span[@class="ctt"]//text()').extract()  # tweet content
            comment = re.findall(r'评论\[(\d+)\]', div.extract())  # comment count
            tweetsItems["Comment"] = 0
            tweetsItems["_id"] = ID + "-" + id
            tweetsItems["ID"] = ID
            if content:
                # Drop the trailing "[位置]" (location tag).
                tweetsItems["Content"] = " ".join(content).strip('[位置]')
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            yield tweetsItems
            # Enter the tweet's comment page: https://weibo.cn/comment/ + real_id
            if comment and int(comment[0]) > 0:
                yield Request(url="https://weibo.cn/comment/%s" % real_id,
                              callback=self.parse_comment)
        except Exception as e:
            self.logger.info(e)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info = tweet_node.xpath(
                './/span[@class="ct" and contains(text(),"来自")]/text()')[0]
            tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[0]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[0]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[0]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath('string(.)').strip('\u200b')
                tweet_item['content'] = all_content
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
def parse_tweets(self, response):
    """Crawl tweet data."""
    selector = Selector(response)
    ID = re.findall(r'(\d+)/profile', response.url)[0]
    divs = selector.xpath('body/div[@class="c" and @id]')
    for div in divs:
        try:
            tweetsItems = TweetsItem()
            id = div.xpath('@id').extract_first()  # tweet ID
            content = div.xpath(
                'div/span[@class="ctt"]//text()').extract()  # tweet content
            coordinates = div.xpath('div/a/@href').extract()  # geolocation coordinates
            like = re.findall(r'赞\[(\d+)\]', div.extract())  # like count
            transfer = re.findall(r'转发\[(\d+)\]', div.extract())  # repost count
            comment = re.findall(r'评论\[(\d+)\]', div.extract())  # comment count
            others = div.xpath(
                'div/span[@class="ct"]/text()').extract()  # publish time and client (phone or platform)
            tweetsItems["_id"] = ID + "-" + id
            tweetsItems["ID"] = ID
            if content:
                # Drop the trailing "[位置]" (location tag).
                tweetsItems["Content"] = " ".join(content).strip('[位置]')
            if coordinates:
                coordinates = re.findall(r'center=([\d.,]+)', coordinates[0])
                if coordinates:
                    tweetsItems["Co_oridinates"] = coordinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others[0].split('来自')
                tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
            yield tweetsItems
        except Exception as e:
            self.logger.info(e)
    # Follow the "下页" (next page) link, if any.
    url_next = selector.xpath(
        'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
    ).extract()
    if url_next:
        yield Request(url=self.host + url_next[0],
                      callback=self.parse_tweets, dont_filter=True)
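# NOTE: the CamelCase TweetsItem populated by the parse_tweets variants above
# is defined elsewhere (items.py is not shown). A hypothetical sketch, with
# field names taken verbatim from the usages above:
import scrapy

class TweetsItem(scrapy.Item):
    _id = scrapy.Field()            # "<user ID>-<tweet div id>"
    ID = scrapy.Field()             # user ID
    Content = scrapy.Field()        # tweet text
    Co_oridinates = scrapy.Field()  # geolocation (spelling as in the source)
    Like = scrapy.Field()           # like count
    Transfer = scrapy.Field()       # repost count
    Comment = scrapy.Field()        # comment count
    PubTime = scrapy.Field()        # publish time
    Tools = scrapy.Field()          # client used to post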
def parseTweets(self, response):
    if len(response.body) > 50:
        print("###########################")
        print("Fetch Tweets Success")
        print("###########################")
        tweets = json.loads(response.body)
        ID = response.meta["ID"]
        page = ''
        containerid = ''
        if tweets.get("cards", ""):
            cards = tweets["cards"]
            if tweets["cardlistInfo"].get("page", ""):
                page = str(tweets["cardlistInfo"]["page"])
            else:
                return
            if tweets["cardlistInfo"].get("containerid", ""):
                containerid = tweets["cardlistInfo"]["containerid"]
            for card in cards:
                mblog = card.get('mblog', '')
                if mblog:
                    tweetsItems = TweetsItem()
                    tweetsItems["_id"] = card["itemid"]
                    tweetsItems["ID"] = ID
                    tweetsItems["Content"] = json.dumps(mblog)
                    tweetsItems["PubTime"] = mblog["created_at"]
                    tweetsItems["Like"] = mblog["attitudes_count"]
                    tweetsItems["Comment"] = mblog["comments_count"]
                    tweetsItems["Transfer"] = mblog["reposts_count"]
                    yield tweetsItems
            print("###########################")
            print("Tweetspage: " + page)
            print("###########################")
            # Request the next page of the container feed.
            url_tweets = ("https://m.weibo.cn/api/container/getIndex"
                          "?type=uid&value=%s&containerid=%s&page=%s"
                          % (ID, containerid, page))
            yield Request(url=url_tweets, meta={"ID": ID},
                          callback=self.parseTweets, dont_filter=True)
        else:
            return
    else:
        print("###########################")
        print("Fetch Tweets Finish")
        print("###########################")
        return
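# NOTE: a hypothetical seed request that would kick off parseTweets above,
# assuming the m.weibo.cn getIndex container API format used in that method;
# start_tweets_crawl, user_id and containerid are placeholders supplied by an
# earlier parse step, not part of the original spider.
from scrapy import Request

def start_tweets_crawl(self, user_id, containerid):
    url = ("https://m.weibo.cn/api/container/getIndex"
           "?type=uid&value=%s&containerid=%s&page=1" % (user_id, containerid))
    return Request(url=url, meta={"ID": user_id},
                   callback=self.parseTweets, dont_filter=True)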
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            if images:
                tweet_item['image_url'] = images[0]
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            if videos:
                tweet_item['video_url'] = videos[0]
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                tweet_item['location_map_info'] = map_info
            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['origin_weibo'] = repost_node[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
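# NOTE: the snake_case TweetsItem used by the parse_tweet family is likewise
# defined elsewhere. A hypothetical sketch with the fields referenced above:
import scrapy

class TweetsItem(scrapy.Item):
    _id = scrapy.Field()                # "<user_id>_<tweet id>"
    weibo_url = scrapy.Field()          # canonical weibo.com URL
    user_id = scrapy.Field()            # poster's numeric id
    content = scrapy.Field()            # tweet text
    created_at = scrapy.Field()         # normalized publish time
    tool = scrapy.Field()               # client used to post
    like_num = scrapy.Field()           # like count
    repost_num = scrapy.Field()         # repost count
    comment_num = scrapy.Field()        # comment count
    image_url = scrapy.Field()          # first inline image
    video_url = scrapy.Field()          # inline video
    location_map_info = scrapy.Field()  # "x,y" coordinates from the map link
    origin_weibo = scrapy.Field()       # original tweet, for reposts
    crawl_time = scrapy.Field()         # unix timestamp of the crawl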
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # If this is page 1, read the total page count.
        self.current_page = 1
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            self.all_page_num = int(all_page.group(1))
    print("[INFO] Crawling Tweets Page: " + str(self.current_page))
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time_utc'] = dt.utcnow()  # crawl timestamp, UTC
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info_node = tweet_node.xpath('.//span[@class="ct"]')[-1]
            create_time_info = create_time_info_node.xpath('string(.)')
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                # Compare against the time range to trigger the stop flag.
                self.time_stop_flag = self.time_flag_compare(tweet_item['created_at'])
                tweet_item['tool'] = create_time_info.split('来自')[1].strip()
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
                self.time_stop_flag = self.time_flag_compare(tweet_item['created_at'])
                tweet_item['tool'] = ""
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            # Grab all images: 1) check whether a multi-image ("组图") link exists,
            # 2) if not, fall back to the single inline image.
            multi_img_link = tweet_node.xpath('.//a[contains(text(),"组图")]/@href')
            if multi_img_link:
                tweet_item['multi_imgs'] = True
                yield Request(url=multi_img_link[-1], callback=self.parse_multi_images,
                              meta={'_id': tweet_item['_id']}, priority=1)
            else:
                tweet_item['multi_imgs'] = False
            images = tweet_node.xpath('.//img[@alt="图片"]/@src')
            tweet_item['image_url'] = images[0] if images else "NA"
            videos = tweet_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href')
            tweet_item['video_url'] = videos[0] if videos else "NA"
            map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0]
                tweet_item['location_map_info'] = re.search(r'xy=(.*?)&', map_node_url).group(1)
            else:
                tweet_item['location_map_info'] = "NA"
            repost_node = tweet_node.xpath('.//a[contains(text(),"原文评论[")]/@href')
            if repost_node:
                tweet_item['retweet'] = True
                tweet_item['origin_weibo'] = repost_node[0]
                # Optionally crawl the original weibo:
                # origin_weibo_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                # yield Request(url=repost_url, callback=self.parse_repost,
                #               meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
            else:
                tweet_item['retweet'] = False
                tweet_item['origin_weibo'] = "NA"
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
            # Crawl this tweet's reposts.
            repost_url = self.base_url + '/repost/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=repost_url, callback=self.parse_repost,
                          meta={'weibo_url': tweet_item['weibo_url']}, priority=2)
        except Exception as e:
            self.logger.error(e)
    # Keep paging until a tweet falls outside the time range limit.
    self.current_page = self.current_page + 1
    if (self.time_stop_flag == 0 and self.current_page < (self.all_page_num + 1)
            and self.current_page >= 2):
        next_page = self.current_page
        current_page_str = "page=" + str(next_page - 1)
        page_url = response.url.replace(current_page_str, 'page={}'.format(next_page))
        yield Request(page_url, self.parse_tweet, dont_filter=True,
                      meta=response.meta, priority=1)
def parse_tweets(self, response):
    """
    Functions:
    1. catch each tweet
    2. request the next page if it exists
    """
    ID = re.findall(r'(\d+)/profile', response.url)[0]
    divs = response.xpath('body/div[@class="c" and @id]')
    for div in divs:
        try:
            tweetsItems = TweetsItem()
            # _id and ID
            id = div.xpath('@id').extract_first()
            tweetsItems["_id"] = ID + "-" + id
            tweetsItems["ID"] = ID
            # content
            if div.xpath('div/span[@class="ctt"]//text()').extract():
                content = div.xpath('div/span[@class="ctt"]//text()').extract()
                content = " ".join(content).strip('[位置]').strip()
                # clean the content
                tweetsItems["Content"] = content.replace(
                    u"\u200b", "").replace(u"\xa0 全文", "")
            # coordinates
            if div.xpath('div/a/@href').extract():
                coordinates = div.xpath('div/a/@href').extract()
                coordinates = re.findall(r'center=([\d.,]+)', coordinates[0])
                if coordinates:
                    tweetsItems["Co_oridinates"] = coordinates[0]
            # likes
            if re.findall(r'赞\[(\d+)\]', div.extract()):
                like = re.findall(r'赞\[(\d+)\]', div.extract())
                tweetsItems["Like"] = int(like[0])
            # reposts
            if re.findall(r'转发\[(\d+)\]', div.extract()):
                transfer = re.findall(r'转发\[(\d+)\]', div.extract())
                tweetsItems["Transfer"] = int(transfer[0])
            # comments
            if re.findall(r'评论\[(\d+)\]', div.extract()):
                comment = re.findall(r'评论\[(\d+)\]', div.extract())
                tweetsItems["Comment"] = int(comment[0])
            # date and client/platform
            if div.xpath('div/span[@class="ct"]/text()').extract():
                others = div.xpath('div/span[@class="ct"]/text()').extract()
                others = others[0].split('来自')
                tweetsItems["PubTime"] = others[0].replace(u"\xa0", "")
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
            print(tweetsItems)
            yield tweetsItems
        except Exception as e:
            self.logger.info(e)
    # request the next page
    next_url = response.xpath(
        "//div[@class='pa']/form/div/a[1]/@href").extract_first()
    if next_url:
        yield Request(url="https://weibo.cn" + next_url,
                      callback=self.parse_tweets, dont_filter=True)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            self.total_scrap_num += 1  # increment the total scraped count
            tweet_item['dataset_id'] = self.dataset_id
            tweet_item['blogger_id'] = self.blogger_id
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            # Stop conditions: how many days back and how many tweets at most.
            time_now = datetime.datetime.now()
            created_time = datetime.datetime.strptime(
                tweet_item['created_at'], "%Y-%m-%d %H:%M")
            if ((time_now - created_time).days > MAX_INTERVAL
                    and self.total_scrap_num > MAX_SCRAP_NUM):
                # TODO delete this tweet record
                mongodb_operation.delete_twitter_rec(
                    weibo_url=tweet_item['weibo_url'], dataset_id=self.dataset_id)
                return
            # TODO if the tweet already exists, delete its previous record and comments
            mongodb_operation.delete_previous_twitter_rec(
                weibo_url=tweet_item['weibo_url'], current_dataset_id=self.dataset_id)
            mongodb_operation.delete_previous_comment_under_twitter(
                weibo_url=tweet_item['weibo_url'], current_dataset_id=self.dataset_id)
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content
                yield tweet_item
            # Crawl this tweet's comments.
            comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
            yield Request(url=comment_url, callback=self.parse_comment,
                          meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
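# NOTE: the mongodb_operation helpers called above are not shown. A minimal
# pymongo-based sketch, assuming placeholder database and collection names;
# the real module may differ.
import pymongo

_db = pymongo.MongoClient()['weibo']  # placeholder connection

def delete_twitter_rec(weibo_url, dataset_id):
    """Delete this tweet's record from the given dataset."""
    _db['tweets'].delete_many({'weibo_url': weibo_url, 'dataset_id': dataset_id})

def delete_previous_twitter_rec(weibo_url, current_dataset_id):
    """Drop stale copies of this tweet from earlier crawl datasets."""
    _db['tweets'].delete_many({'weibo_url': weibo_url,
                               'dataset_id': {'$ne': current_dataset_id}})

def delete_previous_comment_under_twitter(weibo_url, current_dataset_id):
    """Drop stale comments attached to this tweet from earlier datasets."""
    _db['comments'].delete_many({'weibo_url': weibo_url,
                                 'dataset_id': {'$ne': current_dataset_id}})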
def parse(self, response):
    """Parse the search results page."""
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # TODO parse the asker's user_id here. The @-link only carries the
            # asker's nickname as the URL (it does not contain /info); following
            # it redirects to a URL that contains the numeric user id.
            asker_name_urltxt = tweet_node.xpath('.//a[contains(text(),"@")]/text()')[0]
            asker_name_url = self.base_url + tweet_node.xpath('.//a[contains(text(),"@")]/@href')[0]
            tweet_item['asker_name'] = asker_name_urltxt.split('@')[-1]
            response_nickname = requests.get(asker_name_url)
            response_url = response_nickname.url
            if 'weibo.cn/u/' in response_url:
                nickname_id = response_url.split('weibo.cn/u/')[-1]
            else:
                nickname_id = response_url.split('uid=')[-1]
            # TODO yield a request for the asker's profile
            # (see https://blog.csdn.net/rgc_520_zyl/article/details/78946974);
            # the asker record's _id should be the tweet _id.
            yield Request(url="https://weibo.cn/{}/info".format(nickname_id),
                          callback=self.parse_information, priority=3,
                          meta={'asker_from': tweet_item['weibo_url']})
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content[1:]
                yield tweet_item
            # Crawl the answerer's profile.
            yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
                          callback=self.parse_information, priority=2)
            # TODO if the tweet has comments, yield a parse_comment request.
            if tweet_item['comment_num'] > 0:
                # Crawl this tweet's comments.
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']}, priority=5)
        except Exception as e:
            self.logger.error(e)
def parse1(self, response):
    """Crawl personal information (part 2)."""
    # Some fields may be missing, and MySQL requires a value for every column,
    # so default them all to empty strings first. MongoDB does not have this
    # problem because the dict is inserted directly without per-field values.
    informationItems = InformationItem()
    informationItems['NickName'] = ''
    informationItems['Gender'] = ''
    informationItems['City'] = ''
    informationItems['URL'] = ''
    informationItems['Num_Fans'] = ''
    informationItems['Num_Follows'] = ''
    informationItems['Num_Tweets'] = ''
    informationItems['Province'] = ''
    informationItems['Signature'] = ''
    # informationItems = response.meta["item"]
    selector = Selector(response)
    ID = re.findall(r'weibo\.cn/(\d+)', response.url)[0]
    text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())
    print('text1 data:')
    print(text1)
    nickname = re.findall('昵称[:|:](.*?);', text1)          # nickname
    gender = re.findall('性别[:|:](.*?);', text1)            # gender
    place = re.findall('地区[:|:](.*?);', text1)             # region (province and city)
    signature = re.findall('简介[:|:](.*?);', text1)         # bio/signature
    birthday = re.findall('生日[:|:](.*?);', text1)          # birthday
    sexorientation = re.findall('性取向[:|:](.*?);', text1)  # sexual orientation
    marriage = re.findall('感情状况[:|:](.*?);', text1)      # relationship status
    url = re.findall('互联网[:|:](.*?);', text1)             # homepage link
    print('nickname and gender data:')
    print(nickname)
    print(gender)
    if nickname:
        informationItems["NickName"] = nickname[0]
    if gender:
        informationItems['Gender'] = gender[0]
    if place:
        place = place[0].split(' ')
        informationItems['Province'] = place[0]
        if len(place) > 1:
            informationItems['City'] = place[1]
    if signature:
        informationItems['Signature'] = signature[0]
    if birthday:
        try:
            birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
            informationItems["Birthday"] = birthday - datetime.timedelta(hours=8)
        except Exception:
            pass
    if sexorientation and gender:
        if sexorientation[0] == gender[0]:
            informationItems["Sex_Orientation"] = "gay"
        else:
            informationItems["Sex_Orientation"] = "Heterosexual"
    if marriage:
        informationItems['Marriage'] = marriage[0]
    if url:
        informationItems["URL"] = url[0]
    urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
    r = requests.get(urlothers, cookies=response.request.cookies)
    if r.status_code == 200:
        selector = etree.HTML(r.content)
        texts = ';'.join(selector.xpath('//div[@class="tip2"]/a/text()'))
        print('texts data:')
        print(texts)
        if texts:
            num_tweets = re.findall(r'微博\[(\d+)\]', texts)   # tweet count
            num_follows = re.findall(r'关注\[(\d+)\]', texts)  # following count
            num_fans = re.findall(r'粉丝\[(\d+)\]', texts)     # follower count
            if num_tweets:
                informationItems['Num_Tweets'] = int(num_tweets[0])
            if num_follows:
                informationItems['Num_Follows'] = int(num_follows[0])
            if num_fans:
                informationItems['Num_Fans'] = int(num_fans[0])
    print('informationItems data:')
    print(informationItems)
    yield informationItems
    contents = []
    tweets = TweetsItem()
    tweets['_id'] = ID
    tweets['Content'] = contents
    yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID,
                  meta={'item': tweets, 'contents': contents},
                  callback=self.parse_tweets)
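# NOTE: InformationItem is defined elsewhere. A hypothetical sketch with the
# fields that parse1 above assigns:
import scrapy

class InformationItem(scrapy.Item):
    NickName = scrapy.Field()         # nickname
    Gender = scrapy.Field()           # gender
    Province = scrapy.Field()         # province
    City = scrapy.Field()             # city
    Signature = scrapy.Field()        # bio/signature
    Birthday = scrapy.Field()         # birthday
    Sex_Orientation = scrapy.Field()  # sexual orientation
    Marriage = scrapy.Field()         # relationship status
    URL = scrapy.Field()              # homepage link
    Num_Tweets = scrapy.Field()       # tweet count
    Num_Follows = scrapy.Field()      # following count
    Num_Fans = scrapy.Field()         # follower count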
def parse(self, response):
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        # The " />" in the pattern is followed by &nbsp;, HTML's space placeholder.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    # Select every div with class="c" that carries an id attribute.
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            # TODO repost and comment URLs
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            tweet_cmt_url = tweet_node.xpath('.//a[contains(text(),"评论[")]/@href')[0]
            # id of the user who posted the tweet
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            # _id is the tweet's id
            tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2), user_tweet_id.group(1))
            create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
            # Drop the trailing client info, e.g. "来自新浪微博" / "来自iPhone".
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            tweet_content_node = tweet_node.xpath('.//span[@class="ctt"]')[0]
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_content_node.xpath('.//a[text()="全文"]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content = tweet_content_node.xpath('string(.)').replace('\u200b', '').strip()
                tweet_item['content'] = all_content[1:]
                yield tweet_item
            # TODO crawl commenters' profiles and comment content
            if tweet_item['comment_num'] > 0:
                yield Request(url=tweet_cmt_url, callback=self.parse_cmt_info,
                              meta={'weibo_id': tweet_item['_id']}, priority=3)
            # # Crawl the poster's profile:
            # yield Request(url="https://weibo.cn/{}/info".format(tweet_item['user_id']),
            #               callback=self.parse_information, priority=1)
            # TODO crawl reposters' profiles
            # TODO crawl reposter ids, comments, and like counts; match them to the tweet id
        except Exception as e:
            self.logger.error(e)
def parse_tweet(self, response):
    if response.url.endswith('page=1'):
        # On page 1, schedule every remaining page in one pass.
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = int(all_page.group(1))
            if all_page > self.MAX_WEIBO_PAGES:
                all_page = self.MAX_WEIBO_PAGES
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_tweet, dont_filter=True, meta=response.meta)
    # Parse the tweets on the current page.
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    for tweet_node in tweet_nodes:
        try:
            tweet_item = TweetsItem()
            tweet_item['crawl_time'] = int(time.time())
            tweet_repost_url = tweet_node.xpath('.//a[contains(text(),"转发[")]/@href')[0]
            user_tweet_id = re.search(r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
            tweet_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(
                user_tweet_id.group(2), user_tweet_id.group(1))
            tweet_item['user_id'] = user_tweet_id.group(2)
            create_time_info = tweet_node.xpath('.//span[@class="ct"]/text()')[-1]
            if "来自" in create_time_info:
                tweet_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
            else:
                tweet_item['created_at'] = time_fix(create_time_info.strip())
            # Skip tweets older than the minimum date.
            if tweet_item['created_at'] < self.MIN_WEIBO_DATE:
                continue
            like_num = tweet_node.xpath('.//a[contains(text(),"赞[")]/text()')[-1]
            tweet_item['like_num'] = int(re.search(r'\d+', like_num).group())
            repost_num = tweet_node.xpath('.//a[contains(text(),"转发[")]/text()')[-1]
            tweet_item['repost_num'] = int(re.search(r'\d+', repost_num).group())
            comment_num = tweet_node.xpath(
                './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')[-1]
            tweet_item['comment_num'] = int(re.search(r'\d+', comment_num).group())
            # Check whether the tweet is truncated with a "全文" (full text) link.
            all_content_link = tweet_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0]
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': tweet_item}, priority=1)
            else:
                all_content_text = tweet_node.xpath('string(.)')
                if '转发理由:' in all_content_text:
                    # Keep only the repost reason ("转发理由") part.
                    all_content_text = all_content_text.split('转发理由:')[1]
                all_content_text = all_content_text.split('\xa0', maxsplit=1)[0]
                tweet_item['content'] = all_content_text.strip()
                # Sentiment score via SnowNLP, scaled to 0-10.
                try:
                    s = SnowNLP(tweet_item['content'])
                    tweet_item['sentiments'] = str(s.sentiments * 10)[0:8]
                except Exception:
                    tweet_item['sentiments'] = '5.0'
                try:
                    sql = ("INSERT INTO `sbhdb`.`weibo_info` (`weibo_url`, `user_id`, "
                           "`content`, `created_at`, `repost_num`, `comment_num`, "
                           "`like_num`, `crawl_time`, `sentiments`) "
                           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                    self.cursor.execute(sql, (
                        tweet_item['weibo_url'], tweet_item['user_id'],
                        tweet_item['content'], tweet_item['created_at'],
                        tweet_item['repost_num'], tweet_item['comment_num'],
                        tweet_item['like_num'], tweet_item['crawl_time'],
                        tweet_item['sentiments']))
                    self.db.commit()
                except Exception:
                    # Duplicate row; skip it.
                    continue
                yield tweet_item
                # Crawl this tweet's comments.
                comment_url = self.base_url + '/comment/' + tweet_item['weibo_url'].split('/')[-1] + '?page=1'
                yield Request(url=comment_url, callback=self.parse_comment,
                              meta={'weibo_url': tweet_item['weibo_url']})
        except Exception as e:
            self.logger.error(e)
def parseTweets(self, response):
    if len(response.body) > 50:
        print("###########################")
        print("Fetch Tweets Success")
        print("###########################")
        ori_ID = response.meta['ori_id']
        tweets = json.loads(response.body)
        ID = response.meta["ID"]
        Owner = response.meta["owner"]
        page = ''
        if tweets.get("cards", ""):
            cards = tweets["cards"]
            if tweets["cardlistInfo"].get("page", ""):
                page = str(tweets["cardlistInfo"]["page"])
            else:
                return
            # if tweets["cardlistInfo"].get("containerid", ""):
            #     containerid = tweets["cardlistInfo"]["containerid"]
            for card in cards:
                mblog = card.get('mblog', '')
                if mblog:
                    tweetsItems = TweetsItem()
                    tweetsItems["_id"] = mblog["id"]
                    tweetsItems["ID"] = ID
                    tweetsItems["Owner"] = Owner
                    tweetsItems["Used"] = False
                    tweetsItems['LocalImgs'] = []
                    tweetsItems["Content"] = json.dumps(mblog, ensure_ascii=False)
                    tweetsItems["PubTime"] = mblog["created_at"]
                    tweetsItems["Like"] = mblog["attitudes_count"]
                    tweetsItems["Comment"] = mblog["comments_count"]
                    tweetsItems["Transfer"] = mblog["reposts_count"]
                    tweetsItems["TweetsText"] = mblog["text"]
                    pics = mblog.get('pics', '')
                    if pics:
                        img_urls = []
                        small_img_urls = []
                        for pic in pics:
                            img_urls.append(pic["large"]['url'])
                            small_img_urls.append(pic['url'])
                        tweetsItems["Imgs"] = img_urls
                        tweetsItems['SmallImgs'] = small_img_urls
                    else:
                        tweetsItems["Imgs"] = []
                        tweetsItems['SmallImgs'] = []
                    yield tweetsItems
            print("###########################")
            print("Tweetspage: " + page)
            print("###########################")
            if int(page) >= Tweets_Num:
                print("###########################")
                print("Fetch Tweets Finish")
                print("###########################")
                return
            # Request the next page of original tweets.
            ori_url = ('https://m.weibo.cn/api/container/getIndex?containerid={ori_id}'
                       '_-_WEIBO_SECOND_PROFILE_WEIBO_ORI&type=uid&page_type=03'
                       '&value={value}&page={page}').format(
                           ori_id=ori_ID, value=response.meta['ID'], page=page)
            yield Request(url=ori_url,
                          meta={"ID": ID, "ori_id": ori_ID, "owner": Owner},
                          callback=self.parseTweets, dont_filter=True)
        else:
            return
    else:
        print("###########################")
        print("Fetch Tweets Finish")
        print("###########################")
        return