def parser(self, url, logger):
    while 1:
        try:
            r = requests.get(url, headers=self.headers, timeout=5,
                             proxies=self.current_proxy)
            time.sleep(3)
            logger.info('Response status code: ' + str(r.status_code))
            if r.status_code == 404:
                logger.warning('User deleted! Cannot fetch user info!')
                self.is_del = True
                break
            if r.status_code == 200:
                self.content = BeautifulSoup(r.content, "lxml")
                break
            if r.status_code == 410:
                logger.warning('Resource gone (410)')
                break
        except Exception as e:
            logger.error('Request failed! ' + str(e))
            self.current_proxy = get_IP()
            logger.warning('Switching proxy IP; pausing 3 seconds!')
            time.sleep(3)
            continue
def parser(self, i, url, logger):
    while 1:
        try:
            r = requests.get(url, headers=self.headers, timeout=5,
                             proxies=self.current_proxy)
            time.sleep(3)
            logger.info('Response status code: ' + str(r.status_code))
            if r.status_code == 404:
                logger.warning('User deleted! Cannot fetch user info!')
                self.is_del = True
                return
            elif r.status_code == 200:
                self.content = BeautifulSoup(r.content, "lxml")
                return
            elif r.status_code == 401:
                logger.info('Cookie expired; switching to a backup cookie')
                with open('Cookies/ask_topics_cookies.txt', 'r') as f:
                    Lines = f.readlines()
                if len(Lines) == 0:
                    logger.info('Backup cookies exhausted!')
                    self.delLogger(logger)
                    sys.exit(0)
                else:
                    self.change_cookie()
            else:
                self.delLogger(logger)
                sys.exit(0)
        except Exception as e:
            logger.error('Request failed! ' + str(e))
            self.current_proxy = get_IP()
            logger.warning('Switching proxy IP; pausing 3 seconds!')
            time.sleep(3)
            continue
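# `get_IP()` is imported from elsewhere in the repo and never shown here. A
# minimal sketch of the interface this file relies on -- one call returns a
# fresh `proxies` dict for requests -- assuming a local proxy-pool service;
# the endpoint URL below is hypothetical, not the project's actual proxy source.
def get_IP():
    api = 'http://127.0.0.1:5010/get/'  # hypothetical proxy-pool endpoint
    proxy = requests.get(api, timeout=5).text.strip()  # e.g. '1.2.3.4:8080'
    # requests expects a mapping of scheme -> proxy URL
    return {'http': 'http://' + proxy, 'https': 'https://' + proxy}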
def get_askTopic(self):
    self.copycookies()
    self.get_createpoint()
    # walk users starting from the User_Asks collection
    items = self.mongo.db.asktopics_href.find()
    for item in items:
        self.href_list.append(item.get('href'))
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.is_del = False
        self.content = None
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        user_url = self.href_list[i].replace('questions', 'question')
        logfilename = '/log/' + dt + '_ask_Topics' + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling topics of questions asked by user #' + str(i + 1)).getlog()
        if self.content is None:
            self.parser(i, user_url, logger)
        if self.is_del:
            self.delLogger(logger)
            continue
        else:
            soup = self.content
            # topics the question belongs to
            topics = []
            if soup.find('div', {'class': 'Tag QuestionTopic'}) is None:
                logger.warning('This question has no topic tags!')
            else:
                items = soup.findAll('div', {'class': 'Tag QuestionTopic'})
                for item in items:
                    topics.append(item.get_text())
            data_plus = {"href": user_url, "topics": topics}
            self.mongo.db.Ask_Topics.insert(data_plus)
            logger.info('Fetched the user record')
            self.delLogger(logger)
    self.mongo.client.close()
def get_log(self):
    self.copycookies()
    self.get_createpoint()
    self.questionUrl_list = extract_questionUrl()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.log = []
        self.is_del = False
        self.content = None
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.question_url = self.questionUrl_list[i]
        self.id = self.question_url.replace('/question/', '')
        self.log_url = self.url_domain + self.question_url + '/log'
        logfilename = '/log/' + dt + sys._getframe().f_code.co_name + '.log'
        log_questionCount = 'Crawling the edit log of question #' + str(i + 1)
        logger = Logger(logfilename=logfilename, logname=log_questionCount).getlog()
        self.parser(i, self.log_url, logger)
        if self.is_del:
            continue
        soup = self.content
        items = soup.findAll('div', class_='zm-item')
        for item in items:
            # editor profile link, e.g. /people/devymex
            question_editor_url = item.find('a').get('href')
            # full text of the log entry (str)
            did = item.get_text().encode('utf-8')
            # edit time (str)
            did_time = item.find('time').get_text()
            data = {
                'question_editor': question_editor_url,
                'did': did,
                'did_time': did_time
            }
            self.log.append(data)
        data_plus = {"question_id": self.id, "log": self.log}
        self.mongo.db.question_log.insert(data_plus)
        logger.info('Data saved successfully!')
        self.delLogger(logger)
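# `Logger(...).getlog()` and `delLogger(logger)` are helpers defined elsewhere
# in the repo. A minimal sketch of the interface used here, assuming a plain
# logging.Logger writing to a per-run file plus a teardown that detaches the
# handlers so the next item gets a fresh log (details are assumptions;
# delLogger is bound as a method on the crawler classes):
import logging

class Logger(object):
    def __init__(self, logfilename, logname):
        self.logger = logging.getLogger(logname)
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler(logfilename)
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        self.logger.addHandler(handler)

    def getlog(self):
        return self.logger

def delLogger(logger):
    # close and detach every handler so the named logger can be recreated cleanly
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)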
def parser(self, i, url, logger):
    while 1:
        try:
            r = requests.get(url, headers=self.headers, timeout=5,
                             proxies=self.current_proxy)
            time.sleep(random.randint(3, 5))
            logger.info('Response status code: ' + str(r.status_code))
            if r.status_code == 404:
                self.is_del = True
                logger.warning('!!! Question deleted !!!')
                self.delLogger(logger)
                return
            elif r.status_code == 200:
                self.content = BeautifulSoup(r.content, "lxml")
                return
            elif r.status_code == 401:
                logger.info('Cookie expired; switching to a backup cookie')
                with open('Cookies/question_content_cookies.txt', 'r') as f:
                    Lines = f.readlines()
                if len(Lines) == 0:
                    logger.info('Backup cookies exhausted!')
                    self.delLogger(logger)
                    sys.exit(0)
                else:
                    self.change_cookie()
                    with open('User/question_content_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                        f1.write(str(i + 1) + '\n')
            else:
                self.delLogger(logger)
                sys.exit(0)
        except Exception as e:
            logger.error(str(e))
            self.current_proxy = get_IP()
            logger.warning('Switching proxy IP; pausing 3 seconds!')
            time.sleep(3)
            continue
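# `change_cookie()` is defined elsewhere. Judging by how every 401 branch pairs
# it with a Cookies/*.txt backup file, it plausibly pops the next spare cookie
# off that file and installs it in the request headers. A sketch under that
# assumption (the file name and header layout are guesses, not the repo's code):
def change_cookie(self):
    with open('Cookies/question_content_cookies.txt', 'r') as f:
        lines = f.readlines()
    self.headers['Cookie'] = lines[0].strip()  # use the first spare cookie
    with open('Cookies/question_content_cookies.txt', 'w') as f:
        f.writelines(lines[1:])                # consume it from the pool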
def get_userinfo(self):
    self.copycookies()
    self.get_createpoint()
    items = self.mongo.db.commenters_new.find()
    for item in items:
        self.userID_list.append(item.get('user_id'))
    self.current_proxy = get_IP()
    self.get_cookie()
    # self.user_info = extract_commenters_info()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.is_del = False
        self.content = None
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.user_id = self.userID_list[i]
        user_url = self.url_domain + str(self.user_id) + '/following'
        logfilename = '/log/' + dt + 'commenters_' + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling profile info of user #' + str(i + 1)).getlog()
        if self.content is None:
            self.parser(i, user_url, logger)
        if self.is_del:
            self.delLogger(logger)
            continue
        else:
            soup = self.content
            # verifications and personal achievements
            achievements = []
            if soup.find('div', {'class': 'Profile-sideColumnItems'}) is None:
                logger.warning('This user has no achievements yet')
            else:
                items = soup.find('div', {'class': 'Profile-sideColumnItems'}) \
                            .findAll('div', class_='Profile-sideColumnItem')
                for ach in items:
                    achievements.append(ach.get_text())
            try:
                # user's answer count
                answer_count = soup.find('li', {'aria-controls': 'Profile-answers'}) \
                                   .find('span', class_='Tabs-meta').get_text()
                # user's question count
                ask_count = soup.find('li', {'aria-controls': 'Profile-asks'}) \
                                .find('span', class_='Tabs-meta').get_text()
                # number of users this user follows
                following_count = soup.findAll(
                    'a', class_='Button NumberBoard-item Button--plain')[0].find(
                        'strong', class_='NumberBoard-itemValue').get_text()
                # number of followers
                follower_count = soup.findAll(
                    'a', class_='Button NumberBoard-item Button--plain')[1].find(
                        'strong', class_='NumberBoard-itemValue').get_text()
            except Exception as e:
                logger.error('find() failed on profile fields! ' + str(e))
                data_plus = {"user_id": self.user_id}
            else:
                data_plus = {
                    "user_id": self.user_id,
                    "achievement": achievements,
                    "answer_count": int(answer_count.replace(',', '')),
                    "ask_count": int(ask_count.replace(',', '')),
                    "following_count": int(following_count.replace(',', '')),
                    "follower_count": int(follower_count.replace(',', ''))
                }
            self.mongo.db.commenters_info.insert(data_plus)
            logger.info('Fetched the user record')
            self.delLogger(logger)
    self.mongo.client.close()
def get_following(self):
    self.copycookies()
    self.get_createpoint()
    items = self.mongo.db.followers.find()
    for item in items:
        self.userID_list.append(item.get('user_id'))
    self.current_proxy = get_IP()
    self.get_cookie()
    # self.user_id_list = extract_last_followers()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.state = False
        self.user_id = self.userID_list[i]
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        logfilename = '/log/' + dt + 'followers_' + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling followees of user #' + str(i + 1)).getlog()
        following_url = ('https://www.zhihu.com/api/v4/members/' + str(self.user_id) +
                         '/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20')
        following_count = 0
        # self.following_id_list = extract_followers_following(self.user_id, self.user_id_list)
        # if len(self.following_id_list) == 0:
        #     self.following_type = 0
        # else:
        #     self.following_type = 1
        while 1:
            try:
                r = requests.get(following_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 200:
                    j = r.json()
                    following_count = j['paging']['totals']
                elif r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! User deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/followers_following_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/followers_following_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch followee count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # users with no followees are saved too
                if following_count == 0:
                    logger.warning('User has no followees!')
                    self.delLogger(logger)
                    data_plus = {'user_id': self.user_id,
                                 "following_count": following_count}
                    self.mongo.db.FR_followers_following.insert(data_plus)
                    break
                # following_type is only set by the commented-out extract block
                # above; default to 1 so the 4000 cap stays inactive without it
                elif getattr(self, 'following_type', 1) == 0 and following_count >= 4000:
                    logger.warning('User follows more than 4000 people!')
                    self.delLogger(logger)
                    data_plus = {'user_id': self.user_id,
                                 "following_count": following_count}
                    self.mongo.db.FR_followers_following.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(following_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request followees! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            following_data = resp.json()
                            data = following_data.get('data')
                            logger.info('is_end? ' + str(following_data['paging']['is_end']))
                            if following_data['paging']['is_end']:
                                following_list = []
                                for k in range(0, len(data)):
                                    following_id = data[k]['url_token']  # user ID
                                    following_info = data[k]  # full record
                                    info = {"following_id": following_id,
                                            "following_info": following_info}
                                    following_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "following_count": following_count,
                                    # "follower_type": self.following_type,
                                    "following": following_list
                                }
                                self.mongo.db.FR_followers_following.insert(data_plus)
                                logger.info('Fetched all followees!')
                                logger.info('Data saved successfully!')
                                self.delLogger(logger)
                                break
                            else:
                                offset = offset + 20
                                following_list = []
                                for k in range(0, len(data)):
                                    following_id = data[k]['url_token']  # user ID
                                    # if following_id in self.following_id_list:
                                    #     self.state = True
                                    following_info = data[k]  # full record
                                    info = {"following_id": following_id,
                                            "following_info": following_info}
                                    following_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "following_count": following_count,
                                    # "follower_type": self.following_type,
                                    "following": following_list
                                }
                                self.mongo.db.FR_followers_following.insert(data_plus)
                                # if self.state:
                                #     self.delLogger(logger)
                                #     break
                    self.delLogger(logger)
                    self.mongo.client.close()
                    break
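# Every API-walking method in this file repeats the same loop: format `offset`
# into the URL template, read `paging.is_end`, advance by the page size. For
# reference, the same pattern as a hypothetical reusable generator (not part
# of the repo; retry and proxy rotation omitted):
def walk_paging(url_template, headers, proxies, limit=20):
    offset = 0
    while True:
        r = requests.get(url_template.format(str(offset)),
                         headers=headers, timeout=5, proxies=proxies)
        page = r.json()
        for record in page['data']:
            yield record
        if page['paging']['is_end']:
            break
        offset += limit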
def get_voters(self):
    self.copycookies()
    self.answerID_list = extract_answerID()
    print len(self.answerID_list)
    self.get_createpoint()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.state = False
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.answer_id = self.answerID_list[i]
        logfilename = '/log/' + dt + 'answer_' + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling upvoters of answer #' + str(i + 1)).getlog()
        voters_url = ('https://www.zhihu.com/api/v4/answers/' + str(self.answer_id) +
                      '/voters?include=data%5B%2A%5D.answer_count%2Carticles_count%2Cfollower_count%2Cgender%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=10&offset={0}')
        voter_num = 0
        self.voter_id_list = extract_answer_voters(self.answer_id)
        # if len(self.voter_id_list) == 0:
        #     self.answer_type = 0
        # else:
        #     self.answer_type = 1
        while 1:
            try:
                r = requests.get(voters_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! Answer deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 200:
                    j = r.json()
                    voter_num = j['paging']['totals']
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/answer_voters_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/answer_voters_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch upvoter count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # answers with no upvoters are saved too
                if voter_num == 0:
                    logger.warning('Answer has no upvoters!')
                    data_plus = {'answer_id': self.answer_id, "voter_num": 0}
                    self.mongo.db.answer_voters.insert(data_plus)
                    self.delLogger(logger)
                    break
                # elif self.answer_type == 0 and voter_num >= 4000:
                #     logger.warning('Answer has more than 4000 upvotes!')
                #     self.delLogger(logger)
                #     data_plus = {'user_id': self.answer_id, "voter_num": voter_num, "answer_type": self.answer_type}
                #     self.mongo.db.answer_voters.insert(data_plus)
                #     break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(voters_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request upvoters! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 5 seconds!')
                            time.sleep(5)
                            continue
                        else:
                            voters_data = resp.json()
                            data = voters_data['data']
                            logger.info('is_end? ' + str(voters_data['paging']['is_end']))
                            if voters_data['paging']['is_end']:
                                voter_list = []
                                for k in range(0, len(data)):
                                    voter_url = data[k]['url_token']  # user ID
                                    voter_info = data[k]  # full record
                                    info = {"voter_id": voter_url,
                                            "voter_info": voter_info}
                                    # stop at the first upvoter we already have
                                    if voter_url in self.voter_id_list:
                                        break
                                    voter_list.append(info)
                                data_plus = {
                                    'answer_id': self.answer_id,
                                    "voter_num": voter_num,
                                    # "answer_type": self.answer_type,
                                    "voters": voter_list
                                }
                                self.mongo.db.answer_voters.insert(data_plus)
                                logger.info('Fetched all new upvoters!')
                                break
                            else:
                                voter_list = []
                                offset = offset + 10
                                for k in range(0, len(data)):
                                    voter_url = data[k]['url_token']  # user ID
                                    voter_info = data[k]  # full record
                                    info = {"voter_id": voter_url,
                                            "voter_info": voter_info}
                                    if voter_url in self.voter_id_list:
                                        self.state = True
                                        break
                                    voter_list.append(info)
                                data_plus = {
                                    'answer_id': self.answer_id,
                                    "voter_num": voter_num,
                                    # "answer_type": self.answer_type,
                                    "voters": voter_list
                                }
                                self.mongo.db.answer_voters.insert(data_plus)
                                if self.state:
                                    self.delLogger(logger)
                                    break
                    logger.info('All data saved successfully!')
                    self.delLogger(logger)
                    break
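# `extract_answer_voters(answer_id)` is imported from elsewhere; get_voters
# treats its result as a stop list, so it presumably returns the voter IDs
# already stored for that answer. A sketch assuming those IDs live in the same
# `answer_voters` collection written above (the collection layout and the
# explicit `db` parameter are assumptions, not the repo's implementation):
def extract_answer_voters(answer_id, db):
    ids = []
    for doc in db.answer_voters.find({'answer_id': answer_id}):
        for voter in doc.get('voters', []):
            ids.append(voter['voter_id'])
    return ids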
def get_Topics(self):
    self.copycookies()
    self.get_createpoint()
    items = self.mongo.db.followers.find()
    for item in items:
        self.userID_list.append(item.get('user_id'))
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.user_id = self.userID_list[i]
        logfilename = '/log/' + dt + 'followers_' + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling topics followed by user #' + str(i + 1)).getlog()
        topics_url = ('https://www.zhihu.com/api/v4/members/' + str(self.user_id) +
                      '/following-topic-contributions?include=data%5B*%5D.topic.introduction&offset={0}&limit=20')
        topics_count = 0
        while 1:
            try:
                r = requests.get(topics_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 200:
                    j = r.json()
                    topics_count = j['paging']['totals']
                elif r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! User deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/followers_topics_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/followers_topics_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch followed-topic count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # users following no topics are saved too
                if topics_count == 0:
                    logger.warning('User follows no topics!')
                    self.delLogger(logger)
                    data_plus = {'user_id': self.user_id, "topics_count": 0}
                    self.mongo.db.followers_topics.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(topics_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request followed topics! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            topics_data = resp.json()
                            data = topics_data.get('data')
                            logger.info('is_end? ' + str(topics_data['paging']['is_end']))
                            if topics_data['paging']['is_end']:
                                topic_list = []
                                for k in range(0, len(data)):
                                    info = {
                                        "name": data[k]['topic']['name'],
                                        "contributions_count": data[k]['contributions_count']
                                    }
                                    topic_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "topics_count": topics_count,
                                    "topic": topic_list
                                }
                                self.mongo.db.followers_topics.insert(data_plus)
                                logger.info('Fetched all topics the user follows!')
                                logger.info('Data saved successfully!')
                                self.delLogger(logger)
                                break
                            else:
                                offset = offset + 20
                                topic_list = []
                                for k in range(0, len(data)):
                                    info = {
                                        "name": data[k]['topic']['name'],
                                        "contributions_count": data[k]['contributions_count']
                                    }
                                    topic_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "topics_count": topics_count,
                                    "topic": topic_list
                                }
                                self.mongo.db.followers_topics.insert(data_plus)
                    self.mongo.client.close()
                    break
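# Usage sketch: with the hypothetical walk_paging() helper sketched earlier,
# the followee/topic/answer/comment walks above and below could each collapse
# to a single loop, e.g.
#
#     for record in walk_paging(topics_url, self.headers, self.current_proxy):
#         topic_list.append({"name": record['topic']['name'],
#                            "contributions_count": record['contributions_count']})
#
# at the cost of the per-page inserts and early-stop checks done inline here.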
def get_question_content(self):
    self.copycookies()
    self.get_createpoint()
    self.questionUrl_list = extract_questionUrl()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in range(self.start, self.end):
        self.topic_list = []
        self.is_del = False
        self.content = None
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.question_url = self.questionUrl_list[i]
        self.id = self.question_url.replace('/question/', '')
        logfilename = '/log/' + dt + sys._getframe().f_code.co_name + '.log'
        log_questionCount = 'Crawling the content of question #' + str(i + 1)
        logger = Logger(logfilename=logfilename, logname=log_questionCount).getlog()
        self.url = self.url_domain + self.question_url
        if self.content is None:
            self.parser(i, self.url, logger)
        if self.is_del:
            continue
        soup = self.content
        # question title text
        self.title = soup.find('h1', {'class': 'QuestionHeader-title'}).get_text()
        # topics the question belongs to
        items = soup.find_all('div', class_='Tag QuestionTopic')
        for item in items:
            self.topic_list.append(item.get_text())
        # answer count
        if soup.find('h4', {'class': 'List-headerText'}) is None:
            self.answer_num = 0
        else:
            temp = soup.find('h4', {'class': 'List-headerText'}).get_text().replace(',', '')
            self.answer_num = int(re.search(r'^\d+', temp).group())
        # view count (a single number)
        self.visit_num = int(soup.findAll(
            'strong', class_='NumberBoard-itemValue')[1].get_text().replace(',', ''))
        # follower count (a single number)
        self.followers_num = int(soup.findAll(
            'strong', class_='NumberBoard-itemValue')[0].get_text().replace(',', ''))
        data = {
            "question_url": self.id,
            "title": self.title,
            "belong_topics": self.topic_list,
            "answer_num": self.answer_num,
            "followers_num": self.followers_num,
            "visit_num": self.visit_num
        }
        self.mongo.db.question_content.insert(data)
        logger.info('Data saved successfully!')
        self.delLogger(logger)
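# The answer count above comes off the 'List-headerText' header via a leading-
# digit regex, with thousands separators stripped first so counts past 999
# survive; e.g.:
#
#     temp = u'1,024 个回答'.replace(',', '')
#     int(re.search(r'^\d+', temp).group())  # -> 1024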
def get_Answer(self):
    self.copycookies()
    self.get_createpoint()
    items = self.mongo.db.answerers_1.find()
    for item in items:
        self.userID_list.append(item.get('user_id'))
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.user_id = self.userID_list[i]
        logfilename = '/log/' + dt + '_user_Topics' + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling answers of user #' + str(i + 1)).getlog()
        answer_url = ('https://www.zhihu.com/api/v4/members/' + str(self.user_id) +
                      '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created')
        answer_count = 0
        while 1:
            try:
                r = requests.get(answer_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 200:
                    j = r.json()
                    answer_count = j['paging']['totals']
                elif r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! User deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/user_answer_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/user_answer_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                elif r.status_code == 410:
                    self.is_del = True
                    logger.info('*** Resource gone (410) ***')
                    self.delLogger(logger)
                    break
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch answer count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # users with no answers are saved too
                if answer_count == 0:
                    logger.warning('User has no answers!')
                    self.delLogger(logger)
                    data_plus = {'user_id': self.user_id}
                    self.mongo.db.User_Answer.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(answer_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request answers! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            answer_data = resp.json()
                            # print(answer_data)
                            data = answer_data.get('data')
                            logger.info('is_end? ' + str(answer_data['paging']['is_end']))
                            if answer_data['paging']['is_end']:
                                answer_list = []
                                for k in range(0, len(data)):
                                    created_time = data[k]['created_time']    # answer creation time
                                    updated_time = data[k]['updated_time']    # last update time
                                    vote_count = data[k]['voteup_count']      # upvote count (int)
                                    answer_id = data[k]['id']                 # answer id (int)
                                    answer_content = data[k]['content']       # answer body
                                    comment_count = data[k]['comment_count']  # comment count
                                    author_json = data[k]['author']           # author record
                                    author_url = data[k]['author']['url_token']  # author url_token
                                    # rewrite the URLs so the question page can be
                                    # opened to scrape its topics and answer count
                                    answer_href = data[k]['url'].replace('answers', 'answer')
                                    question_url = data[k]['question']['url'].replace('questions', 'question')
                                    try:
                                        r1 = requests.get(question_url,
                                                          headers=self.headers, timeout=5,
                                                          proxies=self.current_proxy)
                                        time.sleep(3)
                                        logger.info('Response status code: ' + str(r1.status_code))
                                    except Exception as e:
                                        logger.error('Failed to request answers! ' + str(e))
                                        self.current_proxy = get_IP()
                                        logger.warning('Switching proxy IP; pausing 3 seconds!')
                                        time.sleep(3)
                                        continue
                                    self.content1 = BeautifulSoup(r1.content, "lxml")
                                    soup1 = self.content1
                                    # topics the question belongs to
                                    self.topic_list = []
                                    items = soup1.find_all('div', class_='Tag QuestionTopic')
                                    for item in items:
                                        self.topic_list.append(item.get_text())
                                    # answer count on the question page
                                    if soup1.find('h4', {'class': 'List-headerText'}) is None:
                                        self.answer_num = 0
                                    else:
                                        temp = soup1.find('h4', {'class': 'List-headerText'}).get_text().replace(',', '')
                                        self.answer_num = int(re.search(r'^\d+', temp).group())
                                    info = {
                                        "belong_topics": self.topic_list,
                                        "answer_num": self.answer_num,
                                        "created_time": created_time,
                                        "updated_time": updated_time,
                                        "vote_count": vote_count,
                                        "answer_id": answer_id,
                                        "answer_content": answer_content,
                                        "comment_count": comment_count,
                                        "author_json": author_json,
                                        "author_url": author_url,
                                        "answer_url": answer_href,
                                        "question_url": question_url
                                    }
                                    answer_list.append(info)
                                data_plus = {
                                    "question_id": data[k]['question']['id'],
                                    "answer_id": data[k]['id'],
                                    'user_id': self.user_id,
                                    # "topics_count": topics_count,
                                    "answer": answer_list
                                }
                                self.mongo.db.User_Answer.insert(data_plus)
                                logger.info("Fetched all of the user's answers!")
                                logger.info('Data saved successfully!')
                                self.delLogger(logger)
                                break
                            else:
                                offset = offset + 20
                                answer_list = []
                                for k in range(0, len(data)):
                                    created_time = data[k]['created_time']    # answer creation time
                                    updated_time = data[k]['updated_time']    # last update time
                                    vote_count = data[k]['voteup_count']      # upvote count (int)
                                    answer_id = data[k]['id']                 # answer id (int)
                                    answer_content = data[k]['content']       # answer body
                                    comment_count = data[k]['comment_count']  # comment count
                                    author_json = data[k]['author']           # author record
                                    author_url = data[k]['author']['url_token']  # author url_token
                                    # rewrite the URLs so the question page can be
                                    # opened to scrape its topics and answer count
                                    answer_href = data[k]['url'].replace('answers', 'answer')
                                    question_url = data[k]['question']['url'].replace('questions', 'question')
                                    try:
                                        r1 = requests.get(question_url,
                                                          headers=self.headers, timeout=5,
                                                          proxies=self.current_proxy)
                                        time.sleep(3)
                                        logger.info('Response status code: ' + str(r1.status_code))
                                    except Exception as e:
                                        logger.error('Failed to request answers! ' + str(e))
                                        self.current_proxy = get_IP()
                                        logger.warning('Switching proxy IP; pausing 3 seconds!')
                                        time.sleep(3)
                                        continue
                                    self.content1 = BeautifulSoup(r1.content, "lxml")
                                    soup1 = self.content1
                                    # topics the question belongs to
                                    self.topic_list = []
                                    items = soup1.find_all('div', class_='Tag QuestionTopic')
                                    for item in items:
                                        self.topic_list.append(item.get_text())
                                    # answer count on the question page
                                    if soup1.find('h4', {'class': 'List-headerText'}) is None:
                                        self.answer_num = 0
                                    else:
                                        temp = soup1.find('h4', {'class': 'List-headerText'}).get_text().replace(',', '')
                                        self.answer_num = int(re.search(r'^\d+', temp).group())
                                    info = {
                                        "belong_topics": self.topic_list,
                                        "answer_num": self.answer_num,
                                        "created_time": created_time,
                                        "updated_time": updated_time,
                                        "vote_count": vote_count,
                                        "answer_id": answer_id,
                                        "answer_content": answer_content,
                                        "comment_count": comment_count,
                                        "author_json": author_json,
                                        "author_url": author_url,
                                        "answer_url": answer_href,
                                        "question_url": question_url
                                    }
                                    answer_list.append(info)
                                data_plus = {
                                    "question_id": data[k]['question']['id'],
                                    "answer_id": data[k]['id'],
                                    'user_id': self.user_id,
                                    # "topics_count": topics_count,
                                    "answer": answer_list
                                }
                                self.mongo.db.User_Answer.insert(data_plus)
                    self.mongo.client.close()
                    break
def get_comment(self):
    self.copycookies()
    # extract answer IDs from the two merged collections
    self.answerID_list = extract_answerID()
    print len(self.answerID_list)  # length of the answer-ID list
    self.get_createpoint()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.state = False
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.answer_id = self.answerID_list[i]
        logfilename = '/log/' + dt + 'answer_' + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling comments on answer #' + str(i + 1)).getlog()
        url = (r'https://www.zhihu.com/api/v4/answers/' + str(self.answer_id) +
               r'/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=reverse&limit=20&offset={0}&status=open')
        comment_num = 0
        while 1:
            try:
                r = requests.get(url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 404:  # server could not find the page
                    self.is_del = True
                    logger.info('!!! Answer deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 200:  # request handled successfully
                    j = r.json()
                    comment_num = j['paging']['totals']
                elif r.status_code == 401:  # authentication required (login-only page)
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/answer_comments_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/answer_comments_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch comment count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 5 seconds!')
                time.sleep(5)
                continue
            else:
                # answers with no comments are saved too
                if comment_num == 0:
                    logger.info('Answer has no comments!')
                    self.delLogger(logger)
                    data_plus = {'answer_id': self.answer_id, "comment_num": 0}
                    self.mongo.db.answer_comments.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request comments! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 5 seconds!')
                            time.sleep(5)
                            continue
                        else:
                            comments_data = resp.json()
                            data = comments_data['data']
                            logger.info('is_end? ' + str(comments_data['paging']['is_end']))
                            if comments_data['paging']['is_end']:
                                comment_list = []
                                for k in range(0, len(data)):
                                    commenter_id = data[k]['author']['member']['url_token']  # user ID
                                    comment_info = data[k]  # full record
                                    info = {"commenter_id": commenter_id,
                                            "comment_info": comment_info}
                                    # keep only comments newer than the crawl cutoff
                                    if comment_info['created_time'] > 1500566400:
                                        comment_list.append(info)
                                    else:
                                        break
                                data_plus = {
                                    'answer_id': self.answer_id,
                                    "comment_num": comment_num,
                                    "comments": comment_list
                                }
                                self.mongo.db.answer_comments.insert(data_plus)
                                logger.info('Fetched all comments on this answer!')
                                break
                            else:
                                offset = offset + 20
                                comment_list = []
                                for k in range(0, len(data)):
                                    commenter_id = data[k]['author']['member']['url_token']  # user ID
                                    comment_info = data[k]  # full record
                                    info = {"commenter_id": commenter_id,
                                            "comment_info": comment_info}
                                    if comment_info['created_time'] > 1500566400:
                                        comment_list.append(info)
                                    else:
                                        self.state = True
                                        break
                                data_plus = {
                                    'answer_id': self.answer_id,
                                    "comment_num": comment_num,
                                    "comments": comment_list
                                }
                                self.mongo.db.answer_comments.insert(data_plus)
                                if self.state:
                                    self.delLogger(logger)
                                    break
                    logger.info('All data saved successfully!')
                    self.delLogger(logger)
                    break
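# The magic cutoff 1500566400 above is a Unix timestamp: comments at or below
# it are treated as already crawled, so the walk stops there. Decoding it shows
# the crawl boundary (get_answers uses 1500393600, two days earlier, the same way):
#
#     >>> datetime.datetime.utcfromtimestamp(1500566400)
#     datetime.datetime(2017, 7, 20, 16, 0)   # midnight 2017-07-21, Beijing time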
def get_followers(self):
    self.copycookies()
    self.get_createpoint()
    self.questionUrl_list = extract_questionUrl()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.is_del = False
        self.state = False
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.question_url = self.questionUrl_list[i]
        self.id = self.question_url.replace('/question/', '')
        logfilename = '/log/' + dt + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling followers of question #' + str(i + 1)).getlog()
        followers_url = (u'https://www.zhihu.com/api/v4/questions/' + str(self.id) +
                         u'/followers?include=data[*].gender,answer_count,articles_count,follower_count,is_following,is_followed&limit=10&offset={0}')
        follower_num = 0
        self.follower_id_list = extract_question_followers(self.id)
        while 1:
            try:
                r = requests.get(followers_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! Question deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 200:
                    j = r.json()
                    follower_num = j['paging']['totals']
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/question_followers_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        return
                    else:
                        self.change_cookie()
                        with open('User/question_followers_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    return
            except Exception as e:
                logger.error('Failed to fetch follower count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # questions with no followers are saved too
                if follower_num == 0:
                    logger.warning('Question has no followers!')
                    data_plus = {'question_id': self.id, "follower_num": 0}
                    self.mongo.db.question_followers.insert(data_plus)
                    self.delLogger(logger)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(followers_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request followers! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            followers_data = resp.json()
                            data = followers_data.get('data')
                            logger.info('is_end? ' + str(followers_data['paging']['is_end']))
                            if followers_data['paging']['is_end']:
                                follower_list = []
                                for k in range(0, len(data)):
                                    follower_url = data[k]['url_token']  # user ID
                                    follower_info = data[k]  # full record
                                    info = {"follower_url": follower_url,
                                            "follower_info": follower_info}
                                    # stop at the first follower we already have
                                    if follower_url in self.follower_id_list:
                                        break
                                    follower_list.append(info)
                                data_plus = {
                                    "question_id": self.id,
                                    "follower_num": follower_num,
                                    "followers": follower_list
                                }
                                self.mongo.db.question_followers.insert(data_plus)
                                logger.info('Fetched all new followers!')
                                logger.info('Data saved successfully!')
                                self.delLogger(logger)
                                break
                            else:
                                offset = offset + 10
                                follower_list = []
                                for k in range(0, len(data)):
                                    follower_url = data[k]['url_token']  # user ID
                                    follower_info = data[k]  # full record
                                    info = {"follower_url": follower_url,
                                            "follower_info": follower_info}
                                    if follower_url in self.follower_id_list:
                                        self.state = True
                                        break
                                    follower_list.append(info)
                                data_plus = {
                                    "question_id": self.id,
                                    "follower_num": follower_num,
                                    "followers": follower_list
                                }
                                self.mongo.db.question_followers.insert(data_plus)
                                if self.state:
                                    self.delLogger(logger)
                                    break
                    self.delLogger(logger)
                    break
def get_Asks(self):
    self.copycookies()
    self.get_createpoint()
    items = self.mongo.db.answerers_1.find()
    for item in items:
        self.userID_list.append(item.get('user_id'))
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.user_id = self.userID_list[i]
        logfilename = '/log/' + dt + '_user_Asks' + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling questions asked by user #' + str(i + 1)).getlog()
        # e.g. https://www.zhihu.com/api/v4/members/cang-hai-qing-yue/questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor%2Cadmin_closed_comment&offset={0}&limit=20
        asks_url = ('https://www.zhihu.com/api/v4/members/' + str(self.user_id) +
                    '/questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor%2Cadmin_closed_comment&offset={0}&limit=20')
        asks_count = 0
        while 1:
            try:
                r = requests.get(asks_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 200:
                    j = r.json()
                    asks_count = j['paging']['totals']
                elif r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! User deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/user_asks_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                elif r.status_code == 410:
                    self.is_del = True
                    logger.info('*** Resource gone (410) ***')
                    self.delLogger(logger)
                    break
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch question count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # users with no questions are saved too
                if asks_count == 0:
                    logger.warning('User has asked no questions!')
                    self.delLogger(logger)
                    data_plus = {'user_id': self.user_id, "asks_count": 0}
                    self.mongo.db.User_Asks.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(asks_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error("Failed to request the user's questions! " + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            asks_data = resp.json()
                            data = asks_data.get('data')
                            logger.info('is_end? ' + str(asks_data['paging']['is_end']))
                            if asks_data['paging']['is_end']:
                                ask_list = []
                                for k in range(0, len(data)):
                                    info = {
                                        "question_id": data[k]['id'],
                                        "content": data[k]['title'],
                                        "answer_count": data[k]['answer_count'],
                                        "follower_count": data[k]['follower_count'],
                                        "create_time": data[k]['created'],
                                        "update_time": data[k]['updated_time'],
                                        "href": data[k]['url']
                                    }
                                    ask_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "asks_count": asks_count,
                                    "ask_list": ask_list
                                }
                                self.mongo.db.User_Asks.insert(data_plus)
                                logger.info("Fetched all of the user's questions!")
                                logger.info('Data saved successfully!')
                                self.delLogger(logger)
                                break
                            else:
                                offset = offset + 20
                                ask_list = []
                                for k in range(0, len(data)):
                                    info = {
                                        "question_id": data[k]['id'],
                                        "content": data[k]['title'],
                                        "answer_count": data[k]['answer_count'],
                                        "follower_count": data[k]['follower_count'],
                                        "create_time": data[k]['created'],
                                        "update_time": data[k]['updated_time'],
                                        "href": data[k]['url']
                                    }
                                    ask_list.append(info)
                                data_plus = {
                                    'user_id': self.user_id,
                                    "asks_count": asks_count,
                                    "ask_list": ask_list
                                }
                                self.mongo.db.User_Asks.insert(data_plus)
                    self.mongo.client.close()
                    break
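# The long percent-encoded `include=` query strings used throughout are just
# URL-encoded field lists. For reference, the asks_url include parameter could
# be built readably like this; the encoding is byte-identical to the literal
# above (urllib.quote is Python 2, matching this codebase; on Python 3 it is
# urllib.parse.quote):
import urllib

fields = 'data[*].created,answer_count,follower_count,author,admin_closed_comment'
include = urllib.quote(fields, safe='*')
# include == 'data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor%2Cadmin_closed_comment'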
def get_answers(self):
    self.copycookies()
    self.get_createpoint()
    self.questionUrl_list = extract_questionUrl()
    self.current_proxy = get_IP()
    self.get_cookie()
    dt = re.sub(r'[^0-9]', '',
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')))
    for i in xrange(self.start, self.end):
        self.is_del = False
        self.state = False
        self.file.seek(0, 2)
        dt1 = re.sub(r'[^0-9]', '',
                     str(datetime.datetime.now().strftime('%Y-%m-%d')))
        News = self.type + ',' + str(i + 1) + ',' + str(self.end) + ',' + str(dt1) + '\n'
        self.file.write(News)
        self.question_url = self.questionUrl_list[i]
        self.id = self.question_url.replace('/question/', '')
        logfilename = '/log/' + dt + sys._getframe().f_code.co_name + '.log'
        logger = Logger(logfilename=logfilename,
                        logname='Crawling answers to question #' + str(i + 1)).getlog()
        answer_url = ('https://www.zhihu.com/api/v4/questions/' + str(self.id) +
                      '/answers?include=data%5B*%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created')
        answer_number = 0
        while 1:
            try:
                r = requests.get(answer_url, headers=self.headers, timeout=5,
                                 proxies=self.current_proxy)
                time.sleep(3)
                logger.info('First request status code: ' + str(r.status_code))
                if r.status_code == 404:
                    self.is_del = True
                    logger.info('!!! Question deleted !!!')
                    self.delLogger(logger)
                    break
                elif r.status_code == 200:
                    j = r.json()
                    answer_number = j['paging']['totals']
                elif r.status_code == 401:
                    logger.info('Cookie expired; switching to a backup cookie')
                    with open('Cookies/question_answers_cookies.txt', 'r') as f:
                        Lines = f.readlines()
                    if len(Lines) == 0:
                        logger.info('Backup cookies exhausted!')
                        self.delLogger(logger)
                        return
                    else:
                        self.change_cookie()
                        with open('User/question_answers_loseuser_' + str(self.fileNum) + '.txt', 'a+') as f1:
                            f1.write(str(i + 1) + '\n')
                else:
                    self.delLogger(logger)
                    return
            except Exception as e:
                logger.error('Failed to fetch answer count! ' + str(e))
                self.current_proxy = get_IP()
                logger.warning('Switching proxy IP; pausing 3 seconds!')
                time.sleep(3)
                continue
            else:
                # questions with no answers are saved too
                if answer_number == 0:
                    logger.warning('Question has no answers!')
                    self.delLogger(logger)
                    data_plus = {'question_id': self.id, "answer_num": 0}
                    self.mongo.db.question_answers.insert(data_plus)
                    break
                else:
                    offset = 0
                    while 1:
                        try:
                            resp = requests.get(answer_url.format(str(offset)),
                                                headers=self.headers, timeout=5,
                                                proxies=self.current_proxy)
                            time.sleep(3)
                            logger.info('Response status code: ' + str(resp.status_code))
                        except Exception as e:
                            logger.error('Failed to request answers! ' + str(e))
                            self.current_proxy = get_IP()
                            logger.warning('Switching proxy IP; pausing 3 seconds!')
                            time.sleep(3)
                            continue
                        else:
                            answer_data = resp.json()
                            answer_info = answer_data['data']
                            if answer_data['paging']['is_end']:
                                answer_list = []
                                for k in range(0, len(answer_info)):
                                    created_time = answer_info[k]['created_time']    # answer creation time
                                    updated_time = answer_info[k]['updated_time']    # last update time
                                    vote_count = answer_info[k]['voteup_count']      # upvote count (int)
                                    answer_id = answer_info[k]['id']                 # answer id (int)
                                    answer_content = answer_info[k]['content']       # answer body
                                    comment_count = answer_info[k]['comment_count']  # comment count
                                    author_json = answer_info[k]['author']           # author record
                                    author_url = answer_info[k]['author']['url_token']  # author url_token
                                    data = {
                                        "created_time": created_time,
                                        "updated_time": updated_time,
                                        "vote_count": vote_count,
                                        "answer_id": answer_id,
                                        "answer_content": answer_content,
                                        "comment_count": comment_count,
                                        "author_json": author_json,
                                        "author_url": author_url,
                                    }
                                    # keep only answers updated after the crawl cutoff
                                    if updated_time > 1500393600:
                                        answer_list.append(data)
                                    else:
                                        break
                                data_plus = {
                                    "question_id": self.id,
                                    "answer_num": answer_number,
                                    "answers": answer_list
                                }
                                self.mongo.db.question_answers.insert(data_plus)
                                logger.info('Fetched all answers to this question!')
                                break
                            else:
                                offset = offset + 20
                                answer_list = []
                                for k in range(0, len(answer_info)):
                                    created_time = answer_info[k]['created_time']    # answer creation time
                                    updated_time = answer_info[k]['updated_time']    # last update time
                                    vote_count = answer_info[k]['voteup_count']      # upvote count (int)
                                    answer_id = answer_info[k]['id']                 # answer id (int)
                                    answer_content = answer_info[k]['content']       # answer body
                                    comment_count = answer_info[k]['comment_count']  # comment count
                                    author_json = answer_info[k]['author']           # author record
                                    author_url = answer_info[k]['author']['url_token']  # author url_token
                                    data = {
                                        "created_time": created_time,
                                        "updated_time": updated_time,
                                        "vote_count": vote_count,
                                        "answer_id": answer_id,
                                        "answer_content": answer_content,
                                        "comment_count": comment_count,
                                        "author_json": author_json,
                                        "author_url": author_url,
                                    }
                                    if updated_time > 1500393600:
                                        answer_list.append(data)
                                    else:
                                        self.state = True
                                        break
                                data_plus = {
                                    "question_id": self.id,
                                    "answer_num": answer_number,
                                    "answers": answer_list
                                }
                                self.mongo.db.question_answers.insert(data_plus)
                                if self.state:
                                    self.delLogger(logger)
                                    break
                    logger.info('Data saved successfully!')
                    self.delLogger(logger)
                    break