class SpiderTopicData(object): """ 获取知乎数据 """ def __init__(self): """ 初始化相关参数 """ self.login = Login() self.login.login() self.headers = self.login.getHeaders() self.data = self.login.getData() self.session = self.login.getSession() # self.session.encoding = 'utf8' # print self.session. # data link list self.havefinished_list = [] # 已经处理的数据连接 self.waitting_list = [] # 未处理的数据连接 self.record_topic_data = dict() self.record_topic_link_data = dict() def setRootTopic(self,root_topic_id='19778317'): self.waitting_list.append(root_topic_id) def relogin(self): """ 短线重连 :return: """ self.login.login() self.headers = self.login.getHeaders() self.data = self.login.getData() self.session = self.login.getSession() def getLinkTopic(self,link_url='https://www.zhihu.com/topic/19778317/organize/entire?parent=19778317'): """ 获取连接下的数据 :param link_url: :return: 数据 text """ # self.session try: res = self.session.post(link_url,data=self.data,headers=self.headers) time.sleep(random.randint(4,12)) except: self.relogin() res = self.session.post(link_url,data=self.data,headers=self.headers) time.sleep(random.randint(4,12)) topic = json.loads(res.text) # topic = eval(res.text) cur_topic = topic['msg'][0] sub_topics = topic['msg'][1] # for test parent_topic_name = cur_topic[1].encode('utf8') parent_topic_id = cur_topic[2] sub_topics_name = [] sub_topics_id = [] for sub in sub_topics: sub_topics_id.append(sub[0][2]) sub_topics_name.append(sub[0][1].encode('utf8')) result = dict() result["parent_topic_id"] = parent_topic_id result["parent_topic_name"] = parent_topic_name result["sub_topics_name"] = sub_topics_name result["sub_topics_id"] = sub_topics_id return result def recordData(self,result): """ 处理爬取的topic 数据,并记录 :param result: :return: """ sub_topics_id = result['sub_topics_id'] sub_topics_name = result['sub_topics_name'] parent_topic_name = result['parent_topic_name'] parent_topic_id = result['parent_topic_id'] existed = False child_topic_id = '' if parent_topic_id not in self.record_topic_data: self.record_topic_data[parent_topic_id] = parent_topic_name for sub_id,sub_name in zip(sub_topics_id,sub_topics_name): if sub_name == str("加载更多"): existed = True child_topic_id = sub_id continue if sub_id not in self.record_topic_data: self.record_topic_data[sub_id] = sub_name if sub_id not in self.record_topic_link_data: self.record_topic_link_data[sub_id] = [] self.record_topic_link_data[sub_id].append(parent_topic_id) return existed,parent_topic_id,child_topic_id def getSubTopic(self,parent_topic_id='19778317',child_topic_id=''): """ 爬取知乎某一话题下的所有子话题,仅爬取话题下一层(即仅爬取当前话题的孩子话题,孙子不管) :param parent_topic_id: 当前话题 ID :param child_topic_id: 子话题 ID应对显示不全时 :return: """ state = True sub_topics_id = [] while state: url_link = 'https://www.zhihu.com/topic/{0}/organize/entire'.format(parent_topic_id) if child_topic_id: url_link += "?child={}&parent={}".format(child_topic_id, parent_topic_id) result = self.getLinkTopic(url_link) sub_topics_id.extend(result['sub_topics_id']) state,parent_topic_id,child_topic_id = self.recordData(result) return sub_topics_id def getAllTopic(self): """ 爬取队列所有话题及子话题 :return: """ while self.waitting_list: topic_id = self.waitting_list.pop(0) if topic_id in self.havefinished_list: continue sub_topics = self.getSubTopic(parent_topic_id=topic_id) self.waitting_list.extend((list(set(sub_topics)))) self.havefinished_list.append(topic_id) print "当前一获取topic number:{}\t 当前已遍历 topic number: {}"\ .format(len(self.havefinished_list)+len(self.waitting_list),len(self.havefinished_list)) self.writeResulttoFile() def writeResulttoFile(self,topic_file=config.TopicFilePath+"zhihu_topic.json",topic_link_file = config.TopicFilePath+'zhihu_topic_link.json'): with codecs.open(topic_file,'w',encoding='utf8') as topic_fp: json.dump(self.record_topic_data,topic_fp,ensure_ascii=False,encoding='utf8') with codecs.open(topic_link_file,'w',encoding='utf8') as topic_link_fp: json.dump(self.record_topic_link_data,topic_link_fp,ensure_ascii=False,encoding='utf8')
class SpiderTopicData(object): """ 获取知乎数据 """ def __init__(self): """ 初始化相关参数 """ self.login = Login() self.login.login() self.headers = self.login.getHeaders() self.data = self.login.getData() self.session = self.login.getSession() # self.session.encoding = 'utf8' # print self.session. # data link list self.havefinished_list = [] # 已经处理的数据连接 self.waitting_list = [] # 未处理的数据连接 self.record_topic_data = dict() self.record_topic_link_data = dict() def setRootTopic(self, root_topic_id='19778317'): self.waitting_list.append(root_topic_id) def relogin(self): """ 短线重连 :return: """ self.login.login() self.headers = self.login.getHeaders() self.data = self.login.getData() self.session = self.login.getSession() def getLinkTopic( self, link_url='https://www.zhihu.com/topic/19778317/organize/entire?parent=19778317' ): """ 获取连接下的数据 :param link_url: :return: 数据 text """ # self.session try: res = self.session.post(link_url, data=self.data, headers=self.headers) time.sleep(random.randint(4, 12)) except: self.relogin() res = self.session.post(link_url, data=self.data, headers=self.headers) time.sleep(random.randint(4, 12)) topic = json.loads(res.text) # topic = eval(res.text) cur_topic = topic['msg'][0] sub_topics = topic['msg'][1] # for test parent_topic_name = cur_topic[1].encode('utf8') parent_topic_id = cur_topic[2] sub_topics_name = [] sub_topics_id = [] for sub in sub_topics: sub_topics_id.append(sub[0][2]) sub_topics_name.append(sub[0][1].encode('utf8')) result = dict() result["parent_topic_id"] = parent_topic_id result["parent_topic_name"] = parent_topic_name result["sub_topics_name"] = sub_topics_name result["sub_topics_id"] = sub_topics_id return result def recordData(self, result): """ 处理爬取的topic 数据,并记录 :param result: :return: """ sub_topics_id = result['sub_topics_id'] sub_topics_name = result['sub_topics_name'] parent_topic_name = result['parent_topic_name'] parent_topic_id = result['parent_topic_id'] existed = False child_topic_id = '' if parent_topic_id not in self.record_topic_data: self.record_topic_data[parent_topic_id] = parent_topic_name for sub_id, sub_name in zip(sub_topics_id, sub_topics_name): if sub_name == str("加载更多"): existed = True child_topic_id = sub_id continue if sub_id not in self.record_topic_data: self.record_topic_data[sub_id] = sub_name if sub_id not in self.record_topic_link_data: self.record_topic_link_data[sub_id] = [] self.record_topic_link_data[sub_id].append(parent_topic_id) return existed, parent_topic_id, child_topic_id def getSubTopic(self, parent_topic_id='19778317', child_topic_id=''): """ 爬取知乎某一话题下的所有子话题,仅爬取话题下一层(即仅爬取当前话题的孩子话题,孙子不管) :param parent_topic_id: 当前话题 ID :param child_topic_id: 子话题 ID应对显示不全时 :return: """ state = True sub_topics_id = [] while state: url_link = 'https://www.zhihu.com/topic/{0}/organize/entire'.format( parent_topic_id) if child_topic_id: url_link += "?child={}&parent={}".format( child_topic_id, parent_topic_id) result = self.getLinkTopic(url_link) sub_topics_id.extend(result['sub_topics_id']) state, parent_topic_id, child_topic_id = self.recordData(result) return sub_topics_id def getAllTopic(self): """ 爬取队列所有话题及子话题 :return: """ while self.waitting_list: topic_id = self.waitting_list.pop(0) if topic_id in self.havefinished_list: continue sub_topics = self.getSubTopic(parent_topic_id=topic_id) self.waitting_list.extend((list(set(sub_topics)))) self.havefinished_list.append(topic_id) print "当前一获取topic number:{}\t 当前已遍历 topic number: {}"\ .format(len(self.havefinished_list)+len(self.waitting_list),len(self.havefinished_list)) self.writeResulttoFile() def writeResulttoFile(self, topic_file=config.TopicFilePath + "zhihu_topic.json", topic_link_file=config.TopicFilePath + 'zhihu_topic_link.json'): with codecs.open(topic_file, 'w', encoding='utf8') as topic_fp: json.dump(self.record_topic_data, topic_fp, ensure_ascii=False, encoding='utf8') with codecs.open(topic_link_file, 'w', encoding='utf8') as topic_link_fp: json.dump(self.record_topic_link_data, topic_link_fp, ensure_ascii=False, encoding='utf8')