def get_response(self, offset, comment_id):
    """Fetch one page (20 items) of child comments for *comment_id*.

    Side effect: stores the request parameters on ``self.querystring``.
    Returns the raw ``requests`` response with its encoding normalised
    to the auto-detected one.
    """
    endpoint = f"https://www.zhihu.com/api/v4/comments/{comment_id}/child_comments"
    # NOTE(review): a fresh account/login is performed on every call —
    # presumably cheap because cookies are loaded from disk; confirm.
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    self.querystring = {"limit": "20", "offset": offset}
    resp = account.session.get(endpoint, params=self.querystring)
    resp.encoding = resp.apparent_encoding
    return resp
def get_response(self, offset, question_id):
    """Fetch one page (5 items) of answers for *question_id*.

    The long percent-encoded ``include`` selector is baked into the URL
    itself rather than passed via ``params``.  Side effect: stores the
    page parameters on ``self.querystring``.  Returns the raw response
    with its encoding normalised to the auto-detected one.
    """
    endpoint = f"https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics"
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    self.querystring = {
        "limit": "5",
        "offset": offset,
        "sort_by": "default",
    }
    resp = account.session.get(endpoint, params=self.querystring)
    resp.encoding = resp.apparent_encoding
    return resp
def get_response(self, offset, topic_id):
    """Fetch one page (10 items) of the question timeline for *topic_id*.

    Side effect: stores the request parameters (including the verbose
    ``include`` field selector) on ``self.querystring``.  Returns the
    raw response with its encoding normalised to the auto-detected one.
    """
    endpoint = f'https://www.zhihu.com/api/v4/topics/{topic_id}/feeds/timeline_question'
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    self.querystring = {
        "include": "data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;",
        "limit": "10",
        "offset": offset,
    }
    resp = account.session.get(endpoint, params=self.querystring)
    resp.encoding = resp.apparent_encoding
    return resp
def get_response(self, search_key, offset):
    """Fetch one page (20 items) of topic search results for *search_key*.

    Side effect: stores the request parameters on ``self.querystring``.
    Returns the raw response with its encoding normalised to the
    auto-detected one.
    """
    endpoint = 'https://www.zhihu.com/api/v4/search_v3'
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    self.querystring = {
        "t": "topic",
        "q": search_key,  # whichever topic you want — just pass it in here
        "correction": "1",
        "offset": offset,
        "limit": "20",
        "show_all_topics": "1",
    }
    resp = account.session.get(endpoint, params=self.querystring)
    resp.encoding = resp.apparent_encoding
    return resp
def __init__(self, file_path, offset_i, question_id):
    """Crawl all answer pages of *question_id* and save them to *file_path*.

    Pages through the API in steps of *offset_i*, accumulating parsed
    rows until ``get_result`` stops returning a next-page URL, then
    writes everything out via ``save_to_excel``.
    """
    # Login side effect only: the session created here warms the cookie
    # cache that get_response's own login presumably reuses — TODO confirm.
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    response = self.get_response(0, question_id=question_id)
    dict_data = self.json_to_dict(response.text)
    result_list = []
    i = offset_i
    # get_result appears to return (next_page_url, accumulated_rows);
    # the falsy url terminates the loop below.
    url, self.result_list = self.get_result(dict_data, result_list)
    while url:
        response = self.get_response(i, question_id=question_id)
        print(i)  # progress indicator: current offset
        dict_data = self.json_to_dict(response.text)
        # NOTE(review): the local name `result_list` is rebound here while
        # `self.result_list` is passed in — this only accumulates correctly
        # if get_result mutates the list it receives; verify.
        url, result_list = self.get_result(dict_data, self.result_list)
        i = i + offset_i
    self.save_to_excel(file_path)
def __init__(self, search_key, file_path, offset_i):
    """Crawl all topic search pages for *search_key* and save to *file_path*.

    Pages through the search API in steps of *offset_i*, accumulating
    parsed rows until ``get_result`` stops returning a next-page URL,
    then writes everything out via ``save_to_excel``.
    """
    # Login side effect only: warms the cookie cache that get_response's
    # own login presumably reuses — TODO confirm.
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    response = self.get_response(search_key, 0)
    dict_data = self.json_to_dict(response.text)
    result_list = []
    i = offset_i
    # get_result appears to return (next_page_url, accumulated_rows);
    # a falsy url terminates the loop below.
    url, self.result_list = self.get_result(dict_data, result_list)
    while url:
        response = self.get_response(search_key, i)
        print(i)  # progress indicator: current offset
        dict_data = self.json_to_dict(response.text)
        print(dict_data)  # debug dump of the parsed page payload
        # NOTE(review): the local name `result_list` is rebound here while
        # `self.result_list` is passed in — this only accumulates correctly
        # if get_result mutates the list it receives; verify.
        url, result_list = self.get_result(dict_data, self.result_list)
        i = i + offset_i
    self.save_to_excel(file_path)
def get_pins() -> None:
    """Fetch the recent pins ("想法") from the last 30 days and persist them.

    Logs in with the configured account, pulls pin statistics from the
    creator API, scrapes each pin's page for its text content, and saves
    each one as a ``Pin`` model instance.  Does nothing if login fails.

    BUGFIX: the annotation was ``NoReturn``, which in ``typing`` means the
    function never returns normally (e.g. always raises or exits); this
    function returns ``None``, so the annotation is corrected.
    """
    account = zhihu_login.ZhihuAccount(private_config.username,
                                       private_config.password)
    if account.login():
        begin_date = datetime.date.today() - datetime.timedelta(days=30)
        end_date = datetime.date.today()
        pins = account.session.get(
            f'https://www.zhihu.com/api/v4/creator/content_statistics/pins?'
            f'begin_date={begin_date}&end_date={end_date}').json()['data']
        for pin in pins:
            response = account.session.get(
                f'https://www.zhihu.com/pin/{pin["url_token"]}')
            html = etree.HTML(response.text)
            # Fragile absolute XPath into the rendered page; string(.)
            # collapses the matched <span> subtree into plain text.
            content = html.xpath(
                '//*[@id="root"]/div/main/div/div/div[2]/div[1]/span'
            )[0].xpath('string(.)')
            pin['content'] = content
            print(pin)  # debug: show the enriched record before saving
            p = Pin(**pin)
            p.save()
def get_articles() -> None:
    """Fetch the recent articles from the last 30 days and persist them.

    Logs in with the configured account, pulls article statistics from
    the creator API, scrapes each article's zhuanlan page for its body
    text, and saves each one as an ``Article`` model instance.  Does
    nothing if login fails.

    BUGFIX: the annotation was ``NoReturn``, which in ``typing`` means the
    function never returns normally; this function returns ``None``, so
    the annotation is corrected.
    """
    account = zhihu_login.ZhihuAccount(private_config.username,
                                       private_config.password)
    if account.login():
        begin_date = datetime.date.today() - datetime.timedelta(days=30)
        end_date = datetime.date.today()
        articles = account.session.get(
            f'https://www.zhihu.com/api/v4/creator/content_statistics/articles?'
            f'begin_date={begin_date}&end_date={end_date}').json()['data']
        # NOTE(review): this mutates the shared session's headers for the
        # zhuanlan requests below and is never restored — any later request
        # on this session keeps the overridden Host; confirm intentional.
        account.session.headers['Host'] = 'zhuanlan.zhihu.com'
        for article in articles:
            response = account.session.get(
                f'https://zhuanlan.zhihu.com/p/{article["url_token"]}')
            html = etree.HTML(response.text)
            # Fragile absolute XPath; string(.) flattens the article body.
            content = html.xpath(
                '//*[@id="root"]/div/main/div/article/div[1]/div')[0].xpath(
                    'string(.)')
            article['content'] = content
            print(article)  # debug: show the enriched record before saving
            a = Article(**article)
            a.save()
def restart_program():
    """Restart the current program.

    Note: this function does not return.  Any cleanup action (like
    saving data) must be done before calling this function.
    """
    python = sys.executable
    os.execl(python, python, *sys.argv)


if __name__ == "__main__":
    isold = input(
        'Would you like to use the last login cookies? (yes/no)Default=yes\n')
    # Any answer containing an 'n' counts as "no".  Both branches of the
    # original if/else issued the identical login call, so the duplication
    # is folded into a single call parameterised by load_cookies.
    load_cookies = 'n' not in isold
    whether, session = zhihu_login.ZhihuAccount('', '').login(
        'en', load_cookies)
    print('login ' + str(whether) + ' ' + str(session))
    # main_from_me(session)
    try:
        # Prefer the private config's target list; fall back to the shared one.
        targetname = valprivate['target_name']
    except KeyError:
        targetname = val['target_name']
    num_url = 0
    while num_url < utils.targetindex:
        main_from_enter(session, num_url, targetname[num_url])
        # BUGFIX: the counter was never advanced, so the loop re-ran the
        # first target forever.  NOTE(review): if main_from_enter advanced
        # a global counter instead, remove this increment.
        num_url += 1
    print(str(utils.theusingua))
    print('the proxy which was chosed is ' + str(utils.chooseproxy))
"""
Scrape a question's follower count, view count, the question itself and
its content, and the question's author.
"""
from lxml import etree
import pandas
import zhihu_login

question_id = "35670502"
# Raw HTML dump of the question page; parsed again below via etree.parse.
f = open('question.html', 'w', encoding='utf-8')
result_list = []
one_info = {}
account = zhihu_login.ZhihuAccount('', '')
account.login(captcha_lang='en', load_cookies=True)
url = f"https://www.zhihu.com/question/{question_id}"
response = account.session.get(url)
f.write(response.text)
# NOTE(review): `f` is neither flushed nor closed before this parse, so
# etree may read a partially-written (or empty) file — verify/fix.
html = etree.parse('question.html', etree.HTMLParser())
result = html.xpath('//*[@class="NumberBoard-itemValue"]/@title')
one_info["关注者"] = result[0]
one_info["被浏览"] = result[1]
print('关注者:' + result[0])  # follower count
print('被浏览:' + result[1])  # view count
# Analytics endpoint; its response is appended to the same dump file.
url = "https://zhihu-web-analytics.zhihu.com/api/v2/za/logs/batch"
response = account.session.post(url)
f.write(response.text)
html = etree.parse('question.html', etree.HTMLParser())
# Question title text; presumably consumed further down the file.
result = html.xpath('//*[@class="QuestionHeader-title"]/text()')