def get_response(self, offset, comment_id):
    """Fetch one page of child comments for the given comment."""
    url = f"https://www.zhihu.com/api/v4/comments/{comment_id}/child_comments"
    self.querystring = {"limit": "20", "offset": offset}
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    response = account.session.get(url, params=self.querystring)
    response.encoding = response.apparent_encoding
    return response
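A minimal usage sketch follows. The class name ChildCommentCrawler, the comment id, and the "data" key in the JSON envelope are assumptions for illustration, not part of the example above.

# Hedged usage sketch: fetch the first page of child comments and print their text.
crawler = ChildCommentCrawler()  # hypothetical class that owns get_response() above
response = crawler.get_response(offset=0, comment_id="1234567890")  # hypothetical id
payload = response.json()
for comment in payload.get("data", []):  # "data" envelope is an assumption
    print(comment.get("content"))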
def get_response(self, offset, question_id):
    """Fetch one page of answers for the given question."""
    # The percent-encoded "include" field list is embedded directly in the URL.
    url = f"https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics"
    self.querystring = {
        # "include" is already part of the URL above, so it is omitted here.
        "limit": "5",
        "offset": offset,
        "sort_by": "default",
    }
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    response = account.session.get(url, params=self.querystring)
    response.encoding = response.apparent_encoding
    return response
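The long query string in the URL above is just the `include` field list, percent-encoded by hand. A sketch of an alternative against the same endpoint: keep the URL bare and let requests do the encoding (the field list is shortened here for readability).

from urllib.parse import unquote

question_id = "35670502"  # sample id, reused from the question example further down
url = f"https://www.zhihu.com/api/v4/questions/{question_id}/answers"
# Decode the hand-encoded list once; requests re-encodes it when sending.
include = unquote("data%5B%2A%5D.is_normal%2Ccomment_count%2Ccontent"
                  "%2Cvoteup_count%2Ccreated_time%2Cupdated_time")
params = {"include": include, "limit": "5", "offset": 0, "sort_by": "default"}
# response = account.session.get(url, params=params)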
def get_response(self, offset, topic_id):
    """Fetch one page of the question timeline feed for the given topic."""
    url = f'https://www.zhihu.com/api/v4/topics/{topic_id}/feeds/timeline_question'
    self.querystring = {
        "include":
        "data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;",
        "limit": "10",
        "offset": offset,
    }
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    response = account.session.get(url, params=self.querystring)
    response.encoding = response.apparent_encoding
    return response
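A hedged sketch of walking the feed this method returns; TopicCrawler, the topic id, and the data/target layout are assumptions made for illustration.

# Hedged sketch: list question ids and titles from one page of the topic feed.
crawler = TopicCrawler()  # hypothetical class that owns get_response() above
response = crawler.get_response(offset=0, topic_id="19550517")  # hypothetical topic id
for item in response.json().get("data", []):  # envelope layout is an assumption
    target = item.get("target", {})
    print(target.get("id"), target.get("title"))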
Example #4
def get_response(self, search_key, offset):
    """Fetch one page of topic search results for the given keyword."""
    url = 'https://www.zhihu.com/api/v4/search_v3'
    self.querystring = {
        "t": "topic",
        "q": search_key,  # pass whatever topic keyword you want to search for here
        "correction": "1",
        "offset": offset,
        "limit": "20",
        "show_all_topics": "1",
    }
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    response = account.session.get(url, params=self.querystring)
    response.encoding = response.apparent_encoding
    return response
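A hedged usage sketch; SearchCrawler and the shape of the search_v3 response are assumptions for illustration.

# Hedged sketch: run a topic search and print the raw result entries.
crawler = SearchCrawler()  # hypothetical class that owns get_response() above
response = crawler.get_response(search_key="Python", offset=0)
for entry in response.json().get("data", []):  # "data" envelope is an assumption
    print(entry)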
def __init__(self, file_path, offset_i, question_id):
    # Log in once up front (get_response() logs in again on every call).
    account = zhihu_login.ZhihuAccount("", "")
    account.login(captcha_lang="en", load_cookies=True)
    # Fetch the first page, then keep paging by offset_i until there is no "next" URL.
    response = self.get_response(0, question_id=question_id)
    dict_data = self.json_to_dict(response.text)
    result_list = []
    i = offset_i
    url, self.result_list = self.get_result(dict_data, result_list)
    while url:
        response = self.get_response(i, question_id=question_id)
        print(i)
        dict_data = self.json_to_dict(response.text)
        # print(dict_data)
        url, self.result_list = self.get_result(dict_data, self.result_list)
        i = i + offset_i
        self.save_to_excel(file_path)
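The helpers json_to_dict, get_result, and save_to_excel are not shown on this page. A minimal sketch of what the first two presumably do, assuming the usual v4 data/paging envelope:

import json

def json_to_dict(self, text):
    # Sketch of the missing helper: parse the raw response body into a dict.
    return json.loads(text)

def get_result(self, dict_data, result_list):
    # Sketch of the missing helper: collect this page's items and return the
    # next-page URL (empty string when the API says this is the last page).
    # The "data"/"paging" keys are assumptions about the API envelope.
    result_list.extend(dict_data.get("data", []))
    paging = dict_data.get("paging", {})
    url = "" if paging.get("is_end") else paging.get("next", "")
    return url, result_list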
Example #6
def __init__(self, search_key, file_path, offset_i):
    # Log in once up front (get_response() logs in again on every call).
    account = zhihu_login.ZhihuAccount('', '')
    account.login(captcha_lang='en', load_cookies=True)
    # Fetch the first page, then keep paging by offset_i until there is no "next" URL.
    response = self.get_response(search_key, 0)
    dict_data = self.json_to_dict(response.text)
    result_list = []
    i = offset_i
    url, self.result_list = self.get_result(dict_data, result_list)
    while url:
        response = self.get_response(search_key, i)
        print(i)
        # Debug helpers, kept commented out:
        # f = open('zhihu.html', 'w', encoding='utf-8')
        # f.write(response.text)
        # f2 = open('www.txt', 'w')
        # f2.write(url)
        dict_data = self.json_to_dict(response.text)
        print(dict_data)
        url, self.result_list = self.get_result(dict_data, self.result_list)
        i = i + offset_i
        self.save_to_excel(file_path)
def get_pins() -> None:
    # Fetch the (up to 10) most recent pins from the last month.
    account = zhihu_login.ZhihuAccount(private_config.username,
                                       private_config.password)
    if account.login():
        begin_date = datetime.date.today() - datetime.timedelta(days=30)
        end_date = datetime.date.today()
        pins = account.session.get(
            f'https://www.zhihu.com/api/v4/creator/content_statistics/pins?'
            f'begin_date={begin_date}&end_date={end_date}').json()['data']
        for pin in pins:
            # Load each pin's page and extract its text content via XPath.
            response = account.session.get(
                f'https://www.zhihu.com/pin/{pin["url_token"]}')
            html = etree.HTML(response.text)
            content = html.xpath(
                '//*[@id="root"]/div/main/div/div/div[2]/div[1]/span'
            )[0].xpath('string(.)')
            pin['content'] = content
            print(pin)
            p = Pin(**pin)
            p.save()
def get_articles() -> None:
    # Fetch the (up to 10) most recent articles from the last month.
    account = zhihu_login.ZhihuAccount(private_config.username,
                                       private_config.password)
    if account.login():
        begin_date = datetime.date.today() - datetime.timedelta(days=30)
        end_date = datetime.date.today()
        articles = account.session.get(
            f'https://www.zhihu.com/api/v4/creator/content_statistics/articles?'
            f'begin_date={begin_date}&end_date={end_date}').json()['data']
        # Articles live on zhuanlan.zhihu.com, so point the session's Host header there.
        account.session.headers['Host'] = 'zhuanlan.zhihu.com'
        for article in articles:
            response = account.session.get(
                f'https://zhuanlan.zhihu.com/p/{article["url_token"]}')
            html = etree.HTML(response.text)
            content = html.xpath(
                '//*[@id="root"]/div/main/div/article/div[1]/div')[0].xpath(
                    'string(.)')
            article['content'] = content
            print(article)
            a = Article(**article)
            a.save()
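One caveat worth noting: the Host header above is set on the shared session, so it would also ride along on any later requests to www.zhihu.com. A small sketch of resetting it afterwards (an addition, not part of the original example):

# Drop the zhuanlan Host header once the article loop is done, so the same
# session keeps sending the correct Host when it talks to www.zhihu.com again.
account.session.headers.pop('Host', None)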
Example #9

def restart_program():
    """Restarts the current program.
    Note: this function does not return. Any cleanup action (like
    saving data) must be done before calling this function."""
    python = sys.executable
    os.execl(python, python, *sys.argv)


if __name__ == "__main__":
    isold = input(
        'Would you like to use the last login cookies? (yes/no) Default=yes\n')
    # Reuse cached cookies unless the answer contains an "n" (e.g. "no").
    load_cookies = 'n' not in isold
    whether, session = zhihu_login.ZhihuAccount('', '').login('en', load_cookies)
    print('login ' + str(whether) + ' ' + str(session))
    # main_from_me(session)
    try:
        targetname = valprivate['target_name']
    except KeyError:
        targetname = val['target_name']
    num_url = 0
    while num_url < utils.targetindex:
        main_from_enter(session, num_url, targetname[num_url])
        print(str(utils.theusingua))
        print('the proxy that was chosen is ' + str(utils.chooseproxy))
        num_url += 1  # advance to the next target
"""
用于爬取问题关注数,被浏览数,问题的本体和内容,问题的提出者
"""

from lxml import etree
import pandas

import zhihu_login

question_id = "35670502"
f = open('question.html', 'w', encoding='utf-8')
result_list = []
one_info = {}
account = zhihu_login.ZhihuAccount('', '')
account.login(captcha_lang='en', load_cookies=True)

# Save the question page to disk, then parse the saved copy.
url = f"https://www.zhihu.com/question/{question_id}"
response = account.session.get(url)
f.write(response.text)
f.flush()  # make sure the page is on disk before etree.parse() reads it
html = etree.parse('question.html', etree.HTMLParser())
result = html.xpath('//*[@class="NumberBoard-itemValue"]/@title')
one_info["关注者"] = result[0]  # followers
one_info["被浏览"] = result[1]  # page views
print('关注者:' + result[0])  # followers
print('被浏览:' + result[1])  # page views

url = "https://zhihu-web-analytics.zhihu.com/api/v2/za/logs/batch"
response = account.session.post(url)
f.write(response.text)
html = etree.parse('question.html', etree.HTMLParser())
result = html.xpath('//*[@class="QuestionHeader-title"]/text()')
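The excerpt stops here; a minimal continuation sketch (the English "title" key and print label are additions for illustration):

# Hedged continuation: store and show the question title extracted above.
if result:
    one_info["title"] = result[0]
    print('title: ' + result[0])
f.close()  # the saved page is no longer needed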