Python ZhihuClient.question Beispiele, zhihu_oauth.ZhihuClient.question Python Beispiele

Beispiel #1

0

Datei anzeigen

def crawling(id):
    """Download every image embedded in the answers of one Zhihu question.

    Images are stored in a directory named after the question title, one file
    per image, named ``<author>_<running number>.<extension>``. Files that
    already exist are not downloaded again.

    :param id: the Zhihu question id.
    """
    client = ZhihuClient()
    # Log in by loading a previously saved token file.
    client.load_token('token.pkl')
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)
    path = question.title
    if not os.path.exists(path):
        os.mkdir(path)
    # Compiled once instead of once per answer. Group 1 is the image URL,
    # group 2 the file extension (jpg or png).
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    index = 1  # running image number across all answers
    for i, answer in enumerate(question.answers):
        content = answer.content  # HTML body of the answer
        author = answer.author.name
        for img_url, extension in img_pattern.findall(content):
            # Keep the real extension so a PNG is not stored as ".jpg".
            image_name = author + '_' + str(index) + '.' + extension
            target = path + '/' + image_name
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / question.answer_count * 100))
            index += 1
        # i is 0-based, so i + 1 answers are finished at this point.
        print('第%d个答案爬取完成,当前总进度%.2f%%' %
              (i + 1, (i + 1) / question.answer_count * 100))

Beispiel #2

0

Datei anzeigen

    except NeedCaptchaException:
        # 保存验证码并提示输入，重新登录
        print u'登录失败，需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
response_file_uri = './question_response.html'  # 将json输出到网页中，chrome下按F12选preview能看见浏览器渲染出的json数据结构
question_id = 35005800
question = client.question(question_id)
data = question.pure_data
response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

response_file_uri = './people_response.html'  # 将json输出到网页中，chrome下按F12选preview能看见浏览器渲染出的json数据结构
people_id = '404-Page-Not-found'
people = client.people(people_id)
for i in people.answers:
    data = i.pure_data
    response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

Beispiel #3

0

Datei anzeigen

Datei: Spider.py Projekt: guo602/ZhiHu-Spider

# Restore the login session from the saved token file.
client.load_token('token.pkl')

me = client.me()

# Disabled example: inspect a single answer and its voters.
# answer = client.answer(94150403)
# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)
# for voter in answer.voters:
#     print(voter.name, voter.headline)

question_number = [20787350]

for q in question_number:
    question = client.question(q)
    print(question.title)

    # Save answers with index 0..666 and stop as soon as index 667 shows up.
    for index, answer in enumerate(question.answers):
        if index > 666:
            break
        print(answer.author.name, answer.voteup_count)
        answer.save(question.title)

Beispiel #4

0

Datei anzeigen

class Crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname, email, key):
        """Open the SQLite database and log the Zhihu client in.

        :param dbname: database path handed to ``sqlite3.connect``.
        :param email: account name used for ``ZhihuClient.login``.
        :param key: password used for ``ZhihuClient.login``.

        A ``NeedCaptchaException`` raised by the login is only reported on
        stdout; the instance is still created.
        """
        self.con = sqlite3.connect(dbname)
        self.cursor = self.con.cursor()
        self.zhclient = ZhihuClient()
        try:
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            print("需要输入验证码，账号 %s 可能已失效" % email)
        # Disabled alternative kept for reference: call
        # login_in_terminal(email, key) instead, or reuse a saved 'token.pkl'
        # via load_token()/save_token() when the token file exists.

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    #建立数据表
    def createindextables(self):
        self.cursor.execute('create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)')
        self.cursor.execute('create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)')
        self.cursor.execute('create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)')
        self.cursor.execute('create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)')

        self.cursor.execute('create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)')
        self.cursor.execute('create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_users(question_id,question_title text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)')
        self.cursor.execute('create table user_users(user_id,user_follower_id)')
        self.cursor.execute('create table question_topics(question_id,topic_id,topic_name text,record_time text)')
        self.cursor.execute('create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)')

        self.cursor.execute('create index userinfoidx on userinfo(id)')
        self.cursor.execute('create index answerinfoidx on answerinfo(id)')
        self.cursor.execute('create index questioninfoidx on questioninfo(id)')
        self.cursor.execute('create index topicinfoidx on topicinfo(id)')

        self.cursor.execute('create index topic_questionsidx on topic_questions(topic_id,question_id)')
        self.cursor.execute('create index topic_usersidx on topic_users(topic_id,user_id)')
        self.cursor.execute('create index question_usersidx on question_users(question_id,user_id)')
        self.cursor.execute('create index question_answersidx on question_answers(question_id,answer_id)')
        self.cursor.execute('create index user_usersidx on user_users(user_id,user_follower_id)')
        self.cursor.execute('create index question_topicsidx on question_topics(question_id,topic_id)')
        self.cursor.execute('create index user_topicsidx on user_topics(user_id,topic_id)')

        self.dbcommit()

    # #多线程尝试
    # def crawl_data(self,work_set,table1,field1,table2,field2):
    #     if table2 == "userinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.userinfo(subid)
    #     elif table2 == "answerinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.answerinfo(subid)
    #             # time.sleep(0.8)
    #             # time.sleep(0.5)
    #     elif table2 == "questioninfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.questioninfo(subid)
    #     elif table2 == "topicinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topicinfo(subid)
    #     elif table2 == "question_answers":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_answers(subid)
    #     elif table2 == "question_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_topics(subid)
    #     elif table2 == "question_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_users(subid)
    #     elif table2 == "topic_questions":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_questions(subid)
    #     elif table2 == "topic_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_users(subid)
    #     elif table2 == "user_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_users(subid)
    #     elif table2 == "user_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_topics(subid)
    #     return None


    def justdoit(self,table1,field1,table2,field2):
        set2 =set(self.cursor.execute("select DISTINCT  {} from {}".format(field2,table2)).fetchall())
        set1 = set(self.cursor.execute("select DISTINCT {} from {}".format(field1,table1)).fetchall())
        work_set = set1-set2
        # work_set = list(set1 - set2)
        # splitlen = int(len(work_set) / 2)
        # subwork_set = [work_set[i:i + splitlen] for i in range(0, len(work_set), splitlen)]
        # threads = []
        # for i in range(0,len(subwork_set)):
        #     t = multiprocessing.Process(target=self.crawl_data,args=(subwork_set[i],table1,field1,table2,field2))
        #     threads.append(t)
        # for t in threads:
        #     t.start()
        #     t.join()
        if table2 == "userinfo":
            for subid in work_set:
                subid = subid[0]
                self.userinfo(subid)
        elif table2 == "answerinfo":
            for subid in work_set:
                subid = subid[0]
                self.answerinfo(subid)
                # time.sleep(1.0)
                time.sleep(0.1)
        elif table2 == "questioninfo":
            for subid in work_set:
                subid = subid[0]
                self.questioninfo(subid)
        elif table2 == "topicinfo":
            for subid in work_set:
                subid = subid[0]
                self.topicinfo(subid)
        elif table2 == "question_answers":
            for subid in work_set:
                subid = subid[0]
                self.question_answers(subid)
        elif table2 == "question_topics":
            for subid in work_set:
                subid = subid[0]
                self.question_topics(subid)
        elif table2 == "question_users":
            for subid in work_set:
                subid = subid[0]
                self.question_users(subid)
        elif table2 == "topic_questions":
            for subid in work_set:
                subid = subid[0]
                self.topic_questions(subid)
        elif table2 == "topic_users":
            for subid in work_set:
                subid = subid[0]
                self.topic_users(subid)
        elif table2 == "user_users":
            for subid in work_set:
                subid = subid[0]
                self.user_users(subid)
        elif table2 == "user_topics":
            for subid in work_set:
                subid = subid[0]
                self.user_topics(subid)
        return None

    #话题-(精华)问题关系
    def topic_questions(self,topic_id):
        """Store the topic -> question links found among a topic's best answers.

        Walks ``topic.best_answers`` (wrapped in ``shield``) and inserts one
        ``topic_questions`` row per distinct question that is not yet stored
        for this topic, committing after every insert.

        :param topic_id: id passed to ``self.zhclient.topic``.
        :raises GetDataErrorException: reported on stdout, then re-raised.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen_question_ids = set()
            for best_answer in shield(topic.best_answers, action=SHIELD_ACTION.PASS):
                stored_rowid = self.isdupicaterel(
                    "topic_questions", "topic_id", "question_id",
                    topic.id, best_answer.question.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if best_answer.question.id in seen_question_ids:
                    continue
                seen_question_ids.add(best_answer.question.id)
                row = (topic.id, topic.name, best_answer.question.id,
                       best_answer.question.title, record_time)
                self.cursor.execute("insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)", row)
                self.dbcommit()
                print("正在处理", best_answer.question.id)
        except GetDataErrorException:
            # Unlike the sibling methods, this one propagates the error.
            print("Pass the GetDataErrorException")
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #话题-关注者关系
    def topic_users(self,topic_id,start_at = 0):
        """Store the topic -> follower links of one topic.

        Walks ``topic.followers`` (wrapped in ``shield``) and inserts one
        ``topic_users`` row per distinct follower not yet stored for this
        topic, committing after every insert.

        :param topic_id: id passed to ``self.zhclient.topic``.
        :param start_at: forwarded unchanged to ``shield`` as ``start_at``.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen_user_ids = set()
            for follower in shield(topic.followers, start_at=start_at, action=SHIELD_ACTION.PASS):
                stored_rowid = self.isdupicaterel(
                    "topic_users", "topic_id", "user_id", topic.id, follower.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if follower.id in seen_user_ids:
                    continue
                seen_user_ids.add(follower.id)
                row = (topic.id, topic.name, follower.id, follower.name, record_time)
                self.cursor.execute("insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)", row)
                self.dbcommit()
                print("正在处理",topic.name,follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-关注者关系
    def question_users(self, question_id):
        """Store the question -> follower links of one question.

        Walks ``question.followers`` (wrapped in ``shield``) and inserts one
        ``question_users`` row per distinct follower not yet stored for this
        question, committing after every insert.

        :param question_id: id passed to ``self.zhclient.question``.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen_user_ids = set()
            for follower in shield(question.followers, action=SHIELD_ACTION.PASS):
                stored_rowid = self.isdupicaterel(
                    "question_users", "question_id", "user_id",
                    question.id, follower.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if follower.id in seen_user_ids:
                    continue
                seen_user_ids.add(follower.id)
                row = (question.id, question.title, follower.id, follower.name, record_time)
                self.cursor.execute(
                    "insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)", row)
                self.dbcommit()
                print("正在处理",follower.name,question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-回答关系
    def question_answers(self, question_id):
        """Store the question -> answer links of one question.

        Walks ``question.answers`` (wrapped in ``shield``) and inserts one
        ``question_answers`` row per distinct answer not yet stored for this
        question, committing after every insert.

        :param question_id: id passed to ``self.zhclient.question``.

        ``GetDataErrorException``, ``UnexpectedResponseException`` and
        ``ZhihuWarning`` are reported on stdout and swallowed.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            answer_set = set()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在，正在跳过")
                    continue
                if answer.id in answer_set:
                    continue
                answer_set.add(answer.id)
                values = (question.id, question.title, answer.id, answer.author.id, record_time)
                self.cursor.execute("insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            # The message used to name UnexpectedResponseException here, which
            # made the log point at the wrong failure.
            print("Pass the ZhihuWarning")

    #获取用户-用户关注关系，知乎有5020限制，api限制最多获取一个用户5020粉丝
    def user_users(self,user_id):
        """Store the user -> follower links of one user.

        Walks ``people.followers`` (wrapped in ``shield``) and inserts one
        ``user_users`` row per distinct follower not yet stored for this user,
        committing after every insert.

        :param user_id: id passed to ``self.zhclient.people``.
        """
        # NOTE(review): the insert below writes a record_time column, so the
        # user_users table must have one -- verify against the schema in use.
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen_follower_ids = set()
            for follower in shield(people.followers, action=SHIELD_ACTION.PASS):
                stored_rowid = self.isdupicaterel(
                    "user_users", "user_id", "user_follower_id",
                    people.id, follower.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if follower.id in seen_follower_ids:
                    continue
                seen_follower_ids.add(follower.id)
                row = (people.id, follower.id, record_time)
                self.cursor.execute("insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)", row)
                self.dbcommit()
                print("正在处理",follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #获取问题-话题关系
    def question_topics(self,question_id):
        """Store the question -> topic links of one question.

        Walks ``question.topics`` (wrapped in ``shield``) and inserts one
        ``question_topics`` row per distinct topic not yet stored for this
        question, committing after every insert.

        :param question_id: id passed to ``self.zhclient.question``.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen_topic_ids = set()
            for topic in shield(question.topics):
                stored_rowid = self.isdupicaterel(
                    "question_topics", "question_id", "topic_id",
                    question.id, topic.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if topic.id in seen_topic_ids:
                    continue
                seen_topic_ids.add(topic.id)
                row = (question.id, topic.id, topic.name, record_time)
                self.cursor.execute("insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)", row)
                self.dbcommit()
                print("正在处理", topic.name,question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 获取用户-话题关系
    def user_topics(self, user_id):
        """Store the user -> followed-topic links of one user.

        Walks ``people.following_topics`` (wrapped in ``shield``) and inserts
        one ``user_topics`` row per distinct topic not yet stored for this
        user, committing after every insert.

        :param user_id: id passed to ``self.zhclient.people``.
        """
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen_topic_ids = set()
            for topic in shield(people.following_topics):
                stored_rowid = self.isdupicaterel(
                    "user_topics", "user_id", "topic_id", people.id, topic.id)
                if stored_rowid is not None:
                    print("已存在，正在跳过")
                    continue
                if topic.id in seen_topic_ids:
                    continue
                seen_topic_ids.add(topic.id)
                row = (people.id, people.name, topic.id, topic.name, record_time)
                self.cursor.execute(
                    "insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)",
                    row)
                self.dbcommit()
                print("正在处理", people.name ,topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 判断数据重复
    def isdupicateid(self, table, id):
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        self.dbcommit()
        res = cur.fetchone()
        res = None if res == None else res[0]
        return res

    def isdupicaterel(self,table,field1,field2, id1,id2):
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table,field1,field2), (id1,id2))
        res = cur.fetchone()
        self.dbcommit()
        res = None if res == None else res[0]
        return res

    #个人信息
    def userinfo(self,user_id):
        try:
            status = self.isdupicateid("userinfo",user_id)
            if status==None:
                people = self.zhclient.people(user_id)
                record_time = self.logtime()
                address = "|".join([location.name for location in people.locations])
                school_name = "|".join([education.school.name for education in people.educations if "school" in education])
                job = "|".join([employment.job.name for employment in people.employments if "job" in employment])
                company = "|".join([employment.company.name for employment in people.employments if "company" in employment])
                business = people.business.name if people.business else None
                #勋章判断
                if people.badge.has_identity:
                    identity = people.badge.identity
                else:
                    identity = None
                if people.badge.is_best_answerer:
                    best_topics = "".join([topic.name for topic in people.badge.topics])
                else:
                    best_topics = None
                if people.badge.is_organization:
                    is_organization = 1
                    org_name = people.badge.org_name
                    org_home_page = people.badge.org_home_page
                    org_industry = people.badge.org_industry
                else:
                    is_organization = 0
                    org_name = None
                    org_home_page = None
                    org_industry = None
                values = (
                people.id, people.name, people.headline, people.gender, address, business, school_name, job,company,
                people.answer_count, people.question_count, people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count, people.following_question_count,
                people.following_topic_count, people.collected_count, identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time)
                self.cursor.execute(
                    "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", people.name)
            else:
                print("重复，rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass

    def answerinfo(self,answer_id):
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status == None:
                answer = self.zhclient.answer(answer_id)
                record_time = self.logtime()
                values = (answer.id,answer.content,answer.author.id,answer.voteup_count,answer.thanks_count,answer.comment_count,answer.created_time,answer.updated_time,record_time)
                self.cursor.execute("insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理",answer.id)
            else:
                return ("重复，rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            self.cursor.execute("delete from question_answers where answer_id = ?",(answer_id,))##在从question_answer表中获取及时删除无效问题，方式切换帐号后反复爬去无效问题。
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #问题信息
    def questioninfo(self,question_id):
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status == None:
                question = self.zhclient.question(question_id)
                record_time = self.logtime()
                values = (question.id,question.title,question.follower_count,question.answer_count,question.created_time,question.updated_time,record_time)
                self.cursor.execute("insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理" ,question.title)
            else:
                return ("重复，rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #话题信息
    def topicinfo(self,topic_id):
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status == None:
                topic = self.zhclient.topic(topic_id)
                record_time = self.logtime()
                values=(topic.id,topic.name,topic.best_answer_count,topic.follower_count,topic.question_count,record_time)
                self.cursor.execute("insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理", topic.name)
            else:
                return ("重复，rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #时间戳
    def logtime(self):
        fmt = '%Y-%m-%d'  # 定义时间显示格式
        Date = time.strftime(fmt, time.localtime(time.time()))
        return Date


    def add_counts(self,filepath = "logincounts.txt"):
        counts = []
        for line in open(filepath):
            count = {}
            count["count"], count["key"] = line.split("----")
            count["key"] = count["key"].strip("\n")
            counts.append(count)
        return counts

    def get_proxy(self):
        """Ask the local proxy pool for a proxy address.

        :return: the response body when the pool answers with HTTP 200,
            otherwise ``None`` (non-200 answer or connection failure).
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError, which does not derive
            # from the builtin of the same name; catching both works whether
            # or not this module imports the requests one under that name.
            return None
        if response.status_code == 200:
            return response.text
        return None

Beispiel #5

0

Datei anzeigen

Datei: main_ques_ans_users.py Projekt: allen286/zhihu-crawler

import os

import pymysql
from zhihu_oauth import ZhihuClient

from getAnswer import getAnswer
from getUser import getUser

# Login: reuse the saved token when there is one, otherwise log in
# interactively and save the token for the next run.
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Create the question object; the argument is the question id.
question = client.question(67079761)

if __name__ == '__main__':
    # Connect to the local MySQL database `zhihu_live` (credentials are
    # redacted in this listing).
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******',
                                 port=3306,
                                 db='zhihu_live',
                                 charset='utf8mb4')
    # One pair of tables per question; their names are derived from the
    # question id: "<id>_user" and "<id>_ans".
    u_table = str(question.id) + '_user'
    a_table = str(question.id) + '_ans'
    # DDL and INSERT templates for the user table. The table name is formatted
    # into the SQL text; the %s markers in the INSERT are left in place as
    # placeholders for the row values.
    cre_utable = 'create table IF NOT EXISTS %s (uid VARCHAR (50),name VARCHAR (20),gender VARCHAR (10),headline VARCHAR (400),description VARCHAR (1000),que_count INT ,ans_count INT ,art_count INT ,column_ INT ,column_fol_sum INT ,collection INT ,coll_ans_sum INT ,coll_fol_sum INT ,voteup INT ,thanks INT ,collected INT ,shared INT ,art_vote_sum INT ,following INT ,follower INT ,fol_column INT ,fol_topic INT ,fol_topic_name MEDIUMTEXT,fol_ques INT ,location VARCHAR (200),business VARCHAR (50),school VARCHAR (200),major VARCHAR (200),company VARCHAR (200),job VARCHAR (200), avatar VARCHAR (10),avatar_url VARCHAR (100),weibo VARCHAR (10),weibo_name VARCHAR (50),weibo_url VARCHAR (50), give_ans_vote INT, give_art_vote INT, ans_id INT ,que_title VARCHAR (200))' % u_table
    ins_utable = 'insert into ' + u_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    # Same pair of templates for the answer table.
    cre_atable = 'create TABLE IF NOT EXISTS %s (ans_ques VARCHAR (200),que_id INT ,ans_id INT ,ans_auth VARCHAR (20),ans_cont MEDIUMTEXT ,ans_vote INT ,ans_than INT ,ans_comm INT ,com_perm VARCHAR (20) ,cre_timestamp VARCHAR (30),upd_timestamp VARCHAR (30),cre_time VARCHAR (30),upd_time VARCHAR (30))' % a_table
    ins_atable = 'insert into ' + a_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

Beispiel #6

0

Datei anzeigen

Datei: pazhihu2.py Projekt: cash2one/brush-1

# Collect the names of everyone who answered the listed questions and append
# them to name.txt, one per line.
# (Disabled alternative: sample 1000 random question ids instead.)
# for _ in range(1000):
#     num = random.randint(20000000, 39999999)
for num in [
        40023941, 36582119, 23434853, 37027323, 39124944, 22345285, 26992616,
        28066166, 41035200, 21396519, 35947787, 36851579, 21372989, 47955389,
        37236484, 19861023, 25877081, 27063206, 29166103, 23246914, 38540397,
        36543921, 32158092, 41207814, 41404094, 36734444, 31819473, 29336768,
        32171411, 37184080, 20468104, 36238122, 36573907, 23415802, 30605806,
        37737298, 37059032, 48837193, 48296279, 41053015, 22978737, 22621327,
        42082026, 30470093, 41038770, 21155222, 28489148, 32081129, 32369239,
        30830614, 29213441, 41113819, 36770197, 48831736, 35990525, 48779414,
        22364486, 33032798, 29604768, 21900376, 26500277
]:
    n = 0  # answers seen for the current question
    try:
        question = client.question(num)
        # question = client.from_url('https://www.zhihu.com/question/35166763')
        print(question.title)
        with open("name.txt", 'a', encoding='utf-8') as f:
            for answer in question.answers:
                n += 1
                try:
                    # Skip the "anonymous user" and "[reset]" placeholder names.
                    if answer.author.name != "匿名用户" and answer.author.name != "[已重置]":
                        print(answer.author.name)
                        f.write(answer.author.name + '\n')
                except Exception:
                    # Best effort: one unreadable author must not stop the run.
                    pass
                print(n)
    # "except Exception" instead of a bare except, so Ctrl-C (KeyboardInterrupt)
    # and SystemExit still stop the script.
    except Exception:
        print("空")

Beispiel #7

0

Datei anzeigen

    35367500, 35210878, 35134422, 35062190, 35012924, 35004585, 34893663,
    34370944, 34225657, 33488763, 33259890, 32207070, 31592568, 31365240,
    31337752, 30966406, 30158223, 29735498, 29582607, 29550579, 29525971,
    29519716, 29518811, 29511036, 29508808, 29448162, 27255630, 25951351,
    23863606, 19930380
]
# Crawl the questions from position 200 onwards.
qid_run = qid_remaining[200:]

rows = []      # (answer id, question id, voteup_count, comment_count)
fail_qid = []  # question ids whose crawl raised an exception

counter = 1

for qid in qid_run:
    # Collect into a per-question list first: a question that fails half way
    # is recorded in fail_qid, so its partial rows must not end up in `rows`
    # (they would be duplicated when the failed ids are crawled again).
    question_rows = []
    try:
        ans = client.question(int(qid)).answers
        for v in ans:
            question_rows.append((v._id, qid, v.voteup_count, v.comment_count))
    # "except Exception" instead of a bare except, so Ctrl-C still stops the run.
    except Exception:
        # if crawl failed, append qid to list fail_qid[]
        fail_qid.append(qid)
        print("fail {0}".format(counter))
        print(fail_qid)
    else:
        rows.extend(question_rows)
        print("success {0}".format(counter))
    counter += 1

output_file = "./answers.csv"
headers = ["aid", "qid", "voteup_count", "comment_count"]
with open(output_file, 'a') as f:

Beispiel #8

0

Datei anzeigen

Datei: oauth_test.py Projekt: EleVenPerfect/OTHERS

    except NeedCaptchaException:
        # 保存验证码并提示输入，重新登录
        print u'登录失败，需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
response_file_uri = './question_response.html' # 将json输出到网页中，chrome下按F12选preview能看见浏览器渲染出的json数据结构
question_id = 35005800
question = client.question(question_id)
data = question.pure_data
response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

response_file_uri = './people_response.html' # 将json输出到网页中，chrome下按F12选preview能看见浏览器渲染出的json数据结构
people_id = '404-Page-Not-found'
people = client.people(people_id)
for i in people.answers:
    data = i.pure_data
    response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

Beispiel #9

0

Datei anzeigen

# Reuse the saved token when there is one, otherwise log in interactively and
# save the token for the next run.
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)
    print('login success!')

# Disabled example: inspect a single answer and its voters.
# answer = client.answer(94150403)
#
# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)
#
# for voter in answer.voters:
#     print(voter.name, voter.headline)

question = client.question(35166763)

print(question.title)

# Save the first ten answers; `count` ends as the number of answers saved.
count = 0
for count, answer in enumerate(question.answers, start=1):
    answer.save(r'Data\Answers\\' + question.title)
    if count == 10:
        break

Beispiel #10

0

Datei anzeigen

class ZhiHu(object):
    """Helper that logs in to Zhihu and stores question data via ``self.db``."""

    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """Log in first, then open the ``zhihu.db`` database wrapper."""
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """Create ``self.client`` and authenticate it.

        When ``TOKEN_FILE`` exists its token is loaded; otherwise an
        interactive terminal login runs and the token is saved to that file.

        :return: None
        """
        self.client = ZhihuClient()
        if not os.path.isfile(self.TOKEN_FILE):
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)
            return
        self.client.load_token(self.TOKEN_FILE)

    def save_quesions(self, topic_id):
        """Store the unanswered questions of a topic.

        Questions with fewer than 10 answers are skipped. Each remaining
        question becomes one row passed to ``self.db.update`` and the outcome
        of every write is printed.

        :param topic_id: id handed to ``self.client.topic`` and stored as the
            last column of each row
        :return: None
        """
        topic = self.client.topic(topic_id)
        print(topic)
        insert_sql = 'replace into questions values(?,?,?,?,?,?)'
        for question in topic.unanswered_questions:
            if question.answer_count >= 10:
                row = [
                    question.id,
                    question.title,
                    question.follower_count,
                    question.answer_count,
                    question.comment_count,
                    topic_id,
                ]
                print(row)
                stored = self.db.update(insert_sql, args=row)
                print('insert success!' if stored else 'insert error!')

    def save_answer_info(self, question_id):
        """Print a summary of the first answer of a question and save it.

        Only the first answer yielded by ``question.answers`` is handled;
        nothing happens when there are no answers.

        :param question_id: id handed to ``self.client.question``
        :return: None
        """
        question = self.client.question(question_id)
        print(question.title)
        first_answer = next(iter(question.answers), None)
        if first_answer is not None:
            print(first_answer.comment_count, first_answer.excerpt, first_answer.question,
                  first_answer.thanks_count, first_answer.voteup_count)
            first_answer.save()

    def to_md(self, topic, file_name):
        """Write the stored questions of a topic as a numbered Markdown list.

        Rows come from ``self.db.query`` (ordered by follower count, at most
        1000) and each row is written as one numbered link line, starting
        at 1.

        :param topic: value compared against the ``topic_id`` column
        :param file_name: output path, written as UTF-8 and overwritten
        :return: None
        """
        # NOTE(review): `topic` is interpolated straight into the SQL text;
        # if it can come from untrusted input this should become a bound
        # parameter -- confirm what the query helper supports.
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        ret = self.db.query(sql)
        template = "%s. [%s](https://www.zhihu.com/question/%s) 关注数：%s 回答数：%s 评论数：%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as f:
            for rank, item in enumerate(ret, start=1):
                f.write(template % (rank, item['title'], item['id'], item['follower_count'],
                                    item['answer_count'], item['comment_count']))

Beispiel #11

0

Datei anzeigen

# Entries under `path` are parsed as "<question id>#<rest>": the part before
# the first '#' is converted to int and treated as an already-downloaded id.
path = r"D:\Kuangyichen\Repository_py3\Zhihu\Data\Gene"
download = [int(i.split('#')[0]) for i in os.listdir(path)]

# Ids that are in `questions` but have no matching entry on disk.
# NOTE(review): `questions` is not defined in the visible part of this
# snippet before this line -- confirm where it comes from.
out = set(questions) - set(download)

print(len(out))
for i in out:
    print(i)

    # NOTE(review): everything below is indented inside the `for i in out`
    # loop, so the login and the whole crawl run once per missing id. This
    # looks like two snippets merged together -- confirm the intended nesting.
    TOKEN_FILE = 'token.pkl'
    client = ZhihuClient()

    # Load a saved token when present, otherwise log in interactively and
    # save the token.
    if os.path.isfile(TOKEN_FILE):
        client.load_token(TOKEN_FILE)
    else:
        client.login_in_terminal()
        client.save_token(TOKEN_FILE)
    # `path` is rebound here to a text file that is read as one integer
    # question id per line; `questions` is rebuilt from it.
    path = r'D:\Kuangyichen\Repository_py3\Zhihu\Data\lefted'
    questions = []
    with open(path, 'r', encoding='UTF8') as Reader:
        for line in Reader.readlines():
            questions.append(int(line))
    for q in questions:
        question_t = client.question(q)
        print(str(q) + "start")
        for answer in question_t.answers:
            print(answer.author.id, answer.author.name)
            # Target directory is "<question id>#<title>" and the file name is
            # "<author id>#<author name>".
            answer.save(
                'Data\\Gene\\' + str(question_t.id) + '#' + question_t.title,
                str(answer.author.id) + '#' + answer.author.name)
        print(str(q) + "end")

Beispiel #12

0

Datei anzeigen



client = ZhihuClient()
# Login: try the plain credentials first. When a captcha is required, write
# the captcha image to a.gif, ask the user to type it in, and log in again.
try:
    client.login(ZHIHU_ID, ZHIHU_KEY)
except NeedCaptchaException:
    with open('a.gif', 'wb') as captcha_image:
        captcha_image.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(ZHIHU_ID, ZHIHU_KEY, captcha)

the_question = client.question(QUESTION_ID)
print(the_question.title)



a = 0


with open('all_answers.txt', 'w') as f_txt:
    f_txt.write("昵称&用户关注数&用户粉丝数&用户回答数&用户获赞数&用户获得感谢数&用户性别&用户学校&用户学院&回答内容（去标点符号）&回答日期&最后修改日期&赞数&感谢数&评论数&是否允许评论&是否被建议修改&回答可信度指数&情感分析积极性&情感分析消极性&情感倾向&回答内容")
    for the_answer in the_question.answers:

        the_author = the_answer.author
        author_name = the_author.name

Beispiel #13

0

Datei anzeigen

Datei: 知乎-查询问题.py Projekt: alicewish/Python-Learning

from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # 初始时间戳

# ========================登录========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ======================== Query the question ========================
qid = 48217184
question = client.question(qid)

# (label, attribute name) pairs, printed in exactly this order. Each
# attribute is read from the question only when its line is printed.
fields = (
    ('允许删除', 'allow_delete'),
    ('答案数', 'answer_count'),
    ('答案', 'answers'),
    ('评论数', 'comment_count'),
    ('评论', 'comments'),
    ('细节', 'detail'),
    ('摘录', 'excerpt'),
    ('关注数', 'follower_count'),
    ('关注人', 'followers'),
    ('问题ID', 'id'),
    ('重定向', 'redirection'),
    ('状态', 'status'),
    ('建议修改', 'suggest_edit'),
    ('标题', 'title'),
    ('话题', 'topics'),
)
for label, attribute in fields:
    print(label, getattr(question, attribute))
# The update time is a timestamp; show it as local "YYYY-MM-DD HH:MM:SS".
print('更新时间', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(question.updated_time)))

Beispiel #14

0

Datei anzeigen

Datei: zhi-answer.py Projekt: canvas-J/script_crawler

TOKEN_FILE = 'token.pkl'
client = ZhihuClient()

# No cached token: log in with credentials (retrying once with a captcha the
# user types in, if one is demanded) and cache the token. Otherwise just load
# the cached token.
if not os.path.isfile(TOKEN_FILE):
    try:
        client.login('email_or_phone', 'password')
    except NeedCaptchaException:
        with open('a.gif', 'wb') as captcha_image:
            captcha_image.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login('email_or_phone', 'password', captcha)
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

question = client.question(int(question_id))
print(question.title)

# Fresh workbook whose active sheet gets the column names in row 1.
wb = Workbook()
sheet = wb.active
sheet.title = "知乎"
item_name = [
    'time_now',
    'content',
    'author',
    'gender',
    'loc',
    'business',
    'company',
    'job',
    'created_time',
    'updated_time',
    'voteup_count',
    'comment_count',
    'thanks_count',
]
# Sheet columns are 1-based, so enumerate from 1.
for column, heading in enumerate(item_name, start=1):
    sheet.cell(row=1, column=column).value = heading
num = 0
for answer in question.answers:
    num += 1
    item_data = [datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')]

Beispiel #15

0

Datei anzeigen

Datei: pic_one.py Projekt: wat1r/PythonTestOne

# @Email  : [email protected]
# @File   : save_images.py
'''
@Description:保存知乎某个问题下所有答案的图片
'''
from __future__ import print_function  # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib.request

client = ZhihuClient()
# Log in by loading a previously saved token file.
client.load_token('token.pkl')
id = 24400664  # https://www.zhihu.com/question/24400664 ("What is it like to be good-looking?")
question = client.question(id)
print(u"问题:", question.title)
print(u"回答数量:", question.answer_count)
# Folder that receives the images. exist_ok=True lets the script be re-run:
# a bare os.mkdir raised FileExistsError when the folder was already there.
path = question.title + u"(图片)"
os.makedirs(path, exist_ok=True)
index = 1  # running image number
for answer in question.answers:
    content = answer.content  # 回答内容
    re_compile = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    img_lists = re.findall(re_compile, content)
    if (img_lists):
        for img in img_lists:
            img_url = img[0]  # 图片url
            urllib.request.urlretrieve(img_url, path + u"/%d.jpg" % index)

Beispiel #16

0

Datei anzeigen

Datei: ZhihuSpider.py Projekt: jinhuiphy/Spider

# One collection per question id inside the 'Zhihu' database.
Zhihu = dbClient['Zhihu']
ZhihuData = Zhihu[str(questionID)]
# NOTE(review): this tests the truthiness of whatever find() returns, not
# whether any document exists -- confirm it can ever be falsy with the
# driver in use; otherwise the collection is always cleared.
if ZhihuData.find():
    ZhihuData.remove({})
# Log in to the Zhihu account. When a captcha is required, write the image to
# a.gif, ask the user to type it in, and log in again.
client = ZhihuClient()
try:
    client.login(account, passwd)
except NeedCaptchaException:
    with open('a.gif', 'wb') as captcha_image:
        captcha_image.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, passwd, captcha)
# Build the question object and show its title before reading the answers.
question = client.question(questionID)
print(question.title)
count = 0
for answer in question.answers:
    count+=1
    try:
        data = {
            'title':question.title,
            'author':answer.author.name,
            'description':answer.author.description,
            'content':answer.content,
            'voteup':answer.voteup_count,
            'thanks':answer.thanks_count
        }
        print("正在保存第%s个回答" %count)

Beispiel #17

0

Datei anzeigen

# tableNum = mysql.cur.execute("SELECT concat('DROP TABLE IF EXISTS ', table_name, ';') ""FROM information_schema.tables WHERE table_schema = 'urldata';")
# if tableNum == 0:
#     print("该数据库下所有表删除完毕\n---------------------------------------")
# else:
#     print("删除失败")
#
# 在数据库(urldata)中新建表
try:
    mysql.cur.execute(
        "create table answer(answer_id int(4),author_name varchar(40),author_healine varchar(40),agree_num int(4),comment_num int(4),thanks_count int(4), url varchar(100))"
    )
    mysql.cur.execute(
        "create table comments(currentanswer_id int(4),commentID int(4), commentpersonName varchar(40), words varchar(300))"
    )
    # 爬取知乎热榜第一的所有回答//5G发放牌照
    question = client.question(328058110)
    for answer in question.answers:
        try:
            mysql.cur.execute(
                "insert into answer values(%d,'%s','%s',%d,%d,%d,'%s')" %
                (answer.id, answer.author.name, answer.author.headline,
                 answer.voteup_count, answer.comment_count,
                 answer.thanks_count, answer._build_url()))
        except BaseException:
            print("a answer is nelected")
    question = client.question(328058110)
    for answer in question.answers:
        print(answer.pure_data)
    # 爬取知乎热榜所有评论
    question = client.question(328058110)
    for answer in question.answers:

Beispiel #18

0

Datei anzeigen

Datei: question_test.py Projekt: allen286/zhihu-crawler

def time2str(timeStamp):
    '''Format a Unix timestamp as a local-time "YYYY-MM-DD HH:MM:SS" string.'''
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timeStamp))


def ucps2str(ucpstr):
    '''Convert unicode code point (in hex) ascii string to unicode string.

    The input is read as consecutive 4-hex-digit groups, each decoded to one
    character. Trailing characters that do not fill a whole group are ignored.

    :param ucpstr: string of hex digits, e.g. '4e2d6587'
    :return: the decoded string ('' for an empty input)
    :raises ValueError: if a group is not valid hexadecimal
    '''
    # `unichr` only exists on Python 2; `chr` is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    # Step through whole groups with an integer range: `len(ucpstr) / 4` is a
    # float on Python 3 and made range() raise TypeError.
    return ''.join(
        to_char(int(ucpstr[start:start + 4], 16))
        for start in range(0, len(ucpstr) - 3, 4)
    )


question = client.question(20840874)#哪些东西买了之后，会让人因生活质量和幸福感提升而感觉相见恨晚？

# 通过question类的answers这个生成器属性可以获取到每个回答的author（answer.author类），进而过去回答者的档案信息
# for answer in question.answers:
#     print(answer.author.name+ str(answer.author.answer_count) + ' ' + str(answer.voteup_count))
#     print(answer.author.id)  #为什么id是这样的？10fc5f92b8f7f7cd1a058d10a0f36ce0

# for answer in question.answers:
#     # print('id',ucps2str(answer.author.id),answer.author.id.decode('hex'),binascii.unhexlify(answer.author.id))
#     print('id', answer.author.i)
#     print('uid',answer.author.uid)
#     print('name', answer.author.name)
#     print('gender', answer.author.gender)
#     print('headline', answer.author.headline)
#     print('description', answer.author.description)
#     print('\n')