Beispiel #1
0
def getLatestBestAnserwerAndSave(ans_num=20, topic_id=19550867, token_file='token.pkl'):
    """Save the first ``ans_num`` best answers of a Zhihu topic.

    A login token cached in ``token_file`` is reused when the file exists;
    otherwise an interactive terminal login is performed and the resulting
    token is written to ``token_file``.

    Each answer is wrapped with ``ansItem``, converted with
    ``ansItem2artical`` and persisted through the resulting object's
    ``save()`` method.

    :param ans_num: maximum number of best answers to save (non-negative
        int); ``None`` saves every best answer the topic yields.
    :param topic_id: id of the topic whose best answers are read.
    :param token_file: path of the cached login token.
    """
    # Local import so the block does not depend on a module-level import.
    from itertools import islice

    client = ZhihuClient()
    if os.path.isfile(token_file):
        client.load_token(token_file)
    else:
        client.login_in_terminal()
        client.save_token(token_file)

    best_answers = client.topic(topic_id).best_answers
    # islice stops after ans_num items without requesting one more.
    for answer in islice(best_answers, ans_num):
        ansItem2artical(ansItem(answer)).save()
# print('关注人数', topic.follower_count)
# print('关注人', topic.followers)
# print('关注人数', topic.followers_count)
# print('话题ID', topic.id)
# print('介绍', topic.introduction)
# print('名称', topic.name)
# print('父话题数', topic.parent_count)
# print('父话题详情', topic.parents)
# print('已回答问题数', topic.question_count)
# print('已回答问题个数', topic.questions_count)
# print('未回答问题数', topic.unanswered_count)
# print('未回答问题', topic.unanswered_questions)

# ================== Query all unanswered questions under a topic ==================
# NOTE(review): `client` is not defined in this fragment; presumably a logged-in
# ZhihuClient created earlier in the file — confirm.
tid = 19668865
topic = client.topic(tid)
question_line = []
for question in topic.unanswered_questions:
    # Timestamp taken when processing of this question starts.
    entry_start_time = time.time()
    # Copy the question's attributes into plain module-level names.
    allow_delete = question.allow_delete
    answer_count = question.answer_count
    answers = question.answers
    comment_count = question.comment_count
    comments = question.comments
    detail = question.detail
    excerpt = question.excerpt
    follower_count = question.follower_count
    followers = question.followers
    # NOTE(review): this rebinds the builtin name `id` at module level.
    id = question.id
    redirection = question.redirection
    status = question.status
Beispiel #3
0
# print('关注人数', topic.follower_count)
# print('关注人', topic.followers)
# print('关注人数', topic.followers_count)
# print('话题ID', topic.id)
# print('介绍', topic.introduction)
# print('名称', topic.name)
# print('父话题数', topic.parent_count)
# print('父话题详情', topic.parents)
# print('已回答问题数', topic.question_count)
# print('已回答问题个数', topic.questions_count)
# print('未回答问题数', topic.unanswered_count)
# print('未回答问题', topic.unanswered_questions)

# ================== Query all unanswered questions under a topic ==================
# NOTE(review): `client` is not defined in this fragment; presumably a logged-in
# ZhihuClient created earlier in the file — confirm.
tid = 19668865
topic = client.topic(tid)
question_line = []
for question in topic.unanswered_questions:
    # Timestamp taken when processing of this question starts.
    entry_start_time = time.time()
    # Copy the question's attributes into plain module-level names.
    allow_delete = question.allow_delete
    answer_count = question.answer_count
    answers = question.answers
    comment_count = question.comment_count
    comments = question.comments
    detail = question.detail
    excerpt = question.excerpt
    follower_count = question.follower_count
    followers = question.followers
    # NOTE(review): this rebinds the builtin name `id` at module level.
    id = question.id
    redirection = question.redirection
    status = question.status
Beispiel #4
0
                items2['thanks count'] = str(answer.thanks_count)

                items2['updated time'] = answer_ut

                answer_numbers += 1

                items['answer' + str(a)] = items2
                a += 1

            # print('------------------------')
            f.write(json.dumps(items, indent=2, ensure_ascii=False))

    return answer_numbers


# Main topic: save the answers of the root topic, then of each child topic,
# printing the running total after every call.
answer_numbers_all = 0
topic_id = 21239580  # topic id for "novel coronavirus pneumonia"
#topic_id = 21238418  # topic id for "novel coronavirus"
topic = client.topic(topic_id)
topic_children = topic.children

# NOTE(review): the running total is passed INTO save_answer and its return
# value is then ADDED to the same total. If save_answer returns the updated
# running total rather than a per-topic count, this double-counts — confirm
# against the head of save_answer.
answer_numbers_all += save_answer(topic, answer_numbers_all)
print('answer_numbers_all: ', answer_numbers_all)
for topic_child in topic_children:
    answer_numbers_all += save_answer(topic_child, answer_numbers_all)
    print('answer_numbers_all: ', answer_numbers_all)

print('answer numbers :', answer_numbers_all)
Beispiel #5
0
# Path of the cached login token; it is reused when the file exists.
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()

if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    try:
        client.login('email_or_phone', 'password')
    except NeedCaptchaException:
        # Write the captcha image to disk so the user can read it, then
        # retry the login with the text they type in.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login('email_or_phone', 'password', captcha)
    client.save_token(TOKEN_FILE)

topic = client.topic(int(topic_id))
print(topic.name)

# Logging configuration: errors only, written to zhi.log, which is
# truncated on every run (filemode='w').
logging.basicConfig(level=logging.ERROR,
                format='%(asctime)s %(levelname)s %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S',
                filename='zhi.log',
                filemode='w')

# Resume a previous run when its workbook already exists.
if os.path.exists('知乎-{}.xlsx'.format(file_name)):
    # Open the queue file in a `with` block so the handle is closed right
    # after loading instead of being left to the garbage collector.
    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only safe if queue.pkl is always written by this program — confirm.
    with open("queue.pkl", "rb") as queue_file:
        queue = pickle.load(queue_file)
    wb = load_workbook('知乎-{}.xlsx'.format(file_name))
    sheet = wb.active
    # One row is subtracted from the sheet's row count (presumably the header row).
    data_rows = sheet.max_row - 1
    print("上次进度已加载!")
Beispiel #6
0
import os
from zhihu_oauth import ZhihuClient

# Path of the cached login token.
TOKEN_FILE = 'token.pkl'

# Log in: reuse the cached token when the file exists, otherwise log in
# interactively in the terminal and cache the new token.
client = ZhihuClient()

if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# Exercise the Topic class: print two counters, then every follower's id and name.
topic = client.topic(19551275)
print(topic.followers_count)
print(topic.best_answers_count)
for fol in topic.followers:
    print(fol.id, fol.name)

# Original author's note: no API is provided for a topic's followers; only the
# follower count can be obtained, not the individual followers.
# NOTE(review): the loop above does iterate topic.followers, which appears to
# contradict this note — confirm which one is current.
# The topic's best answerers cannot be retrieved either:
# print(type(topic.best_answerers))
# for answerer in topic.best_answerers:
#     print(answerer.id)
# The error reported is:
# return People(data['id'], data, self._session)
# KeyError: u'id'
# Why is that?
Beispiel #7
0
class Crawler:
    """Crawl Zhihu entities and their relations into a SQLite database.

    Four entity tables (``userinfo``, ``answerinfo``, ``questioninfo``,
    ``topicinfo``) are keyed by Zhihu id and seven relation tables link those
    ids together. Every table is filled by the method of the same name;
    ``justdoit`` dispatches to those methods by table name.
    """

    # Tables that ``justdoit`` knows how to fill; each has a same-named method.
    _HANDLER_TABLES = frozenset((
        "userinfo", "answerinfo", "questioninfo", "topicinfo",
        "question_answers", "question_topics", "question_users",
        "topic_questions", "topic_users", "user_users", "user_topics",
    ))

    def __init__(self, dbname, email, key):
        """Open the database and log in to Zhihu.

        :param dbname: database path handed to ``sqlite3.connect``.
        :param email: account name passed to ``ZhihuClient.login``.
        :param key: password passed to ``ZhihuClient.login``.
        """
        self.con = sqlite3.connect(dbname)
        self.cursor = self.con.cursor()
        self.zhclient = ZhihuClient()
        try:
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            # Best effort: the crawler is still constructed even though this
            # login attempt did not complete.
            print("需要输入验证码,账号 %s 可能已失效" %(email))

    def __del__(self):
        """Close the database connection if one was opened."""
        # __init__ may fail before `con` is assigned; do not raise from here.
        con = getattr(self, "con", None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the current transaction."""
        self.con.commit()

    def createindextables(self):
        """Create every table and index used by the crawler, then commit.

        Raises ``sqlite3.OperationalError`` if a table or index already exists.
        """
        tables = (
            'create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)',
            'create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)',
            'create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)',
            'create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)',
            'create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)',
            'create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)',
            'create table question_users(question_id,question_title text,user_id,user_name text,record_time text)',
            'create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)',
            # record_time is required here: user_users() inserts into it.
            'create table user_users(user_id,user_follower_id,record_time text)',
            'create table question_topics(question_id,topic_id,topic_name text,record_time text)',
            'create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)',
        )
        indexes = (
            'create index userinfoidx on userinfo(id)',
            'create index answerinfoidx on answerinfo(id)',
            'create index questioninfoidx on questioninfo(id)',
            'create index topicinfoidx on topicinfo(id)',
            'create index topic_questionsidx on topic_questions(topic_id,question_id)',
            'create index topic_usersidx on topic_users(topic_id,user_id)',
            'create index question_usersidx on question_users(question_id,user_id)',
            'create index question_answersidx on question_answers(question_id,answer_id)',
            'create index user_usersidx on user_users(user_id,user_follower_id)',
            'create index question_topicsidx on question_topics(question_id,topic_id)',
            'create index user_topicsidx on user_topics(user_id,topic_id)',
        )
        for statement in tables + indexes:
            self.cursor.execute(statement)
        self.dbcommit()

    def justdoit(self, table1, field1, table2, field2):
        """Crawl every id found in ``table1.field1`` but not in ``table2.field2``.

        The missing ids are handed one by one to the method named ``table2``.
        Nothing is crawled when ``table2`` is not one of the known tables.

        :param table1: table listing the ids that should exist.
        :param field1: column of ``table1`` holding those ids.
        :param table2: table to fill; also selects the handler method.
        :param field2: column of ``table2`` holding the ids already stored.
        :return: always ``None``.
        """
        # Table and column names cannot be bound as SQL parameters, so they
        # are formatted into the statement; callers must pass trusted names.
        done = set(self.cursor.execute(
            "select DISTINCT  {} from {}".format(field2, table2)).fetchall())
        wanted = set(self.cursor.execute(
            "select DISTINCT {} from {}".format(field1, table1)).fetchall())
        if table2 not in self._HANDLER_TABLES:
            return None
        handler = getattr(self, table2)
        for row in wanted - done:
            # fetchall() yields tuples; the id is the first column.
            handler(row[0])
            if table2 == "answerinfo":
                # Short pause between answer requests.
                time.sleep(0.1)
        return None

    def topic_questions(self, topic_id):
        """Store the questions behind a topic's best answers in ``topic_questions``.

        Unlike the sibling relation methods, ``GetDataErrorException`` is
        logged and then re-raised to the caller.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            ques_set = set()
            for hot_ques in shield(topic.best_answers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel(
                    "topic_questions", "topic_id", "question_id", topic.id, hot_ques.question.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if hot_ques.question.id in ques_set:
                    continue
                ques_set.add(hot_ques.question.id)
                values = (topic.id, topic.name, hot_ques.question.id, hot_ques.question.title, record_time)
                self.cursor.execute(
                    "insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", hot_ques.question.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def topic_users(self, topic_id, start_at=0):
        """Store a topic's followers in ``topic_users``.

        :param topic_id: id of the topic.
        :param start_at: forwarded to ``shield`` as its ``start_at`` argument.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(topic.followers, start_at=start_at, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("topic_users", "topic_id", "user_id", topic.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (topic.id, topic.name, follower.id, follower.name, record_time)
                self.cursor.execute(
                    "insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", topic.name, follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_users(self, question_id):
        """Store a question's followers in ``question_users``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(question.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("question_users", "question_id", "user_id", question.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (question.id, question.title, follower.id, follower.name, record_time)
                self.cursor.execute(
                    "insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", follower.name, question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_answers(self, question_id):
        """Store a question's answers and their authors in ``question_answers``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            answer_set = set()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if answer.id in answer_set:
                    continue
                answer_set.add(answer.id)
                values = (question.id, question.title, answer.id, answer.author.id, record_time)
                self.cursor.execute(
                    "insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            # The message used to name the wrong exception type.
            print("Pass the ZhihuWarning")

    def user_users(self, user_id):
        """Store a user's followers in ``user_users``.

        Original author's note: the Zhihu API returns at most 5020 followers
        per user.
        """
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            user_set = set()
            for follower in shield(people.followers, action=SHIELD_ACTION.PASS):
                status = self.isdupicaterel("user_users", "user_id", "user_follower_id", people.id, follower.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in user_set:
                    continue
                user_set.add(follower.id)
                values = (people.id, follower.id, record_time)
                self.cursor.execute(
                    "insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def question_topics(self, question_id):
        """Store the topics a question belongs to in ``question_topics``."""
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            topic_set = set()
            for topic in shield(question.topics):
                status = self.isdupicaterel("question_topics", "question_id", "topic_id", question.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in topic_set:
                    continue
                topic_set.add(topic.id)
                values = (question.id, topic.id, topic.name, record_time)
                self.cursor.execute(
                    "insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", topic.name, question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    def user_topics(self, user_id):
        """Store the topics a user follows in ``user_topics``."""
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            topic_set = set()
            for topic in shield(people.following_topics):
                status = self.isdupicaterel("user_topics", "user_id", "topic_id", people.id, topic.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in topic_set:
                    continue
                topic_set.add(topic.id)
                values = (people.id, people.name, topic.id, topic.name, record_time)
                self.cursor.execute(
                    "insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", people.name, topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    def isdupicateid(self, table, id):
        """Return the rowid of the row in ``table`` whose ``id`` matches, else ``None``.

        Also commits the connection, which flushes any pending write.
        """
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        row = cur.fetchone()
        self.dbcommit()
        return None if row is None else row[0]

    def isdupicaterel(self, table, field1, field2, id1, id2):
        """Return the rowid of the ``(field1=id1, field2=id2)`` row in ``table``, else ``None``.

        Also commits the connection, which flushes any pending write.
        """
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table, field1, field2), (id1, id2))
        row = cur.fetchone()
        self.dbcommit()
        return None if row is None else row[0]

    def userinfo(self, user_id):
        """Fetch one user's profile and store it in ``userinfo`` unless already present."""
        try:
            status = self.isdupicateid("userinfo", user_id)
            if status is not None:
                print("重复,rowid", status)
                return
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            # Multi-valued profile fields are flattened into '|'-separated text.
            address = "|".join(location.name for location in people.locations)
            school_name = "|".join(
                education.school.name for education in people.educations if "school" in education)
            job = "|".join(
                employment.job.name for employment in people.employments if "job" in employment)
            company = "|".join(
                employment.company.name for employment in people.employments if "company" in employment)
            business = people.business.name if people.business else None
            # Badge-derived fields.
            badge = people.badge
            identity = badge.identity if badge.has_identity else None
            if badge.is_best_answerer:
                best_topics = "".join(topic.name for topic in badge.topics)
            else:
                best_topics = None
            if badge.is_organization:
                is_organization = 1
                org_name = badge.org_name
                org_home_page = badge.org_home_page
                org_industry = badge.org_industry
            else:
                is_organization = 0
                org_name = None
                org_home_page = None
                org_industry = None
            values = (
                people.id, people.name, people.headline, people.gender, address, business,
                school_name, job, company,
                people.answer_count, people.question_count, people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count, people.following_question_count,
                people.following_topic_count, people.collected_count, identity, best_topics,
                is_organization, org_name, org_home_page, org_industry, record_time)
            self.cursor.execute(
                "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", people.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def answerinfo(self, answer_id):
        """Fetch one answer and store it in ``answerinfo`` unless already present.

        :return: ``("重复,rowid", rowid)`` when the answer is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status is not None:
                return ("重复,rowid", status)
            answer = self.zhclient.answer(answer_id)
            record_time = self.logtime()
            values = (answer.id, answer.content, answer.author.id, answer.voteup_count, answer.thanks_count,
                      answer.comment_count, answer.created_time, answer.updated_time, record_time)
            self.cursor.execute(
                "insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", answer.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # Drop the unfetchable answer from question_answers right away so
            # it is not crawled again and again after switching accounts.
            self.cursor.execute("delete from question_answers where answer_id = ?", (answer_id,))
            # Commit immediately so the delete survives even if nothing else
            # is written before the process stops.
            self.dbcommit()
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def questioninfo(self, question_id):
        """Fetch one question and store it in ``questioninfo`` unless already present.

        :return: ``("重复,rowid", rowid)`` when the question is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status is not None:
                return ("重复,rowid", status)
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            values = (question.id, question.title, question.follower_count, question.answer_count,
                      question.created_time, question.updated_time, record_time)
            self.cursor.execute(
                "insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def topicinfo(self, topic_id):
        """Fetch one topic and store it in ``topicinfo`` unless already present.

        :return: ``("重复,rowid", rowid)`` when the topic is already stored,
            otherwise ``None``.
        """
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status is not None:
                return ("重复,rowid", status)
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            values = (topic.id, topic.name, topic.best_answer_count, topic.follower_count,
                      topic.question_count, record_time)
            self.cursor.execute(
                "insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)",
                values)
            self.dbcommit()
            print("正在处理", topic.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")

    def logtime(self):
        """Return today's local date formatted as ``YYYY-MM-DD``."""
        return time.strftime('%Y-%m-%d', time.localtime())

    def add_counts(self, filepath="logincounts.txt"):
        """Read login accounts from a text file.

        Each non-blank line has the form ``<account>----<key>``; the line is
        split on the first ``----`` only, so a key may itself contain dashes.

        :param filepath: path of the accounts file.
        :return: list of ``{"count": account, "key": key}`` dicts in file order.
        :raises ValueError: if a non-blank line contains no ``----`` separator.
        """
        counts = []
        # `with` closes the file even if a malformed line raises.
        with open(filepath) as handle:
            for line in handle:
                if not line.strip():
                    continue
                count, key = line.split("----", 1)
                counts.append({"count": count, "key": key.strip("\n")})
        return counts

    def get_proxy(self):
        """Fetch one proxy from the local proxy pool.

        :return: the response body on HTTP 200; ``None`` on any other status
            or when the pool cannot be reached.
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError, which is not a subclass
            # of the builtin one; catch both so a dead pool returns None.
            return None
        if response.status_code == 200:
            return response.text
        return None
Beispiel #8
0
class ZhiHu(object):
    """Collect questions of a Zhihu topic into ``zhihu.db`` and export them as Markdown."""

    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """Log in to Zhihu, then open the ``zhihu.db`` database."""
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """Create ``self.client`` and authenticate it.

        The token cached in ``TOKEN_FILE`` is loaded when that file exists;
        otherwise an interactive terminal login runs and its token is saved.
        """
        self.client = ZhihuClient()
        if not os.path.isfile(self.TOKEN_FILE):
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)
        else:
            self.client.load_token(self.TOKEN_FILE)

    def save_quesions(self, topic_id):
        """Store a topic's unanswered-questions listing in the ``questions`` table.

        Questions with fewer than 10 answers are skipped. Each stored row is
        ``[id, title, follower_count, answer_count, comment_count, topic_id]``.

        :param topic_id: id of the topic to read.
        :return: None
        """
        topic = self.client.topic(topic_id)
        print(topic)
        replace_sql = 'replace into questions values(?,?,?,?,?,?)'
        for question in topic.unanswered_questions:
            if question.answer_count >= 10:
                row = [
                    question.id,
                    question.title,
                    question.follower_count,
                    question.answer_count,
                    question.comment_count,
                    topic_id,
                ]
                print(row)
                stored = self.db.update(replace_sql, args=row)
                print('insert success!' if stored else 'insert error!')

    def save_answer_info(self, question_id):
        """Print a summary of a question's first answer and call its ``save()``.

        Only the first answer yielded by ``question.answers`` is handled.

        :param question_id: id of the question to read.
        :return: None
        """
        question = self.client.question(question_id)
        print(question.title)
        first_answer = next(iter(question.answers), None)
        if first_answer is not None:
            print(first_answer.comment_count, first_answer.excerpt, first_answer.question,
                  first_answer.thanks_count, first_answer.voteup_count)
            first_answer.save()

    def to_md(self, topic, file_name):
        """Write up to 1000 stored questions of ``topic`` as a numbered Markdown list.

        Rows are ordered by follower count, highest first; numbering starts at 1.

        :param topic: topic id used to filter the ``questions`` table.
        :param file_name: path of the UTF-8 file to (over)write.
        """
        # NOTE(review): `topic` is formatted straight into the SQL text; only
        # safe for trusted ids unless the db helper supports bound parameters.
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        rows = self.db.query(sql)
        template = "%s. [%s](https://www.zhihu.com/question/%s) 关注数:%s 回答数:%s 评论数:%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as out:
            for rank, item in enumerate(rows, start=1):
                out.write(template % (
                    rank,
                    item['title'],
                    item['id'],
                    item['follower_count'],
                    item['answer_count'],
                    item['comment_count'],
                ))
Beispiel #9
0

# Path of the cached login token.
TOKEN_FILE = 'token.cache'
# NOTE(review): presumably the number of hot topics to keep — confirm against its use.
TOP_SIZE = 50

# Log in: reuse the cached token when the file exists, otherwise log in
# interactively in the terminal and cache the new token.
client = ZhihuClient()

if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# The topmost (root) topic of the topic tree
root_topic = client.topic(19776749)
# List storing the top hot topics
hot_topics = list()
# Whether hot_topics is full
hot_topics_full = False
# The smallest topic currently held in hot_topics
last_topic = {}
# Name of the output file
file_name = 'result'
# Number of topics searched so far
search_count = 0
# Output interval (write results once per this many searches)
output_time = 500
# Number of log entries to retain
log_num = 100
# Tree depth at which to restart after the program is paused or interrupted
Beispiel #10
0
def login():
    """Log in to Zhihu and save the answers found under one topic to disk.

    A token stored in ``token.pkl`` is loaded when that file exists;
    otherwise an interactive terminal login is performed and the resulting
    token is saved to that file.  The function then walks the topic's
    ``unanswered_questions``, prints each question's id, title and answer
    count, calls ``answer.save`` for every answer of every question, and
    finally prints the sum of the questions' ``answer_count`` values.

    Returns:
        None.
    """
    TOKEN_FILE = 'token.pkl'
    client = ZhihuClient()

    if os.path.isfile(TOKEN_FILE):
        client.load_token(TOKEN_FILE)
    else:
        client.login_in_terminal()
        client.save_token(TOKEN_FILE)

    # The string literal below is disabled sample code kept for reference;
    # as a bare expression statement it has no runtime effect.
    """
    me = client.me()
    print('name', me.name)
    print('headline', me.headline)
    print('description', me.description)

    print('following topic count', me.following_topic_count)
    print('following people count', me.following_topic_count)
    print('followers count', me.follower_count)

    print('voteup count', me.voteup_count)
    print('get thanks count', me.thanked_count)

    print('answered question', me.answer_count)
    print('question asked', me.question_count)
    print('collection count', me.collection_count)
    print('article count', me.articles_count)
    print('following column count', me.following_column_count)

    # 获取最近 5 个回答
    for _, answer in zip(range(5), me.answers):
        print(answer.question.title, answer.voteup_count)

    print('----------')

    # 获取点赞量最高的 5 个回答
    for _, answer in zip(range(5), me.answers.order_by('votenum')):
        print(answer.question.title, answer.voteup_count)

    print('----------')

    # 获取最近提的 5 个问题
    for _, question in zip(range(5), me.questions):
        print(question.title, question.answer_count)

    print('----------')

    # 获取最近发表的 5 个文章
    for _, article in zip(range(5), me.articles):
        print(article.title, article.voteup_count)
    """
    topic = client.topic(19560072)  # genetically modified organisms (GMO)
    # topic = client.topic(19578906)  # climate change
    # topic = client.topic(19551296)  # online games

    # Build the output root with os.path.join instead of hard-coded
    # backslashes so the Data/Gene hierarchy is also created correctly on
    # non-Windows systems (on Windows the resulting path is unchanged).
    save_root = os.path.join('Data', 'Gene')

    answers_count = 0
    for question in topic.unanswered_questions:
        print(question.id)
        print(question.title)
        print(question.answer_count)
        answers_count += question.answer_count

        # One target path per question: "<question id>#<question title>".
        # NOTE(review): titles may contain characters that are not valid in
        # file names; confirm whether answer.save sanitises them.
        question_dir = os.path.join(save_root, str(question.id) + '#' + question.title)
        for answer in question.answers:
            print(answer.author.id,answer.author.name)
            # Second argument is "<author id>#<author name>".
            answer.save(question_dir, str(answer.author.id) + '#' + answer.author.name)
    print("总共有{0}个回答".format(answers_count))
#     	print ("author: {0}".format(item.author.name))
#     	counter-=1


# a1 = client.answer(143216281)
# #https://www.zhihu.com/question/20251786/answer/143216281
# print (a1.author.answer_count)
# # author’s profile and influence.
# print ("name: {0}".format(a1.author.name))
# print ("collected_count: {0}".format(a1.author.collected_count))
# print ("favorited_count: {0}".format(a1.author.favorited_count))
# print ("follower_count: {0}".format(a1.author.follower_count))
# print ("voteup_count: {0}".format(a1.author.voteup_count))
# #print ("is_best_answerer: {0}".format(a1.author.is_best_answerer))


# Export per-question statistics of one topic to a CSV file.
output_file = "./question.csv"
headers = ["Qid", "Followers", "Created_time", "Answer_count"]

#https://www.zhihu.com/topic/20019119/top-answers
# NOTE(review): the URL above is the topic's top-answers page, while the code
# reads unanswered_questions; confirm which listing is intended.
topic1 = client.topic(20019119)
questions = topic1.unanswered_questions
# One tuple per question, in the same column order as `headers`.
rows = [(v._id, v.follower_count, v.created_time, v.answer_count) for v in questions]

# The file is opened in append mode, so the header row is only written when
# the file is new or empty; otherwise every run would insert another header
# line in the middle of the data.
needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

# newline='' lets the csv module control line endings itself; without it
# extra blank lines appear between rows on Windows.
with open(output_file, 'a', newline='') as f:
    f_csv = csv.writer(f)
    if needs_header:
        f_csv.writerow(headers)
    f_csv.writerows(rows)
Beispiel #12
0
import os

import MySQLdb
from zhihu_oauth import ZhihuClient

from getUser import getUser

TOKEN_FILE = 'token.pkl'

# Log in, reusing a previously saved token file when one exists
client = ZhihuClient()

if not os.path.isfile(TOKEN_FILE):
    # No saved token yet: log in interactively, then save the token to TOKEN_FILE
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# NOTE(review): no topic id is passed here, unlike the other client.topic(...)
# calls in this file; confirm the intended id.
topic = client.topic()

if __name__ == '__main__':
    conn = MySQLdb.connect(host='127.0.0.1',
                           user='******',
                           passwd='root',
                           port=3306,
                           charset='utf8')
    cur = conn.cursor()
    u_table = 'topic_' + str(topic.name)
    cre_utable = 'create table IF NOT EXISTS %s (uid VARCHAR (50) PRIMARY KEY ,name VARCHAR (20),gender VARCHAR (10),headline VARCHAR (400),description VARCHAR (1000),que_count INT ,ans_count INT ,art_count INT ,column_ INT ,column_fol_sum INT ,collection INT ,coll_ans_sum INT ,coll_fol_sum INT ,voteup INT ,thanks INT ,collected INT ,shared INT ,art_vote_sum INT ,following INT ,follower INT ,fol_column INT ,fol_topic INT ,fol_topic_name MEDIUMTEXT,fol_ques INT ,location VARCHAR (200),business VARCHAR (50),school VARCHAR (200),major VARCHAR (200),company VARCHAR (200),job VARCHAR (200), avatar VARCHAR (10),avatar_url VARCHAR (100),weibo VARCHAR (10),weibo_name VARCHAR (50),weibo_url VARCHAR (50), give_ans_vote INT, give_art_vote INT, topic_name VARCHAR (200))' % u_table
    ins_utable = 'insert into ' + u_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'

    try:
        # cur.execute('set interactive_timeout=96*3600')
        # cur.execute('CREATE DATABASE IF NOT EXISTS zhihu DEFAULT CHARSET utf8 COLLATE utf8_unicode_ci')