Beispiel #1
0
def zhihu_login(account='*****@*****.**', password='a4906639'):
    """Return a ZhihuClient, restoring a saved token when one exists.

    If TOKEN_FILE exists, the session is loaded from it. Otherwise the client
    logs in with *account* / *password* and the resulting token is written to
    TOKEN_FILE. In both cases the name of the logged-in user is printed.

    Args:
        account: Login e-mail or phone number. Defaults to the value that was
            previously hard-coded in the body.
        password: Login password. Defaults to the previously hard-coded value.

    Returns:
        The ZhihuClient instance.
    """
    # NOTE(review): the default credentials still live in source control;
    # callers should pass their own and the defaults should be removed.
    client = ZhihuClient()

    if os.path.isfile(TOKEN_FILE):
        client.load_token(TOKEN_FILE)
    else:
        client.login(account, password)
        client.save_token(TOKEN_FILE)
    me = client.me()
    print(me.name)
    return client
Beispiel #2
0
def zhihu_login():
    """Build a ZhihuClient, print the current user's name, and return the client.

    The session is loaded from TOKEN_FILE when that file exists; otherwise a
    password login is performed and the new token is saved to TOKEN_FILE.
    """
    client = ZhihuClient()

    token_missing = not os.path.isfile(TOKEN_FILE)
    if token_missing:
        # No saved token yet: log in and save one for the next run.
        client.login('*****@*****.**', 'a4906639')
        client.save_token(TOKEN_FILE)
    else:
        client.load_token(TOKEN_FILE)

    print(client.me().name)
    return client
Beispiel #3
0
def main():
    """Log a ZhihuClient in with a user-typed captcha, then run the export steps.

    Calls log_in, prompts for the captcha text, logs in, and finally runs
    get_data, write_hk_student_info and write_modules in that order.
    """
    # login
    zhihu = ZhihuClient()
    log_in(zhihu)
    Image('./a.gif')
    captcha_text = input('please input captcha:')
    zhihu.login('account', 'psw', captcha_text)

    get_data(zhihu)
    write_hk_student_info()
    write_modules()
Beispiel #4
0
def login(username, password):
    """Log in to Zhihu, save the token to 'token.pkl', and return the client.

    If the first login attempt raises NeedCaptchaException, the captcha image
    is written to 'a.gif', the user is prompted for its text, and the login
    is retried with that captcha.

    Args:
        username: Account e-mail or phone number.
        password: Account password.

    Returns:
        The logged-in ZhihuClient. The original discarded the client, so
        callers could not use the session they had just created.
    """
    from zhihu_oauth import ZhihuClient
    from zhihu_oauth.exception import NeedCaptchaException
    client = ZhihuClient()
    try:
        client.login(username, password)
        print(u"登陆成功!")
    except NeedCaptchaException:  # a captcha is required
        # Save the captcha image, prompt for its text, then log in again.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login(username, password, captcha)
    client.save_token('token.pkl')  # save the token
    return client
Beispiel #5
0
def login(account, password):
    """Return a ZhihuClient, loading a saved token or logging in afresh.

    The token is first loaded from TOKEN_FILE. If that file does not exist,
    a password login is performed; when Zhihu asks for a captcha, the image is
    written to './captcha/a.gif', the user is prompted for its text, and the
    login is retried. After a fresh login the token is saved to
    './token/token.pkl'.

    The original returned from a ``finally`` clause, which discards any
    in-flight exception and hands back an unauthenticated client; errors now
    propagate to the caller. The token is also saved after a login that did
    not need a captcha, where the original only saved it on the captcha path.

    Args:
        account: Account e-mail or phone number.
        password: Account password.

    Returns:
        The ZhihuClient instance.
    """
    client = ZhihuClient()
    try:
        client.load_token(TOKEN_FILE)
    except FileNotFoundError:
        pass
    else:
        return client

    try:
        client.login(account, password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        with open('./captcha/a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login(account, password, captcha)
    # NOTE(review): the token is loaded from TOKEN_FILE but saved to this
    # literal path; confirm that both refer to the same file.
    client.save_token('./token/token.pkl')
    return client
def main():
    """Log in to Zhihu and crawl the answers of every user id in USER_CSV_PATH.

    Each line of USER_CSV_PATH is treated as one user id. For every id a
    MyCrawler is created and crawling_answer is called; when the returned
    sequence has a length that is a multiple of 60 it is cut into slices of
    60 items which are queued, otherwise a message is printed. After every
    ``max_lines`` ids the queued slices are passed to save_to_csv_a.

    The session token is saved to TOKEN_FILE after either login path; the
    original skipped saving whenever a captcha was required.
    """
    client = ZhihuClient()

    try:
        # client.login(email_or_phone, password)
        client.login_in_terminal(username=email_or_phone, password=password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = input('请输入验证码: ')
        client.login(email_or_phone, password, captcha)
    # Save the login session so that later runs can reuse it.
    client.save_token(TOKEN_FILE)

    data_out_list_a = []
    line_saved = 0
    max_lines = 1

    with open(USER_CSV_PATH) as file:
        # Iterate the file lazily instead of materialising readlines().
        for line in file:
            crawl_id = line.strip('\n')
            my_crawl = MyCrawler(crawl_id, client)
            print('------>>>| 待爬取的用户的知乎id为: ', crawl_id)

            data_a = my_crawl.crawling_answer(crawl_id)
            print('该用户爬取完毕'.center(60, '*'))
            if len(data_a) % 60 == 0:
                # Queue the result as consecutive slices of 60 items.
                for i in range(len(data_a) // 60):
                    data_out_list_a.append(data_a[60*i:60*(i+1)])
            else:
                print('无用的输出!')

            # sleep(randint(1, 3))
            line_saved += 1

            if line_saved == max_lines:
                save_to_csv_a(data_out_list_a, client)
                data_out_list_a = []
                line_saved = 0

    print('全部用户采集完毕'.center(40, '*'))
Beispiel #7
0
def main():
    """Log in to Zhihu and crawl answers for each user id in USER_TRY_CSV_PATH.

    Every line of the file is one user id. The crawled sequence is queued in
    slices of 60 items when its length is a multiple of 60; otherwise
    "Invalid Output" is printed. The loop sleeps 1-3 seconds per id and hands
    the queue to save_to_csv_a after every ``batch_limit`` ids.
    """
    client = ZhihuClient()

    try:
        client.login('*****@*****.**', 'durant')
    except NeedCaptchaException:
        print("Login Error")
        # Write the captcha image to disk and retry with the typed text.
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        code = input('please input captcha:')
        client.login('*****@*****.**', 'durant', code)

    batch_limit = 1
    ids_in_batch = 0
    pending_chunks = []

    with open(USER_TRY_CSV_PATH) as id_file:
        for raw_line in id_file.readlines():
            user_id = raw_line.strip("\n")
            crawler = MyCrawler(user_id, client)
            print(user_id)

            answers = crawler.crawling_answer(user_id)
            if len(answers) % 60 != 0:
                print("Invalid Output")
            else:
                pending_chunks.extend(
                    answers[start:start + 60]
                    for start in range(0, len(answers), 60)
                )

            time.sleep(random.randint(1, 3))

            ids_in_batch += 1
            if ids_in_batch == batch_limit:
                save_to_csv_a(pending_chunks)
                pending_chunks = []
                ids_in_batch = 0
Beispiel #8
0
class Crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname, email, key):
        """Open the SQLite database and try to log a ZhihuClient in.

        dbname is passed to sqlite3.connect; email and key are passed to
        ZhihuClient.login. When the login raises NeedCaptchaException a
        message is printed and construction continues without a retry.
        """
        connection = sqlite3.connect(dbname)
        self.con = connection
        self.cursor = connection.cursor()
        self.zhclient = ZhihuClient()
        try:
            # self.zhclient.login_in_terminal(email, key)
            self.zhclient.login(email, key)
        except NeedCaptchaException:
            # Message: "captcha required, account %s may no longer be valid".
            print("需要输入验证码,账号 %s 可能已失效" %(email))
        # if os.path.isfile(TOKEN_FILE):
        #     self.zhclient.load_token(TOKEN_FILE)
        # else:
        #     self.zhclient.login_in_terminal(email, key)
        #     self.zhclient.save_token(TOKEN_FILE)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    #建立数据表
    def createindextables(self):
        self.cursor.execute('create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)')
        self.cursor.execute('create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)')
        self.cursor.execute('create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)')
        self.cursor.execute('create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)')

        self.cursor.execute('create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)')
        self.cursor.execute('create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_users(question_id,question_title text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)')
        self.cursor.execute('create table user_users(user_id,user_follower_id)')
        self.cursor.execute('create table question_topics(question_id,topic_id,topic_name text,record_time text)')
        self.cursor.execute('create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)')

        self.cursor.execute('create index userinfoidx on userinfo(id)')
        self.cursor.execute('create index answerinfoidx on answerinfo(id)')
        self.cursor.execute('create index questioninfoidx on questioninfo(id)')
        self.cursor.execute('create index topicinfoidx on topicinfo(id)')

        self.cursor.execute('create index topic_questionsidx on topic_questions(topic_id,question_id)')
        self.cursor.execute('create index topic_usersidx on topic_users(topic_id,user_id)')
        self.cursor.execute('create index question_usersidx on question_users(question_id,user_id)')
        self.cursor.execute('create index question_answersidx on question_answers(question_id,answer_id)')
        self.cursor.execute('create index user_usersidx on user_users(user_id,user_follower_id)')
        self.cursor.execute('create index question_topicsidx on question_topics(question_id,topic_id)')
        self.cursor.execute('create index user_topicsidx on user_topics(user_id,topic_id)')

        self.dbcommit()

    # #多线程尝试
    # def crawl_data(self,work_set,table1,field1,table2,field2):
    #     if table2 == "userinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.userinfo(subid)
    #     elif table2 == "answerinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.answerinfo(subid)
    #             # time.sleep(0.8)
    #             # time.sleep(0.5)
    #     elif table2 == "questioninfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.questioninfo(subid)
    #     elif table2 == "topicinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topicinfo(subid)
    #     elif table2 == "question_answers":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_answers(subid)
    #     elif table2 == "question_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_topics(subid)
    #     elif table2 == "question_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_users(subid)
    #     elif table2 == "topic_questions":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_questions(subid)
    #     elif table2 == "topic_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_users(subid)
    #     elif table2 == "user_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_users(subid)
    #     elif table2 == "user_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_topics(subid)
    #     return None


    def justdoit(self,table1,field1,table2,field2):
        set2 =set(self.cursor.execute("select DISTINCT  {} from {}".format(field2,table2)).fetchall())
        set1 = set(self.cursor.execute("select DISTINCT {} from {}".format(field1,table1)).fetchall())
        work_set = set1-set2
        # work_set = list(set1 - set2)
        # splitlen = int(len(work_set) / 2)
        # subwork_set = [work_set[i:i + splitlen] for i in range(0, len(work_set), splitlen)]
        # threads = []
        # for i in range(0,len(subwork_set)):
        #     t = multiprocessing.Process(target=self.crawl_data,args=(subwork_set[i],table1,field1,table2,field2))
        #     threads.append(t)
        # for t in threads:
        #     t.start()
        #     t.join()
        if table2 == "userinfo":
            for subid in work_set:
                subid = subid[0]
                self.userinfo(subid)
        elif table2 == "answerinfo":
            for subid in work_set:
                subid = subid[0]
                self.answerinfo(subid)
                # time.sleep(1.0)
                time.sleep(0.1)
        elif table2 == "questioninfo":
            for subid in work_set:
                subid = subid[0]
                self.questioninfo(subid)
        elif table2 == "topicinfo":
            for subid in work_set:
                subid = subid[0]
                self.topicinfo(subid)
        elif table2 == "question_answers":
            for subid in work_set:
                subid = subid[0]
                self.question_answers(subid)
        elif table2 == "question_topics":
            for subid in work_set:
                subid = subid[0]
                self.question_topics(subid)
        elif table2 == "question_users":
            for subid in work_set:
                subid = subid[0]
                self.question_users(subid)
        elif table2 == "topic_questions":
            for subid in work_set:
                subid = subid[0]
                self.topic_questions(subid)
        elif table2 == "topic_users":
            for subid in work_set:
                subid = subid[0]
                self.topic_users(subid)
        elif table2 == "user_users":
            for subid in work_set:
                subid = subid[0]
                self.user_users(subid)
        elif table2 == "user_topics":
            for subid in work_set:
                subid = subid[0]
                self.user_topics(subid)
        return None

    #话题-(精华)问题关系
    def topic_questions(self,topic_id):
        """Store topic-question pairs for the questions behind a topic's best answers.

        Each best answer's question is inserted into ``topic_questions``
        unless the pair is already stored or was handled earlier in this call.
        Every insert is committed immediately.

        Raises:
            GetDataErrorException: reported on stdout and then re-raised.
                UnexpectedResponseException is reported and suppressed.
        """
        insert_sql = "insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)"
        try:
            topic = self.zhclient.topic(topic_id)
            stamp = self.logtime()
            handled = set()
            for best in shield(topic.best_answers,action=SHIELD_ACTION.PASS):
                rowid = self.isdupicaterel("topic_questions", "topic_id", "question_id", topic.id, best.question.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if best.question.id in handled:
                    continue
                handled.add(best.question.id)
                row = (topic.id,topic.name,best.question.id,best.question.title,stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理", best.question.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #话题-关注者关系
    def topic_users(self,topic_id,start_at = 0):
        """Store topic-follower pairs for one topic.

        Followers are iterated through ``shield`` beginning at *start_at*.
        A pair is inserted into ``topic_users`` and committed unless it is
        already stored or was handled earlier in this call.
        GetDataErrorException and UnexpectedResponseException are reported on
        stdout and suppressed.
        """
        insert_sql = "insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)"
        try:
            topic = self.zhclient.topic(topic_id)
            stamp = self.logtime()
            handled = set()
            for user in shield(topic.followers,start_at=start_at,action=SHIELD_ACTION.PASS):
                rowid = self.isdupicaterel("topic_users", "topic_id", "user_id", topic.id, user.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if user.id in handled:
                    continue
                handled.add(user.id)
                row = (topic.id,topic.name,user.id,user.name,stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理",topic.name,user.name)
                # time.sleep(0.3)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-关注者关系
    def question_users(self, question_id):
        """Store question-follower pairs for one question.

        Each follower is inserted into ``question_users`` and committed unless
        the pair is already stored or was handled earlier in this call.
        GetDataErrorException and UnexpectedResponseException are reported on
        stdout and suppressed.
        """
        insert_sql = "insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)"
        try:
            question = self.zhclient.question(question_id)
            stamp = self.logtime()
            handled = set()
            for user in shield(question.followers,action=SHIELD_ACTION.PASS):
                rowid = self.isdupicaterel("question_users", "question_id", "user_id", question.id, user.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if user.id in handled:
                    continue
                handled.add(user.id)
                row = (question.id, question.title, user.id, user.name,stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理",user.name,question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-回答关系
    def question_answers(self, question_id):
        """Store question-answer pairs for one question.

        Each answer is inserted into ``question_answers`` (with its author id)
        and committed unless the pair is already stored or was handled earlier
        in this call. GetDataErrorException, UnexpectedResponseException and
        ZhihuWarning are reported on stdout and suppressed.

        The ZhihuWarning handler used to print the UnexpectedResponseException
        message; it now names the exception that was actually caught.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            answer_set = set()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                # Skip answers already inserted during this call.
                if answer.id in answer_set:
                    continue
                answer_set.add(answer.id)
                values = (question.id, question.title, answer.id, answer.author.id,record_time)
                self.cursor.execute("insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            print("Pass the ZhihuWarning")

    #获取用户-用户关注关系,知乎有5020限制,api限制最多获取一个用户5020粉丝
    def user_users(self,user_id):
        """Store user-follower pairs for one user.

        Each follower is inserted into ``user_users`` together with the record
        date and committed, unless the pair is already stored or was handled
        earlier in this call. The insert names a ``record_time`` column, so
        the table must provide one.
        GetDataErrorException and UnexpectedResponseException are reported on
        stdout and suppressed.
        """
        insert_sql = "insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)"
        try:
            people = self.zhclient.people(user_id)
            stamp = self.logtime()
            handled = set()
            for fan in shield(people.followers,action=SHIELD_ACTION.PASS):
                rowid = self.isdupicaterel("user_users", "user_id", "user_follower_id", people.id, fan.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if fan.id in handled:
                    continue
                handled.add(fan.id)
                row = (people.id,fan.id,stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理",fan.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #获取问题-话题关系
    def question_topics(self,question_id):
        """Store question-topic pairs for one question.

        Each topic of the question is inserted into ``question_topics`` and
        committed unless the pair is already stored or was handled earlier in
        this call. UnexpectedResponseException and GetDataErrorException are
        reported on stdout and suppressed.
        """
        insert_sql = "insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)"
        try:
            question = self.zhclient.question(question_id)
            stamp = self.logtime()
            handled = set()
            for topic in shield(question.topics):
                rowid = self.isdupicaterel("question_topics", "question_id", "topic_id", question.id, topic.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in handled:
                    continue
                handled.add(topic.id)
                row = (question.id,topic.id,topic.name,stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理", topic.name,question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 获取用户-话题关系
    def user_topics(self, user_id):
        """Store user-topic pairs for the topics one user follows.

        Each followed topic is inserted into ``user_topics`` and committed
        unless the pair is already stored or was handled earlier in this call.
        UnexpectedResponseException and GetDataErrorException are reported on
        stdout and suppressed.
        """
        insert_sql = "insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)"
        try:
            people = self.zhclient.people(user_id)
            stamp = self.logtime()
            handled = set()
            for topic in shield(people.following_topics):
                rowid = self.isdupicaterel("user_topics", "user_id", "topic_id", people.id, topic.id)
                if rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in handled:
                    continue
                handled.add(topic.id)
                row = (people.id, people.name, topic.id,topic.name, stamp)
                self.cursor.execute(insert_sql, row)
                self.dbcommit()
                print("正在处理", people.name ,topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 判断数据重复
    def isdupicateid(self, table, id):
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        self.dbcommit()
        res = cur.fetchone()
        res = None if res == None else res[0]
        return res

    def isdupicaterel(self,table,field1,field2, id1,id2):
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table,field1,field2), (id1,id2))
        res = cur.fetchone()
        self.dbcommit()
        res = None if res == None else res[0]
        return res

    #个人信息
    def userinfo(self,user_id):
        try:
            status = self.isdupicateid("userinfo",user_id)
            if status==None:
                people = self.zhclient.people(user_id)
                record_time = self.logtime()
                address = "|".join([location.name for location in people.locations])
                school_name = "|".join([education.school.name for education in people.educations if "school" in education])
                job = "|".join([employment.job.name for employment in people.employments if "job" in employment])
                company = "|".join([employment.company.name for employment in people.employments if "company" in employment])
                business = people.business.name if people.business else None
                #勋章判断
                if people.badge.has_identity:
                    identity = people.badge.identity
                else:
                    identity = None
                if people.badge.is_best_answerer:
                    best_topics = "".join([topic.name for topic in people.badge.topics])
                else:
                    best_topics = None
                if people.badge.is_organization:
                    is_organization = 1
                    org_name = people.badge.org_name
                    org_home_page = people.badge.org_home_page
                    org_industry = people.badge.org_industry
                else:
                    is_organization = 0
                    org_name = None
                    org_home_page = None
                    org_industry = None
                values = (
                people.id, people.name, people.headline, people.gender, address, business, school_name, job,company,
                people.answer_count, people.question_count, people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count, people.following_question_count,
                people.following_topic_count, people.collected_count, identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time)
                self.cursor.execute(
                    "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", people.name)
            else:
                print("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass

    def answerinfo(self,answer_id):
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status == None:
                answer = self.zhclient.answer(answer_id)
                record_time = self.logtime()
                values = (answer.id,answer.content,answer.author.id,answer.voteup_count,answer.thanks_count,answer.comment_count,answer.created_time,answer.updated_time,record_time)
                self.cursor.execute("insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理",answer.id)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            self.cursor.execute("delete from question_answers where answer_id = ?",(answer_id,))##在从question_answer表中获取及时删除无效问题,方式切换帐号后反复爬去无效问题。
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #问题信息
    def questioninfo(self,question_id):
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status == None:
                question = self.zhclient.question(question_id)
                record_time = self.logtime()
                values = (question.id,question.title,question.follower_count,question.answer_count,question.created_time,question.updated_time,record_time)
                self.cursor.execute("insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理" ,question.title)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #话题信息
    def topicinfo(self,topic_id):
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status == None:
                topic = self.zhclient.topic(topic_id)
                record_time = self.logtime()
                values=(topic.id,topic.name,topic.best_answer_count,topic.follower_count,topic.question_count,record_time)
                self.cursor.execute("insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理", topic.name)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #时间戳
    def logtime(self):
        fmt = '%Y-%m-%d'  # 定义时间显示格式
        Date = time.strftime(fmt, time.localtime(time.time()))
        return Date


    def add_counts(self,filepath = "logincounts.txt"):
        counts = []
        for line in open(filepath):
            count = {}
            count["count"], count["key"] = line.split("----")
            count["key"] = count["key"].strip("\n")
            counts.append(count)
        return counts

    def get_proxy(self):
        """Ask the local proxy-pool service for one proxy.

        Returns:
            The response body text when the service answers with HTTP 200;
            None when it answers with another status or the connection fails.
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError class, which does not
            # derive from the builtin one; catch both so that a pool that is
            # down yields None instead of an exception.
            return None
        if response.status_code == 200:
            return response.text
        return None
#-*- coding:utf-8 -*-

from __future__ import print_function
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

try:
    client.login('邮箱或电话', '密码')
    # Phone-number logins need the +86 prefix in front of the number.
except NeedCaptchaException:
    # Save the captcha image, then log in with the text shown in the gif.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('邮箱或电话', '密码', captcha)

article = client.from_url('https://zhuanlan.zhihu.com/p/25671089')
# The Zhihu URL can be changed freely; its type is detected automatically.

print(article.author.name)
print(article.voteup_count)
# One commenter name per line, built in a single join.
result = ''.join(comment.author.name + '\n' for comment in article.comments)

# Raw string: the path contains backslashes that are not valid escapes.
with open(r'E:\GitHouse\zhihu-oauth\jilu4.txt', 'w') as f:
    f.write(result)
Beispiel #10
0
#coding=utf-8

from __future__ import print_function  # 使用python3的print方法
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

# raw_input only exists on Python 2; fall back to input on Python 3 so the
# captcha prompt works on both.
try:
    read_line = raw_input
except NameError:
    read_line = input

try:
    client.login('account', 'password')
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = read_line('please input captcha:')
    client.login('account', 'password', captcha)

client.save_token('token.pkl')  # save the token
# Once a token exists, later runs can load the token file instead of logging in.
# client.load_token('filename')

from zhihu_oauth import ZhihuClient
import re
import os
import urllib

client = ZhihuClient()
# Log in by loading the saved token file.
client.load_token('token.pkl')
Beispiel #11
0
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Placeholder credentials, defined once so that the captcha retry logs in with
# the same account as the first attempt (the original retried with different
# literals).
EMAIL = '#Email#'
PASSWORD = '#Password#'

client = ZhihuClient()

try:
    client.login(EMAIL, PASSWORD)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(EMAIL, PASSWORD, captcha)

client.save_token('token.pkl')
Beispiel #12
0
        str_sentiment = '正向'

    out = {}
    out['confidence'] = str(confidence)
    out['positive'] = str(positive_prob)
    out['negative'] = str(negative_prob)
    out['sentiment'] = str_sentiment

    return out



client = ZhihuClient()
# Login section
try:
    client.login(ZHIHU_ID, ZHIHU_KEY)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as captcha_file:
        captcha_image = client.get_captcha()
        captcha_file.write(captcha_image)
    captcha = input('please input captcha:')
    client.login(ZHIHU_ID, ZHIHU_KEY, captcha)


the_question = client.question(QUESTION_ID)


print(the_question.title)


Beispiel #13
0
import os
import requests
from pyquery import PyQuery as pq
from requests import RequestException
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
import time
import csv

# Current local date (YYYY-MM-DD) and hour (HH), captured at start-up.
today = time.strftime('%Y-%m-%d', time.localtime())
now = time.strftime('%H', time.localtime())
total_detail = []
client = ZhihuClient()
try:
    client.login('#', '#')
    # Phone-number logins need the +86 prefix in front of the number.
except NeedCaptchaException:
    # Save the captcha image, then log in with the text shown in the gif.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('#', '#', captcha)


def get_links(Num):
    url = 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A' + Num + '%2C"type"%3A"day"%7D'
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
Beispiel #14
0
from zhihu_oauth.exception import NeedCaptchaException
from bs4 import BeautifulSoup
import json
import random

# HTTP session object; presumably reused by requests further down in the
# script (not visible here).
s = requests.session()
s.keep_alive = True

# In[2]:

# Log in to Zhihu.
# NOTE(review): the password is hard-coded in source; consider reading it
# from the environment or a config file instead.
client = ZhihuClient()
user, pwd = '******', '961204yy'
try:
    client.login(user, pwd)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(user, pwd, captcha)
# NOTE(review): 'token.kpl' looks like a typo for 'token.pkl' — confirm
# before anything tries to load the token back.
client.save_token('token.kpl')
# TOKEN_FILE = 'token.pkl'
#
# if os.path.isfile(TOKEN_FILE):
#     client.load_token(TOKEN_FILE)
# else:
#     client.login_in_terminal()
#     client.save_token(TOKEN_FILE)
Beispiel #15
0
from openpyxl import Workbook
from openpyxl import load_workbook
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException


# Ask for the topic id to crawl and the base name of the output file.
topic_id = input('请输入话题id号,获取所有答案:')
file_name = input('请输入保存文件名,不带后缀:')
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()

# Log in only when no cached token is available, then cache the new token;
# otherwise reuse the token stored on disk.
if not os.path.isfile(TOKEN_FILE):
    try:
        client.login('email_or_phone', 'password')
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        with open('a.gif', 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login('email_or_phone', 'password', captcha)
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Resolve the topic and print its name.
topic = client.topic(int(topic_id))
print(topic.name)

#日志设置
logging.basicConfig(level=logging.ERROR,  
                format='%(asctime)s %(levelname)s %(message)s',  
                datefmt='%Y-%m-%d %H:%M:%S',
                filename='zhi.log',
# Read the text file into a list of lines with the newline characters removed.
with open(txt_file_path) as fin:
    text_readline = [line.replace('\n', '') for line in fin]
print(text_readline)

for stored_line in text_readline:
    print(stored_line)
# ================ Read account and password ================
# The first line of the file is the account, the second the password.
account, passward = text_readline[0], text_readline[1]

client = ZhihuClient()

try:
    client.login(account, passward)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, passward, captcha)

# Only valid once the client is in a logged-in state.
client.save_token('/Users/alicewish/我的坚果云/token.pkl')

# ================运行时间计时================
run_time = time.time() - start_time
if run_time < 60:  # 两位小数的秒
    print("耗时:{:.2f}秒".format(run_time))
elif run_time < 3600:  # 分秒取整
Beispiel #17
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/1/15 17:40
# @Author  : glacier
# @Site    :
# @File    : zhihu_new.py
# @Software: PyCharm Edu

from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
import pymysql

# Log in, cache the token, and fetch the current user.
# NOTE(review): the credentials are hard-coded in source; consider moving
# them to the environment or a config file.
client = ZhihuClient()

try:
    client.login('13776390465', '14715912300.mm')
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('13776390465', '14715912300.mm', captcha)

# Save the token after EITHER login path. It used to be saved only inside
# the ``try`` body, so a captcha login left no (or a stale) token.pkl for
# the load below. save_token must only be called once the client is
# logged in.
client.save_token('token.pkl')

# Kept from the original flow: reload the token that was just written.
client.load_token('token.pkl')
me = client.me()

import traceback

client = ZhihuClient()
Beispiel #18
0
#! /usr/bin/env python3
# coding: utf-8

# Placeholder credentials.
# NOTE(review): this pair is overwritten by the assignments right below, so
# it has no effect at runtime.
username = '******'
pwd = 'zhihu password'

# Credentials actually in effect for the login further down.
# NOTE(review): this looks like a real password committed to source —
# consider loading it from the environment or a config file instead.
username = '******'
pwd = '!fGP+GT5dSK*'

#-----------------------------
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

try:
    client.login(username, pwd)
except NeedCaptchaException:
    # Captcha required: write the image to disk, ask the user for its
    # text, and repeat the login with that answer.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(username, pwd, captcha)

# Save the token.
# NOTE(review): the extension is 'pk1' (digit one), possibly a typo for
# 'pkl' — confirm.
client.save_token('token.pk1')
Beispiel #19
0
### crawls the question from zhihu.com by using the module zhihu_oauth

from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Imported by name: the imports above only use ``from zhihu_oauth import ...``,
# which does not bind the bare name ``zhihu_oauth``. The previous
# ``except zhihu_oauth.exception.GetDataErrorException`` therefore raised
# NameError the first time a lookup failed.
from zhihu_oauth.exception import GetDataErrorException

client = ZhihuClient()
try:
    client.login(zhihu_account, zhihu_password)
except NeedCaptchaException:
    ### here we need to save the CAPTCHA and relogin
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(zhihu_account, zhihu_password, captcha)

### the id of zhihu.com is not always continuous, we have to traverse all the id
q = []  # titles of every question id in the range that could be read
for question_id in range(281736391, 311982210):  # avoid shadowing builtin ``id``
    question = client.question(question_id)
    try:
        q.append(question.title)
    except GetDataErrorException:
        # No readable data for this id: skip it and keep scanning.
        continue

### I totally crawled 14W+ questions from zhihu.com
Beispiel #20
0
import pandas as pd
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
from zhihu_oauth.exception import NeedCaptchaException

# Log in (retrying with a captcha when Zhihu asks for one) and cache the
# token.
try:
    client.login('*****@*****.**', 'justbemyself1998')
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('*****@*****.**', 'justbemyself1998', captcha)

# Save the token after either login path. This call used to sit inside the
# ``except`` branch, so a login that needed no captcha never wrote token.pkl.
client.save_token("token.pkl")
Beispiel #21
0
# coding=utf-8

from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient


# Path of the cached login token.
TOKEN_FILE = 'token.pkl'


client = ZhihuClient()

# Without a cached token: log in and cache the new token.
# With one: just load it.
if not os.path.isfile(TOKEN_FILE):
    client.login('*****@*****.**', 'Zhihu2Ebook')
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

Beispiel #22
0
from zhihu_oauth import ZhihuClient

from zhihu_oauth.exception import NeedCaptchaException

# Log in to Zhihu, reusing a cached token when one exists.
# This snippet is Python 2 (print statements, raw_input).
# NOTE(review): ``os`` is used below but is not among the imports visible
# in this snippet — confirm it is imported.
client = ZhihuClient()

test_email = '*****@*****.**'
test_password = '******'
token_file = './token.pkl'

# lexists() is also true for a broken symlink at this path.
if os.path.lexists(token_file):
    client.load_token(token_file)
    print 'load token success'
else:
    try:
        login_result = client.login(test_email, test_password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    # Show whatever login() returned, then cache the token for next time.
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
# Write the JSON into a web page; in Chrome, press F12 and choose Preview
# to see the JSON structure rendered by the browser.
response_file_uri = './question_response.html'
question_id = 35005800
Beispiel #23
0
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Log in with the placeholder credentials and cache the token.
credentials = ('account', 'pwd')

client = ZhihuClient()
try:
    client.login(*credentials)
except NeedCaptchaException:
    # Captcha required: save the image, read the answer from the user,
    # retry the login, and report that this path was taken.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(*credentials, captcha)
    print("Exception")
client.save_token('token.pkl')
Beispiel #24
0
#!/usr/bin/env python
# coding: utf-8
import os
from zhihu_oauth import ZhihuClient, ActType, People
from zhihu_oauth.exception import NeedCaptchaException
from zhihu_oauth.helpers import ts2str, act2str

# Path of the cached login token.
token = './XXX.pk1'
client = ZhihuClient()

try:
    # Log in when no token file exists yet; otherwise reuse the cached one.
    if not os.path.exists(token):
        client.login('username', 'passwd')
    else:
        client.load_token(token)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = raw_input('please input captcha:')
    client.login('username', 'passwd', captcha)
# Written on every run, including when the token was just loaded from disk.
client.save_token(token)


def dump_activities(pid):
    person = client.people(pid)
    filter_types = {
        ActType.COLLECT_ANSWER,
        ActType.COLLECT_ANSWER,
        ActType.COLLECT_ARTICLE,
        ActType.CREATE_ANSWER,
from requests import RequestException
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
import time
import csv
from smtplib import SMTP_SSL
from email.header import Header
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Local date (YYYY-MM-DD) and hour (HH) taken when the script starts;
# localtime() with no argument uses the current time.
today = time.strftime('%Y-%m-%d', time.localtime())
now = time.strftime('%H', time.localtime())
total_detail = []

client = ZhihuClient()
try:
    # When logging in with a phone number, prefix it with +86.
    client.login('ID', 'password')
except NeedCaptchaException:
    # Save the captcha image, then log in with the code shown in the gif.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('ID', 'password', captcha)


def get_links(Num):
    url = 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A' + Num + '%2C"type"%3A"day"%7D'
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
Beispiel #26
0
# -*- coding: utf-8 -*-
"""
Created on Sat Oct  7 18:02:01 2017

@author: roger
"""
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Log in to Zhihu and cache the token.
#
# The credentials are defined once so the captcha retry authenticates the
# same account as the first attempt; previously the retry passed the
# placeholder strings 'email_or_phone' / 'password'.
account = '*****@*****.**'
password = 'qi142857'

client = ZhihuClient()

try:
    # login() is unpacked into a (state, reason) pair and echoed.
    (state, reason) = client.login(account, password)
    print(state, reason)

except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, password, captcha)

client.save_token('./token.pkl')
Beispiel #27
0
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Log in to Zhihu and cache the token.
#
# Two fixes over the previous version:
#   * the captcha retry now uses the same credentials as the first attempt
#     (it used the placeholders 'email_or_phone' / 'password');
#   * the token is saved after either login path (it was saved only when
#     no captcha was needed).
account = '*****@*****.**'
password = '449137973zazazzh'

client = ZhihuClient()

try:
    client.login(account, password)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, password, captcha)

client.save_token('token.pkl')
Beispiel #28
0
from zhihu_oauth  import  ZhihuClient

from zhihu_oauth.exception import NeedCaptchaException

# Log in to Zhihu, reusing a cached token when one exists.
# This snippet is Python 2 (print statements, raw_input).
# NOTE(review): ``os`` is used below but is not among the imports visible
# in this snippet — confirm it is imported.
client = ZhihuClient()

test_email = '*****@*****.**'
test_password = '******'
token_file = './token.pkl'

# lexists() is also true for a broken symlink at this path.
if os.path.lexists(token_file):
    client.load_token(token_file)
    print 'load token success'
else:
    try:
        login_result = client.login(test_email, test_password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    # Show whatever login() returned, then cache the token for next time.
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
# Write the JSON into a web page; in Chrome, press F12 and choose Preview
# to see the JSON structure rendered by the browser.
response_file_uri = './question_response.html'
question_id = 35005800
Beispiel #29
0
class Crawl:
    """Fetch the logged-in user's Zhihu Lives and their messages.

    Wraps a ``ZhihuClient`` and stores what it fetches through the
    ``MyLive`` and ``LiveContent`` models.
    """

    def __init__(self):
        """Create the (not yet logged-in) Zhihu client."""
        self.client = ZhihuClient()

    def login(self, username, password):
        """Authenticate ``self.client``, reusing a per-user token file.

        If ``app/Resource/<username>.token`` exists it is loaded; otherwise
        a password login is performed (with a captcha prompt when Zhihu
        requires one) and the resulting token is written to that path.
        """
        token_path = 'app/Resource/' + username + '.token'
        if os.path.isfile(token_path):
            self.client.load_token(token_path)
            return

        try:
            self.client.login(username, password)
        except NeedCaptchaException:
            # Save the captcha image, prompt for its text, then log in again.
            with open('a.gif', 'wb') as captcha_file:
                captcha_file.write(self.client.get_captcha())
            captcha = input('please input captcha:')
            self.client.login(username, password, captcha)
        self.client.save_token(token_path)

    def get_live_list(self):
        """Return the ``lives`` collection of the current user."""
        return self.client.me().lives

    @staticmethod
    def save_live_list(livedata):
        """Store one live object as a ``MyLive`` record."""
        MyLive(
            live_id=livedata.id,
            title=livedata.title,
            speaker=livedata.speaker.name,
            speaker_description=livedata.speaker.description,
            live_description=livedata.description,
            seats_count=livedata.seat_taken,
            price=livedata.fee,
        ).save()

    def live_list_work(self):
        """Store every live of the current user that has no record yet."""
        for live in self.get_live_list():
            if MyLive.objects(live_id=live.id):
                # Already stored.
                continue
            self.save_live_list(live)

    def get_live_content(self, live_id, before_id=''):
        """GET one page of a live's messages and return the decoded JSON.

        ``live_id`` and ``before_id`` are substituted, in that order, into
        ``LIVECONTENT_URL``.
        """
        page_url = LIVECONTENT_URL.format(live_id, before_id)
        return json.loads(self.client._session.get(page_url).content)

    def save_live_content_image(self, id, url):
        """Download ``url`` and write the bytes to ``app/Resource/<id>.png``."""
        image_bytes = self.client._session.get(url).content
        target = 'app/Resource/' + str(id) + '.png'
        with open(target, 'wb') as out:
            out.write(image_bytes)

    @staticmethod
    def save_live_content(live_id, livedata):
        """Store each message in ``livedata['data']`` that is not stored yet.

        ``live_id`` is written to the record's ``live_title`` field.
        """
        for message in livedata['data']:
            if LiveContent.objects(message_id=message['id']):
                # This message already has a record.
                continue

            # Only audio and image messages carry a URL.
            kind = message['type']
            if kind == 'audio':
                url = message['audio']['url']
            elif kind == 'image':
                url = message['image']['full']['url']
            else:
                url = ''

            content = message.get('text', '')
            if 'replies' in message:
                reply = ','.join(message['replies'])
            else:
                reply = ''

            LiveContent(
                message_id=int(message['id']),
                sender=message['sender']['member']['name'],
                type=message['type'],
                content=content,
                url=url,
                reply=reply,
                likes=message['likes']['count'],
                created_at=datetime.fromtimestamp(message['created_at']),
                live_title=live_id,
            ).save()

    def live_content_work(self, id):
        """Fetch and store all messages of one stored live, then its images.

        ``id`` is the id of the ``MyLive`` record, not Zhihu's live id.
        """
        live = MyLive.objects(id=id).first()
        # Fetching uses Zhihu's own live id.
        data = self.get_live_content(live.live_id)
        # NOTE(review): the response that ends this loop (unload_count not
        # above zero) is never passed to save_live_content — confirm that
        # is intended.
        while data['unload_count'] > 0:
            # Storing uses the id of the MyLive record (the mongo id).
            self.save_live_content(live.id, data)
            data = self.get_live_content(live.live_id, data['data'][0]['id'])
        print('success')

        # Download the file behind every stored image message of this live.
        for item in LiveContent.objects(live_title=live.id, type='image'):
            self.save_live_content_image(item.id, item.url)
Beispiel #30
0
wait = WebDriverWait(browser, 10)

# Explore landing page followed by the "day" answer-list endpoint at
# offsets 5, 10, ..., 45.
_answer_list_url = 'https://www.zhihu.com/node/ExploreAnswerListV2?params=%7B"offset"%3A{}%2C"type"%3A"day"%7D'
seachAddress = ['https://www.zhihu.com/explore'] + [
    _answer_list_url.format(offset) for offset in range(5, 50, 5)
]

client = ZhihuClient()
try:
    # When logging in with a phone number, prefix it with +86.
    client.login('', '')
except NeedCaptchaException:
    # Save the captcha image, then log in with the code shown in the gif.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('', '', captcha)


def get_links(url):
    try:
        browser.get(url)
        wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div:nth-child(1) > h2 > a')))
Beispiel #31
0
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Set these to your email / phone number and the matching password.
email = ""
password = ""

client = ZhihuClient()

try:
    client.login(email, password)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(email, password, captcha)

# Save the token after either login path. This call used to be indented
# under the ``except`` branch, so a login that needed no captcha never
# wrote token.pkl.
client.save_token('token.pkl')
Beispiel #32
0
import time
import random
# Account credentials and the id of the question to crawl.
account, passwd = 'account', 'passwd'
questionID = 264747923

# Set up the database: one collection per question id inside the 'Zhihu'
# database of the local MongoDB server.
dbClient = pymongo.MongoClient(host='localhost', port=27017)
Zhihu = dbClient['Zhihu']
ZhihuData = Zhihu[str(questionID)]
# Clear out whatever an earlier run left in the collection.
if ZhihuData.find():
    ZhihuData.remove({})

# Log in to the Zhihu account.
client = ZhihuClient()
try:
    client.login(account, passwd)
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, passwd, captcha)

# Create the question object.
question = client.question(questionID)
# Read every answer under the question and store it.
print(question.title)
count = 0
for answer in question.answers:
    count+=1
    try:
        data = {
Beispiel #33
0
# coding=utf-8

from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient

# Path of the cached login token.
TOKEN_FILE = 'token.pkl'

client = ZhihuClient()

# Reuse the token from disk when it is there; otherwise do a password
# login and write the new token to disk.
token_cached = os.path.isfile(TOKEN_FILE)
if token_cached:
    client.load_token(TOKEN_FILE)
else:
    client.login('*****@*****.**', 'Zhihu2Ebook')
    client.save_token(TOKEN_FILE)
Beispiel #34
0
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

try:
    client.login('*****@*****.**', '110119rick')
except NeedCaptchaException:
    # Captcha required: write the image to disk so the user can read it,
    # then repeat the login with the typed-in answer.
    with open('a.gif', 'wb') as captcha_file:
        captcha_file.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login('*****@*****.**', '110119rick', captcha)