def parse_author(command):
    """Build the task describing how to fetch and export one Zhihu author.

    :param command: raw command text; it is handed to ``Match.author`` and the
        ``author_id`` group of the resulting match is used as the profile slug.
    :return: a ``SingleTask`` whose spider URL and book SQL queries target
        that author.

    Exits the process with a non-zero status when the saved login token
    cannot be loaded (missing file or expired login).
    """
    # NOTE(review): assumes Match.author() returned a match object; a
    # non-matching command would fail on .group() -- confirm callers pre-filter.
    result = Match.author(command)
    author_id = result.group('author_id')

    task = SingleTask()
    task.kind = 'author'
    task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
    task.book.kind = 'author'

    client = ZhihuClient()
    try:
        client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
    except IOError:
        print(u"没有找到登录信息文件,请先登录")
        # A failed login is an error: report it through the exit status.
        sys.exit(1)
    except NeedLoginException:
        print(u"登录信息过期,请重新登录")
        sys.exit(1)

    people_oauth = client.people(author_id)
    # Touch one attribute before reading .id (workaround for zhihu-oauth
    # issue #4, per the original comment).
    _ = people_oauth.follower_count
    author_id_hash = people_oauth.id

    task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
    # Adjacent literals instead of a backslash continuation inside the string,
    # so source indentation no longer leaks into the query text.
    task.book.sql.question = (
        'select * from Question where question_id in (select question_id from '
        'Answer where author_id = "{}")'
    ).format(author_id_hash)
    task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
    return task
# client.load_token('filename')
import os
import re
import urllib

from zhihu_oauth import ZhihuClient

client = ZhihuClient()
# Log in by loading the saved token file.
client.load_token('token.pkl')

# Example: if the target user's home page is
# https://www.zhihu.com/people/leng-zhe/activities, the id is "leng-zhe".
# NOTE(review): `id` shadows the builtin; the name is kept because code
# outside this excerpt may still refer to it.
id = str('leng-zhe')
people = client.people(id)

index = 1  # image sequence number
# Create the per-user output directory only when it is missing, so that
# re-running the script does not crash on an existing directory.
if not os.path.isdir(id):
    os.mkdir(id)
path = id

for answer in people.answers:
    content = answer.content  # body of the answer
except NeedCaptchaException: # 保存验证码并提示输入,重新登录 print u'登录失败,需要输入验证码' with open('a.gif', 'wb') as f: f.write(client.get_captcha()) captcha = raw_input(u'please input captcha:') login_result = client.login(test_email, test_password, captcha) print 'login result => ' print login_result client.save_token(token_file) print 'save token success' # question response_file_uri = './question_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构 question_id = 35005800 question = client.question(question_id) data = question.pure_data response_json = json.dumps(data) response_file = open(response_file_uri, 'w+') response_file.write(response_json) print u"数据保存完成" response_file_uri = './people_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构 people_id = '404-Page-Not-found' people = client.people(people_id) for i in people.answers: data = i.pure_data response_json = json.dumps(data) response_file = open(response_file_uri, 'w+') response_file.write(response_json) print u"数据保存完成"
from lxml import html
import requests
import time
import zhihu_oauth

start_time = time.time()  # timestamp taken at script start

# ======================== login ========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ============ user module ============
pid = "edna-krabappel"
people = client.people(pid)

# (label, attribute) pairs, printed in order; each attribute is read lazily
# right before its own print call.
_PROFILE_FIELDS = (
    ('活动', 'activities'),
    ('答案数', 'answer_count'),
    ('答案', 'answers'),
    ('文章', 'articles'),
    ('文章数', 'articles_count'),
    ('头像地址', 'avatar_url'),
    ('用户所在行业', 'business'),
    ('收藏数', 'collected_count'),
    ('收藏夹数', 'collection_count'),
    ('收藏夹', 'collections'),
    ('专栏数', 'column_count'),
    ('专栏', 'columns'),
    ('专栏数', 'columns_count'),
)
for _label, _attr in _PROFILE_FIELDS:
    print(_label, getattr(people, _attr))

# The creation time is formatted as a local "YYYY-mm-dd HH:MM:SS" string.
_created_local = time.localtime(people.created_at)
print('创建时间', time.strftime("%Y-%m-%d %H:%M:%S", _created_local))

for _label, _attr in (('个人描述', 'description'), ('草稿数', 'draft_count')):
    print(_label, getattr(people, _attr))
# NOTE(review): the next two statements look like the body of a loop whose
# header lies outside this excerpt; their nesting could not be recovered here.
line = line.strip().split('\t')
user_id.append(line[0])

# Read the display names: first tab-separated column of every line.
user_name = []
with open("C:\\Users\\80693\\Desktop\\User_name.txt") as file:
    lines = file.readlines()
for line in lines:
    line = line.strip().split('\t')
    user_name.append(line[0])

# link[i][j] == 1 means user i follows user j (both indices into user_id).
num = len(user_id)
link = np.zeros([num, num])

# Map each id to its first position once, instead of scanning the list with
# `in` and `.index()` for every following (first position matches .index()).
id_position = {}
for position, uid in enumerate(user_id):
    id_position.setdefault(uid, position)

for i, peo in enumerate(user_id):
    p = client.people(pid=peo)
    for follow in p.followings:
        j = id_position.get(follow.id)
        if j is not None:
            link[i][j] = 1

data_df = pd.DataFrame(link)
data_df.columns = user_name
data_df.index = user_name

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0, and the sheet name is passed by keyword.
with pd.ExcelWriter('C:\\Users\\80693\\Desktop\\Link_mat.xlsx') as writer:
    data_df.to_excel(writer, sheet_name='page_1', float_format='%.5f')
data_df.to_csv('C:\\Users\\80693\\Desktop\\link.csv', index=False, header=False)
import time
from datetime import datetime

from zhihu_oauth import ZhihuClient
from zhihu_oauth.zhcls.activity import ActType

client = ZhihuClient()
client.load_token('token.pkl')

# replace it as the user input
user = client.people('edward-fu-91')
print('name', user.name)
print('headline', user.headline)
print('description', user.description)

# For every "created an answer" activity, print the current wall-clock time
# followed by the creation time of the answer.
for act in user.activities:
    if act.type == ActType.CREATE_ANSWER:
        # Named `now` rather than `time`, so the imported `time` module is
        # not overwritten by a datetime object.
        now = datetime.now()
        print(now.ctime())
        print(act.target.created_time)
from __future__ import print_function from zhihu_oauth import ZhihuClient import sys import time if __name__ == '__main__': client = ZhihuClient() client.load_token('token.pkl') if len(sys.argv) != 3: print("python people <url|pid> <article|answer>") sys.exit(-1) pid = sys.argv[1] if pid[:4] == "http": pid = pid.split("/")[4] people = client.people(pid) task = sys.argv[2] print('start to process %s' % people.name) if task == 'article': for article in people.articles: article.save("%s_%s" % (people.name, 'article'), "%s_%s" % (article.title, article._id)) print("article %s has been saved." % article.title) time.sleep(1) if task == 'answer': for answer in people.answers: answer.save( "%s_%s" % (people.name, 'answer'), "%s_%s_%s" % (answer.id, answer.question.title, answer.voteup_count)) print("answer %s has been saved." % answer.question.title) time.sleep(1)
from zhihu_oauth import ZhihuClient

TOKEN_FILE = 'token.pkl'

# Login: reuse the saved token when present, otherwise log in interactively
# and save the token.
# NOTE(review): `os` is used below but not imported in this excerpt;
# presumably it is imported elsewhere in the file -- confirm.
client = ZhihuClient()
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Exercise the people class (alternative test account: 'suji-yan').
people = client.people('zhang-jia-wei')

# Live-related data (tested 2017-10-24).
for _live_attr in ('live_count', 'hosted_live_count', 'participated_live_count'):
    print(getattr(people, _live_attr))

# `lives` holds both the hosted and the participated lives.
print(people.lives)
for live in people.lives:
    print(live.__dict__)

# The id is a str, e.g. f9de84865e3e8455a09af78bfe4d1da5.
print('id', people)
print('name', people.name)
# lives ----------------------------------------------------------------------
# Fetch the JSON of every live and insert it into the `live` table.
zhihudb = Mysql('zhihu1030')
try:
    for i in lives:
        r = requests.get(get_liveurl_usingid(i), headers=headers)  # URL of this live
        l_json = r.json()
        l = vars(zhihuLive(l_json))
        zhihudb.insert(l, 'live')
        # Random 1-2 second pause between requests.
        time.sleep(np.random.randint(1, 3))
finally:
    # Close the connection even when a request or an insert fails.
    zhihudb.close()

# speakers -------------------------------------------------------------------
# Insert every distinct speaker referenced by the `live` table.
zhihudb = Mysql('zhihu1030')
try:
    sql = " SELECT speaker_id FROM live"
    zhihudb.cursor.execute(sql)
    speakers = {column[0] for column in zhihudb.cursor.fetchall()}
    for s in speakers:
        people = Speaker(client.people(s))
        test = vars(people)
        zhihudb.insert(test, 'speaker')
        time.sleep(np.random.randint(1, 3))
finally:
    zhihudb.close()