Beispiel #1
0
        def parse_author(command):
            """Build the SingleTask describing an "author" job for ``command``.

            The author id is read from the ``author_id`` group of
            ``Match.author(command)``. The saved login token is loaded so the
            author can be looked up through the OAuth client; the id returned
            by that lookup is embedded in the SQL strings stored on
            ``task.book.sql``.

            Terminates the process with ``sys.exit()`` when loading the token
            raises IOError or NeedLoginException.
            """
            author_id = Match.author(command).group('author_id')

            task = SingleTask()
            task.kind = 'author'
            task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
            task.book.kind = 'author'

            client = ZhihuClient()
            try:
                client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
            except IOError:
                # Token file could not be read: tell the user to log in first.
                print(u"没有找到登录信息文件,请先登录")
                sys.exit()
            except NeedLoginException:
                # Stored login is no longer valid: tell the user to log in again.
                print(u"登录信息过期,请重新登录")
                sys.exit()

            author = client.people(author_id)
            # Attribute read whose value is discarded; kept because of the
            # original note: zhihu-oauth, issues #4
            _ = author.follower_count
            author_hash = author.id

            # The embedded run of spaces in the second literal is part of the
            # original query text and is preserved on purpose.
            question_query = (
                'select * from Question where question_id in (select question_id from '
                '            Answer where author_id = "{}")'
            )
            task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_hash)
            task.book.sql.question = question_query.format(author_hash)
            task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_hash)
            return task
Beispiel #2
0
# client.load_token('filename')

from zhihu_oauth import ZhihuClient
import re
import os
import urllib

client = ZhihuClient()
# Log in by loading a previously saved token file.
client.load_token('token.pkl')

# Example: if the target user's home page is
# https://www.zhihu.com/people/leng-zhe/activities, the id is "leng-zhe".
# NOTE: `id` shadows the builtin; the name is kept because code further down
# the script may still reference it.
id = str('leng-zhe')

people = client.people(id)

#question = client.people.question(id)
index = 1  # running image number

# Output directory named after the user id. Only create it when missing so
# that re-running the script does not crash on an existing directory.
if not os.path.isdir(id):
    os.mkdir(id)
path = id

# Walk over every answer of the selected user.
# NOTE(review): the loop body appears to be cut off in this snippet; only the
# content fetch is visible here.
for answer in people.answers:
    #print('answer.question.title = ',answer.question.title)

    #path = 'people'

    content = answer.content  # body of the answer
    #print('answer.content = ',answer.content)
Beispiel #3
0
    except NeedCaptchaException:
        # 保存验证码并提示输入,重新登录
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
# The JSON is written into an .html file: in Chrome, press F12 and choose
# "Preview" to see the JSON structure rendered by the browser.
response_file_uri = './question_response.html'
question_id = 35005800
question = client.question(question_id)
data = question.pure_data
response_json = json.dumps(data)
# Use a context manager so the file is flushed and closed even on error.
with open(response_file_uri, 'w+') as response_file:
    response_file.write(response_json)
print(u"数据保存完成")

# The JSON is written into an .html file: in Chrome, press F12 and choose
# "Preview" to see the JSON structure rendered by the browser.
response_file_uri = './people_response.html'
people_id = '404-Page-Not-found'
people = client.people(people_id)
# Collect the raw data of every answer. The previous loop overwrote the
# serialized value on each iteration, so only the last answer was written,
# and stale data was written when the user had no answers at all.
data = [answer.pure_data for answer in people.answers]
response_json = json.dumps(data)
# Use a context manager so the file is flushed and closed even on error.
with open(response_file_uri, 'w+') as response_file:
    response_file.write(response_json)
print(u"数据保存完成")
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # initial timestamp

# ======================== login ========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ============ user module ============
pid = "edna-krabappel"
people = client.people(pid)

# (label, attribute name) pairs, printed in this exact order.
for label, attr in (
    ('活动', 'activities'),
    ('答案数', 'answer_count'),
    ('答案', 'answers'),
    ('文章', 'articles'),
    ('文章数', 'articles_count'),
    ('头像地址', 'avatar_url'),
    ('用户所在行业', 'business'),
    ('收藏数', 'collected_count'),
    ('收藏夹数', 'collection_count'),
    ('收藏夹', 'collections'),
    ('专栏数', 'column_count'),
    ('专栏', 'columns'),
    ('专栏数', 'columns_count'),
):
    print(label, getattr(people, attr))

# The creation time is passed through time.localtime and formatted as text.
created = time.localtime(people.created_at)
print('创建时间', time.strftime("%Y-%m-%d %H:%M:%S", created))

for label, attr in (
    ('个人描述', 'description'),
    ('草稿数', 'draft_count'),
):
    print(label, getattr(people, attr))
Beispiel #5
0
    line = line.strip().split('\t')
    user_id.append(line[0])


# Read the user-name file and keep the first tab-separated field of every
# line. The context manager closes the file even if reading fails, and avoids
# binding a module-level name that shadows the `file` builtin.
with open("C:\\Users\\80693\\Desktop\\User_name.txt") as name_file:
    user_name = [row.strip().split('\t')[0] for row in name_file]

# Build a num x num matrix where link[i][j] == 1 means that user i follows
# user j (both indices refer to positions in `user_id`).
num = len(user_id)
link = np.zeros([num, num])

# Position of the first occurrence of each id, matching list.index semantics
# while avoiding a linear scan of `user_id` for every followed account.
index_of = {}
for idx, uid in enumerate(user_id):
    index_of.setdefault(uid, idx)

for i, peo in enumerate(user_id):
    p = client.people(pid=peo)
    for follow in p.followings:
        col = index_of.get(follow.id)
        if col is not None:
            link[i][col] = 1


# Wrap the matrix in a DataFrame labelled with the user names on both axes.
data_df = pd.DataFrame(link)
data_df.columns = user_name
data_df.index = user_name

# Write the workbook through a context manager: ExcelWriter.save() was removed
# in pandas 2.0, and leaving the `with` block saves and closes the file.
# The sheet name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter('C:\\Users\\80693\\Desktop\\Link_mat.xlsx') as writer:
    data_df.to_excel(writer, sheet_name='page_1', float_format='%.5f')

# Also export the bare matrix (no row or column labels) as CSV.
data_df.to_csv('C:\\Users\\80693\\Desktop\\link.csv',
               index=False, header=False)
Beispiel #6
0
import time
from datetime import datetime
from zhihu_oauth import ZhihuClient
from zhihu_oauth.zhcls.activity import ActType

client = ZhihuClient()
client.load_token('token.pkl')
# replace it as the user input
user = client.people('edward-fu-91')

print('name', user.name)
print('headline', user.headline)
print('description', user.description)

# For every "created an answer" activity, print the current local time
# followed by the creation time of the answer.
for act in user.activities:
    if act.type == ActType.CREATE_ANSWER:
        # Use a dedicated name: binding the result to `time` would replace the
        # imported `time` module for the rest of the script.
        now = datetime.now()
        print(now.ctime())
        print(act.target.created_time)
Beispiel #7
0
from __future__ import print_function
from zhihu_oauth import ZhihuClient
import sys
import time

if __name__ == '__main__':
    # Usage: python people <url|pid> <article|answer>
    # Saves every article or every answer of the given user, pausing one
    # second between items.
    usage = "python people <url|pid> <article|answer>"

    # Validate the command line before touching the token file or the
    # network, and reject unknown tasks instead of silently doing nothing.
    if len(sys.argv) != 3 or sys.argv[2] not in ('article', 'answer'):
        print(usage)
        sys.exit(-1)

    pid = sys.argv[1]
    if pid[:4] == "http":
        # For a profile URL the id is the fifth "/"-separated component,
        # e.g. https://www.zhihu.com/people/<pid>/...
        parts = pid.split("/")
        if len(parts) < 5 or not parts[4]:
            print(usage)
            sys.exit(-1)
        pid = parts[4]

    client = ZhihuClient()
    client.load_token('token.pkl')
    people = client.people(pid)

    task = sys.argv[2]
    print('start to process %s' % people.name)
    if task == 'article':
        for article in people.articles:
            article.save("%s_%s" % (people.name, 'article'),
                         "%s_%s" % (article.title, article._id))
            print("article %s has been saved." % article.title)
            time.sleep(1)
    elif task == 'answer':
        for answer in people.answers:
            answer.save(
                "%s_%s" % (people.name, 'answer'), "%s_%s_%s" %
                (answer.id, answer.question.title, answer.voteup_count))
            print("answer %s has been saved." % answer.question.title)
            time.sleep(1)
Beispiel #8
0
from zhihu_oauth import ZhihuClient

TOKEN_FILE = 'token.pkl'

# login
client = ZhihuClient()

# Reuse a saved token when one exists; otherwise log in interactively and
# save the token for the next run.
# NOTE(review): `os` is not imported in the visible part of this snippet;
# confirm it is imported elsewhere in the file.
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# test people class
# people = client.people('suji-yan')
people = client.people('zhang-jia-wei')

# 2017-10-24: test the live-related data
print(people.live_count)
print(people.hosted_live_count)
print(people.participated_live_count)
print(people.lives)
# Per the original note, this contains both hosted and participated lives.
for live in people.lives:
    print(live.__dict__)

# id is a str, e.g. f9de84865e3e8455a09af78bfe4d1da5
# Print the id attribute itself; the label previously sat next to the whole
# people object.
print('id', people.id)
print('name', people.name)
# print('gender', people.gender, type(people.gender))
# print('headline', people.headline)
# print('headline', people.headline)
# Fetch every live by id and store it in the 'live' table.
zhihudb = Mysql('zhihu1030')
try:
    for i in lives:
        # Request the URL built for this live id and decode the JSON reply.
        r = requests.get(get_liveurl_usingid(i), headers=headers)
        live_json = r.json()
        # Wrap the JSON in a zhihuLive object and insert its attribute dict.
        live_row = vars(zhihuLive(live_json))
        zhihudb.insert(live_row, 'live')
        # Random pause between requests (np.random.randint(1, 3) seconds).
        time.sleep(np.random.randint(1,3))
finally:
    # Always release the database handle, even if a request or insert fails.
    zhihudb.close()

#speakers----------------------------------------------------------------------      
#insert all speakers

# Look up every distinct speaker referenced by the 'live' table and store it
# in the 'speaker' table.
zhihudb = Mysql('zhihu1030')
try:
    sql = " SELECT speaker_id FROM live"
    zhihudb.cursor.execute(sql)
    # De-duplicate the ids so each speaker is fetched only once.
    speakers = {column[0] for column in zhihudb.cursor.fetchall()}
    #s = 'a3a124c97bc93540a3c956c24bc3b465'
    for s in speakers:
        people = Speaker(client.people(s))
        test = vars(people)
        zhihudb.insert(test, 'speaker')
        # Random pause between requests (np.random.randint(1, 3) seconds).
        time.sleep(np.random.randint(1,3))
finally:
    # Always release the database handle, even if a lookup or insert fails.
    zhihudb.close()