Beispiel #1
0
def crawling(id):
    """Download every image embedded in the answers of one Zhihu question.

    Images are stored in a directory named after the question title (created
    in the current working directory when missing) as
    ``<author>_<running index>.<ext>``. A file that already exists is not
    downloaded again, but it still consumes an index.

    :param id: id of the question to crawl.
    """
    client = ZhihuClient()
    client.load_token('token.pkl')  # log in with a previously saved token
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)

    path = question.title
    if not os.path.exists(path):
        os.mkdir(path)

    # Compiled once instead of once per answer. Group 1 is the image url,
    # group 2 its extension.
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')

    index = 1  # running image number across all answers
    for i, answer in enumerate(question.answers):
        author = answer.author.name
        for img_url, ext in img_pattern.findall(answer.content):
            # Keep the real extension: png images used to be saved as '.jpg'.
            image_name = author + '_' + str(index) + '.' + ext
            target = os.path.join(path, image_name)
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / question.answer_count * 100))
            index += 1
        # i is zero-based; report the number of answers actually finished.
        done = i + 1
        print('第%d个答案爬取完成,当前总进度%.2f%%' % (done, done / question.answer_count * 100))
Beispiel #2
0
def download(cid):
    """Save every article of a Zhihu column locally, together with its images.

    Articles are written to ``<column title>/<article title>`` and images to
    ``<column title>/images``. Image host prefixes inside the article content
    are rewritten to ``./images/`` so the saved copy points at local files.

    :param cid: id of the column to download.
    """
    client = ZhihuClient()
    # NOTE(review): 'token.pk1' (digit one) looks like a typo for 'token.pkl';
    # left unchanged because the file on disk may really carry this name.
    client.load_token('token.pk1')  # log in
    column = client.column(cid)
    images_dir = os.path.join(column.title, 'images')
    # Dots are escaped so that only the literal picN.zhimg.com hosts match.
    regex = re.compile(r"https://pic\d\.zhimg\.com/", re.IGNORECASE)

    # images_dir lives inside column.title, so one makedirs creates both.
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    for index, article in enumerate(column.articles):
        # Normalise the article into a dict with 'title' and 'content'.
        article_f = dealArticle(article)

        # Fetch the images while the content still holds the remote urls.
        downloadImg(article_f['content'], images_dir)

        # Point image urls at the local copies.
        article_f['content'] = regex.sub('./images/', article_f['content'])

        with open(os.path.join(column.title, article_f['title']), 'w+') as f:
            f.write(article_f['content'])

        print('[%s] download %s success!' % (str(index), article.title))
Beispiel #3
0
        def parse_author(command):
            """Build a SingleTask describing an 'author' job from a command.

            The author id is taken from the ``author_id`` group of
            ``Match.author(command)``. The id stored in the SQL statements is
            ``client.people(author_id).id``, looked up through zhihu_oauth.

            Terminates the process with ``sys.exit()`` when the token file
            cannot be read (IOError) or NeedLoginException is raised.

            :param command: text passed to ``Match.author``.
            :return: the populated SingleTask.
            """
            result = Match.author(command)
            author_id = result.group('author_id')
            task = SingleTask()
            task.kind = 'author'
            task.spider.href = 'https://www.zhihu.com/people/{}'.format(author_id)
            task.book.kind = 'author'

            client = ZhihuClient()
            try:
                client.load_token(Path.pwd_path + str(u'/ZHIHUTOKEN.pkl'))
            except IOError:
                # Message: "login info file not found, please log in first"
                print u"没有找到登录信息文件,请先登录"
                sys.exit()
            except NeedLoginException:
                # Message: "login info expired, please log in again"
                print u"登录信息过期,请重新登录"
                sys.exit()
            people_oauth = client.people(author_id)
            # The value is discarded; presumably the read forces the profile
            # to load before .id is used (zhihu-oauth issue #4) - confirm.
            _ = people_oauth.follower_count
            author_id_hash = people_oauth.id
            # NOTE(review): the statements below are built with str.format;
            # the interpolated value comes from the API, not from the command.
            task.book.sql.info = 'select * from AuthorInfo where author_id = "{}"'.format(author_id_hash)
            task.book.sql.question = 'select * from Question where question_id in (select question_id from \
            Answer where author_id = "{}")'.format(author_id_hash)
            task.book.sql.answer = 'select * from Answer where author_id = "{}"'.format(author_id_hash)
            return task
Beispiel #4
0
def getLatestBestAnserwerAndSave():
    """Save up to 20 best answers of topic 19550867 as articles.

    Logs in from 'token.pkl' when that file exists; otherwise performs an
    interactive terminal login and writes the token file. Each best answer is
    converted with ``ansItem2artical(ansItem(answer))`` and saved.
    """
    token_path = 'token.pkl'
    limit = 20

    client = ZhihuClient()
    if not os.path.isfile(token_path):
        client.login_in_terminal()
        client.save_token(token_path)
    else:
        client.load_token(token_path)

    topic = client.topic(19550867)
    best_answers = topic.best_answers
    for saved, answer in enumerate(best_answers, 1):
        ansItem2artical(ansItem(answer)).save()
        # Stop right after the limit-th save so no further answer is fetched.
        if saved == limit:
            break
Beispiel #5
0
def LoginZhihuClient(token_name):
    """Log in to Zhihu and return the ``me`` object of the current user.

    The token is loaded from 'liuximing.pkl' when that file exists; otherwise
    an interactive terminal login is performed and the token is saved there.

    NOTE(review): ``token_name`` is accepted but never read - the token file
    name is fixed. Confirm whether callers expect it to select the file.

    :param token_name: currently unused.
    :return: the value of ``client.me()``.
    """
    token_path = 'liuximing.pkl'
    client = ZhihuClient()
    if not os.path.isfile(token_path):
        client.login_in_terminal()
        client.save_token(token_path)
    else:
        client.load_token(token_path)
    return client.me()
Beispiel #6
0
    def login(self):
        """Return a ZhihuClient authenticated via 'token.pkl'.

        An existing token file is loaded; when it is missing the user logs in
        interactively in the terminal and the fresh token is written to it.
        """
        token_path = 'token.pkl'
        zhihu = ZhihuClient()

        if not os.path.isfile(token_path):
            zhihu.login_in_terminal()
            zhihu.save_token(token_path)
            return zhihu

        zhihu.load_token(token_path)
        return zhihu
Beispiel #7
0
 def get_client(self, reset_=0):
     """Return a ZhihuClient authenticated through TOKEN_FILE.

     :param reset_: any value other than 0 forces a fresh terminal login
         (and token save) before the normal load-or-login step runs.
     :return: the client.
     """
     zhihu = ZhihuClient()
     force_relogin = reset_ != 0
     if force_relogin:
         zhihu.login_in_terminal()
         zhihu.save_token(TOKEN_FILE)
     # The regular step still runs after a forced login, as before.
     if not os.path.isfile(TOKEN_FILE):
         zhihu.login_in_terminal()
         zhihu.save_token(TOKEN_FILE)
         return zhihu
     zhihu.load_token(TOKEN_FILE)
     return zhihu
Beispiel #8
0
def zhihu_login(account='*****@*****.**', password='a4906639'):
    """Return a logged-in ZhihuClient and print the account's display name.

    The token is loaded from TOKEN_FILE when it exists; otherwise the client
    logs in with ``account``/``password`` and the token is saved there.

    NOTE(review): the defaults reproduce the credentials that used to be
    hard-coded in the body so existing no-argument calls keep working; they
    should be supplied by the caller and removed from the source.

    :param account: e-mail or phone number used for a fresh login.
    :param password: password used for a fresh login.
    :return: the client.
    """
    client = ZhihuClient()

    if os.path.isfile(TOKEN_FILE):
        client.load_token(TOKEN_FILE)
    else:
        client.login(account, password)
        client.save_token(TOKEN_FILE)
    me = client.me()
    print(me.name)
    return client
Beispiel #9
0
def zhihu_login(account='*****@*****.**', password='a4906639'):
    """Return a logged-in ZhihuClient and print the account's display name.

    The token is loaded from TOKEN_FILE when it exists; otherwise the client
    logs in with ``account``/``password`` and the token is saved there.

    NOTE(review): the defaults reproduce the credentials that used to be
    hard-coded in the body so existing no-argument calls keep working; they
    should be supplied by the caller and removed from the source.

    :param account: e-mail or phone number used for a fresh login.
    :param password: password used for a fresh login.
    :return: the client.
    """
    client = ZhihuClient()

    if os.path.isfile(TOKEN_FILE):
        client.load_token(TOKEN_FILE)
    else:
        client.login(account, password)
        client.save_token(TOKEN_FILE)
    me = client.me()
    print(me.name)
    return client
Beispiel #10
0
class Login():
    """Holds a ZhihuClient plus the token file used to authenticate it.

    The token file name carries the major Python version as a suffix
    ('token.pkl.2', 'token.pkl.3', ...).
    """

    def __init__(self):
        major = sys.version_info[0]
        self.TOKEN_FILE = 'token.pkl.' + str(major)
        self.client = ZhihuClient()

    def client_login(self):
        """Authenticate ``self.client`` and return it.

        Loads the token file when present; otherwise logs in interactively
        in the terminal and saves the token.
        """
        token_path = self.TOKEN_FILE
        if os.path.isfile(token_path):
            self.client.load_token(token_path)
            return self.client
        self.client.login_in_terminal()
        self.client.save_token(token_path)
        return self.client
Beispiel #11
0
def zhihu_login():
    r"""
    Log in to Zhihu.

    Uses the token stored in TOKEN_FILE_NAME when that file exists; otherwise
    logs in interactively in the terminal and writes the token to that file.

    :return:        the logged-in client
    """
    zhihu = ZhihuClient()
    if not os.path.isfile(TOKEN_FILE_NAME):
        zhihu.login_in_terminal()
        zhihu.save_token(TOKEN_FILE_NAME)
        return zhihu
    zhihu.load_token(TOKEN_FILE_NAME)
    return zhihu
Beispiel #12
0
def login():
    """Return a ZhihuClient authenticated from a fixed, absolute token file.

    No interactive or password login is attempted here: the token file at
    the hard-coded path is loaded unconditionally.
    """
    token_path = '/Users/huangyukun/scripts/token.pkl'
    zhihu = ZhihuClient()
    zhihu.load_token(token_path)
    return zhihu
Beispiel #13
0
def login(account, password):
    """Return a ZhihuClient, authenticated on a best-effort basis.

    The token in TOKEN_FILE is used when the file exists. Otherwise the
    client logs in with ``account``/``password``; if a captcha is required
    it is written to './captcha/a.gif' and the user is prompted for it.
    After a password login the token is saved to TOKEN_FILE so the next call
    can load it. Previously it was saved only on the captcha path, and to a
    different file than the one being loaded.

    As before, the client is returned even when authentication fails, but
    the failure is now logged instead of being silently discarded by a
    ``return`` inside ``finally`` (which also swallowed KeyboardInterrupt).

    :param account: e-mail or phone number.
    :param password: account password.
    :return: the client, logged in or not.
    """
    import logging  # local import: only needed to report failures

    client = ZhihuClient()
    try:
        try:
            client.load_token(TOKEN_FILE)
        except FileNotFoundError:
            try:
                client.login(account, password)
            except NeedCaptchaException:
                # Save the captcha image, ask the user for it, log in again.
                with open('./captcha/a.gif', 'wb') as f:
                    f.write(client.get_captcha())
                captcha = input('please input captcha:')
                client.login(account, password, captcha)
            client.save_token(TOKEN_FILE)
    except Exception:
        logging.getLogger(__name__).exception('zhihu login failed')
    return client
class ZhihuClientClassTest(unittest.TestCase):
    """Base test case that prepares ``self.client`` from 'test/token.pkl'.

    Every test is skipped when the token file is missing or when loading it
    raises ValueError.
    """

    def setUp(self):
        super(ZhihuClientClassTest, self).setUp()

        # No 'test' directory here but a token beside us: presumably the run
        # started inside the test directory, so step up one level.
        if not os.path.isdir('test'):
            if os.path.isfile('token.pkl'):
                os.chdir('..')

        token_path = 'test/token.pkl'
        if not os.path.isfile(token_path):
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')

        self.client = ZhihuClient()

        try:
            self.client.load_token(token_path)
        except ValueError:
            print('\ntoken version not math python version, skip all tests.')
            self.skipTest('token version not math python version.')
class ZhihuClientClassTest(unittest.TestCase):
    """Shared fixture: builds ``self.client`` from the token in 'test/'.

    Tests are skipped (not failed) when 'test/token.pkl' does not exist or
    when ``load_token`` raises ValueError.
    """

    def setUp(self):
        super(ZhihuClientClassTest, self).setUp()

        # Presumably handles being launched from inside the test directory.
        launched_inside_test_dir = (
            not os.path.isdir('test') and os.path.isfile('token.pkl'))
        if launched_inside_test_dir:
            os.chdir('..')

        token_file = 'test/token.pkl'
        if os.path.isfile(token_file):
            self.client = ZhihuClient()
            try:
                self.client.load_token(token_file)
            except ValueError:
                print(
                    '\ntoken version not math python version, skip all tests.')
                self.skipTest('token version not math python version.')
        else:
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')
Beispiel #16
0
class ZhihuClientClassTest(unittest.TestCase):
    """Base test case providing ``self.client`` loaded from a token file.

    The token is expected at ``test/<TOKEN_FILE_NAME>``. All tests are
    skipped when it is missing or when loading it raises ValueError.
    """

    def setUp(self):
        super(ZhihuClientClassTest, self).setUp()

        # No 'test' directory but the token file right here: presumably the
        # run started inside the test directory, so go up one level.
        if not os.path.isdir('test'):
            if os.path.isfile(TOKEN_FILE_NAME):
                os.chdir('..')

        token_path = os.path.join('test', TOKEN_FILE_NAME)
        token_present = os.path.isfile(token_path)
        if not token_present:
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')

        self.client = ZhihuClient()

        try:
            self.client.load_token(token_path)
        except ValueError:
            print('\ntoken version not math python version, skip all tests.')
            self.skipTest('token version not math python version.')
Beispiel #17
0
def download(uid):
    """Save every answer written by one Zhihu user into a local directory.

    For each answer the images are fetched with ``downloadImg``, the answer
    is converted with ``dealArticle`` and the result is written to
    ``<uid>/<title>``. The directory is created on demand.

    :param uid: id of the user; also used as the output directory name.
    """
    client = ZhihuClient()
    client.load_token('token.pk1')  # log in
    person = client.people(uid)

    for position, answer in enumerate(person.answers):
        # Images first, then the text.
        downloadImg(answer.content)
        parsed = dealArticle(answer)

        # Make sure the output directory exists before writing.
        if not os.path.exists(uid):
            os.makedirs(uid)

        target = os.path.join(uid, parsed['title'])
        with open(target, 'w+') as out:
            out.write(parsed['content'])

        print('[%s] download %s success!' %
              (str(position), answer.question.title))
Beispiel #18
0
class ZhihuClientClassTest(unittest.TestCase):
    """Shared fixture: ``self.client`` loaded from ``test/<TOKEN_FILE_NAME>``.

    Tests are skipped (not failed) when the token file is absent or when
    ``load_token`` raises ValueError.
    """

    def setUp(self):
        super(ZhihuClientClassTest, self).setUp()

        # Presumably handles being launched from inside the test directory.
        launched_inside_test_dir = (
            not os.path.isdir('test') and os.path.isfile(TOKEN_FILE_NAME))
        if launched_inside_test_dir:
            os.chdir('..')

        token_file = os.path.join('test', TOKEN_FILE_NAME)
        if os.path.isfile(token_file):
            self.client = ZhihuClient()
            try:
                self.client.load_token(token_file)
            except ValueError:
                print(
                    '\ntoken version not math python version, skip all tests.')
                self.skipTest('token version not math python version.')
        else:
            print('\nno token file, skip all tests.')
            self.skipTest('no token file.')
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # timestamp taken at script start

# ======================== login ========================
from zhihu_oauth import ZhihuClient

# Authenticate from a previously saved token file (absolute, user-specific
# path); no interactive login is attempted.
client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ======================== me ========================
# Object for the logged-in account.
me = client.me()
# print('活动', me.activities)
# print('答案数', me.answer_count)
# print('答案', me.answers)
# print('文章', me.articles)
# print('文章数', me.articles_count)
# print('头像地址', me.avatar_url)
# print('用户所在行业', me.business)
# print('收藏数', me.collected_count)
# print('收藏夹数', me.collection_count)
# print('收藏夹', me.collections)
# print('专栏数', me.column_count)
# print('专栏', me.columns)
# print('专栏数', me.columns_count)
# created_at = time.localtime(me.created_at)
# print('创建时间', time.strftime("%Y-%m-%d %H:%M:%S", created_at))
# print('个人描述', me.description)
# print('草稿数', me.draft_count)
# print('教育信息', me.educations)
Beispiel #20
0
from zhihu_oauth import ZhihuClient
import datetime
import time
import random
import sys
from timeout import timeout
import os
from utils import print_err
from pymongo import MongoClient

MAX_SLEEP_TIME = 15
# One token file per worker; the worker index is the first CLI argument.
Cookies_File = './cookies/cookies%s.json' % sys.argv[1]
# `client` is a module-level name already, so the former `global client`
# statement here had no effect and was dropped.
client = ZhihuClient()
if os.path.isfile(Cookies_File):
    client.load_token(Cookies_File)
else:
    # Tab-separated credentials, one account per line; the line is selected
    # by the worker index. The file is closed as soon as it has been read.
    with open('./cookies/client_info_list.data') as info_file:
        client_info = info_file.readlines()
    client_info = client_info[int(sys.argv[1])].strip().split('\t')
    client.login_in_terminal(client_info[0], client_info[1])
    client.save_token(Cookies_File)


def get_user_questions(uname):
    global client
    if uname == '':
        return
    print(uname)

    user_questions = dict()
    try:
Beispiel #21
0
# coding=utf-8

from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient


# File in which the login token is kept between runs.
TOKEN_FILE = 'ZHIHUTOKEN.pkl'


client = ZhihuClient()

# First run: log in interactively and remember the token.
# Later runs: reuse the saved token.
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)
Beispiel #22
0
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # timestamp taken at script start

# ======================== login ========================
from zhihu_oauth import ZhihuClient

# Authenticate from a previously saved token file (absolute, user-specific
# path); no interactive login is attempted.
client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ======================== me ========================
# Object for the logged-in account.
me = client.me()
# print('活动', me.activities)
# print('答案数', me.answer_count)
# print('答案', me.answers)
# print('文章', me.articles)
# print('文章数', me.articles_count)
# print('头像地址', me.avatar_url)
# print('用户所在行业', me.business)
# print('收藏数', me.collected_count)
# print('收藏夹数', me.collection_count)
# print('收藏夹', me.collections)
# print('专栏数', me.column_count)
# print('专栏', me.columns)
# print('专栏数', me.columns_count)
# created_at = time.localtime(me.created_at)
# print('创建时间', time.strftime("%Y-%m-%d %H:%M:%S", created_at))
# print('个人描述', me.description)
# print('草稿数', me.draft_count)
# print('教育信息', me.educations)
Beispiel #23
0
# @Time   : 2017/5/3 14:27
# @Author : Lyrichu
# @Email  : [email protected]
# @File   : save_images.py
'''
@Description:保存知乎某个问题下所有答案的图片
'''
from __future__ import print_function  # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib.request

client = ZhihuClient()
# Log in with a previously saved token file.
client.load_token('token.pkl')
id = 24400664  # https://www.zhihu.com/question/24400664
question = client.question(id)
print(u"问题:", question.title)
print(u"回答数量:", question.answer_count)
# Directory that receives the images. It is created only when missing, so a
# second run no longer fails on the already existing directory.
path = question.title + u"(图片)"
if not os.path.exists(path):
    os.mkdir(path)
index = 1  # running image number
for answer in question.answers:
    content = answer.content  # 回答内容
    re_compile = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    img_lists = re.findall(re_compile, content)
    if (img_lists):
        for img in img_lists:
Beispiel #24
0
#!/usr/bin/env python
# coding: utf-8
import os
from zhihu_oauth import ZhihuClient, ActType, People
from zhihu_oauth.exception import NeedCaptchaException
from zhihu_oauth.helpers import ts2str, act2str

# Token file used to persist the login between runs.
token = './XXX.pk1'
client = ZhihuClient()

# Reuse the saved token when the file exists; otherwise log in with the
# placeholder credentials below.
try:
    if os.path.exists(token):
        client.load_token(token)
    else:
        client.login('username', 'passwd')
except NeedCaptchaException:
    # Save the captcha image, prompt for its text, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    # raw_input is the Python 2 builtin.
    captcha = raw_input('please input captcha:')
    client.login('username', 'passwd', captcha)
# Runs on every path, including right after the token was loaded from file.
client.save_token(token)


def dump_activities(pid):
    person = client.people(pid)
    filter_types = {
        ActType.COLLECT_ANSWER,
        ActType.COLLECT_ANSWER,
        ActType.COLLECT_ARTICLE,
        ActType.CREATE_ANSWER,
Beispiel #25
0
# Python 2 only: reload(sys) makes setdefaultencoding available again.
reload(sys)
sys.setdefaultencoding('utf-8') # force utf-8 as the default encoding

from zhihu_oauth  import  ZhihuClient

from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

# Placeholder credentials and the file the token is persisted in.
test_email = '*****@*****.**'
test_password = '******'
token_file = './token.pkl'

# lexists is also true for a broken symlink at that path.
if os.path.lexists(token_file):
    client.load_token(token_file)
    print 'load token success'
else:
    try:
        login_result = client.login(test_email, test_password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        # Message: "login failed, a captcha is required"
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    # NOTE(review): the token is saved without inspecting login_result.
    client.save_token(token_file)
    print 'save token success'
Beispiel #26
0
def login():
    """Log in, then save every answer of one topic's unanswered questions.

    Authentication uses 'token.pkl' when present, otherwise an interactive
    terminal login whose token is then saved. For each question returned by
    ``topic.unanswered_questions`` of topic 19560072 the id, title and answer
    count are printed and every answer is saved under
    ``Data/Gene/<question id>#<title>`` as ``<author id>#<author name>``.
    Finally the summed answer count is printed.

    The directory is now built with ``os.path.join`` instead of hard-coded
    backslashes, so the layout is the same on Windows and becomes a real
    nested directory elsewhere. A large string literal that only held
    disabled example code was removed.
    """
    token_path = 'token.pkl'
    client = ZhihuClient()

    if os.path.isfile(token_path):
        client.load_token(token_path)
    else:
        client.login_in_terminal()
        client.save_token(token_path)

    topic = client.topic(19560072)  # topic: genetically modified organisms

    answers_count = 0
    for question in topic.unanswered_questions:
        print(question.id)
        print(question.title)
        print(question.answer_count)
        answers_count += question.answer_count
        save_dir = os.path.join(
            'Data', 'Gene', str(question.id) + '#' + question.title)
        for answer in question.answers:
            print(answer.author.id, answer.author.name)
            answer.save(save_dir,
                        str(answer.author.id) + '#' + answer.author.name)
    print("总共有{0}个回答".format(answers_count))
Beispiel #27
0
# Python 2 only: reload(sys) makes setdefaultencoding available again.
reload(sys)
sys.setdefaultencoding('utf-8')  # force utf-8 as the default encoding

from zhihu_oauth import ZhihuClient

from zhihu_oauth.exception import NeedCaptchaException

client = ZhihuClient()

# Placeholder credentials and the file the token is persisted in.
test_email = '*****@*****.**'
test_password = '******'
token_file = './token.pkl'

# lexists is also true for a broken symlink at that path.
if os.path.lexists(token_file):
    client.load_token(token_file)
    print 'load token success'
else:
    try:
        login_result = client.login(test_email, test_password)
    except NeedCaptchaException:
        # Save the captcha image, prompt for its text, then log in again.
        # Message: "login failed, a captcha is required"
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    # NOTE(review): the token is saved without inspecting login_result.
    client.save_token(token_file)
    print 'save token success'
Beispiel #28
0
class Crawl:
    """Fetches the logged-in user's Zhihu Lives and stores them.

    Lives are stored as ``MyLive`` documents and their messages as
    ``LiveContent`` documents; image messages are additionally downloaded to
    'app/Resource/'.
    """

    def __init__(self):
        self.client = ZhihuClient()

    def login(self, username, password):
        """Authenticate ``self.client``.

        A token stored at 'app/Resource/<username>.token' is reused when it
        exists. Otherwise a password login is performed (prompting for a
        captcha when one is required) and the token is saved to that path.
        """
        token_path = 'app/Resource/' + username + '.token'
        if os.path.isfile(token_path):
            self.client.load_token(token_path)
            return
        try:
            self.client.login(username, password)
        except NeedCaptchaException:
            # Save the captcha image, prompt for its text, log in again.
            with open('a.gif', 'wb') as f:
                f.write(self.client.get_captcha())
            captcha = input('please input captcha:')
            self.client.login(username, password, captcha)
        self.client.save_token(token_path)

    def get_live_list(self):
        """Return the ``lives`` of the logged-in user."""
        lives = self.client.me().lives
        return lives

    @staticmethod
    def save_live_list(livedata):
        """Store one live as a new ``MyLive`` document."""
        new_live = MyLive(live_id=livedata.id,
                          title=livedata.title,
                          speaker=livedata.speaker.name,
                          speaker_description=livedata.speaker.description,
                          live_description=livedata.description,
                          seats_count=livedata.seat_taken,
                          price=livedata.fee)
        new_live.save()

    def live_list_work(self):
        """Store every live of the user that is not stored yet."""
        for live in self.get_live_list():
            exist = MyLive.objects(live_id=live.id)
            if not exist:
                self.save_live_list(live)

    def get_live_content(self, live_id, before_id=''):
        """Fetch one page of messages of a live and return the decoded JSON.

        :param live_id: Zhihu id of the live.
        :param before_id: second value formatted into LIVECONTENT_URL;
            presumably restricts the page to messages before that id.
        """
        res = self.client._session.get(
            LIVECONTENT_URL.format(live_id, before_id))
        data = json.loads(res.content)
        return data

    def save_live_content_image(self, id, url):
        """Download ``url`` to 'app/Resource/<id>.png'."""
        content = self.client._session.get(url).content
        file = 'app/Resource/' + str(id) + '.png'
        with open(file, 'wb') as f:
            f.write(content)

    @staticmethod
    def save_live_content(live_id, livedata):
        """Store the messages of one page, skipping those already stored.

        :param live_id: value stored in the ``live_title`` field.
        :param livedata: decoded page; its 'data' entry lists the messages.
        """
        for r in livedata['data']:
            exist = LiveContent.objects(message_id=r['id'])
            if exist:
                continue

            # Only audio and image messages carry a url.
            if r['type'] == 'audio':
                url = r['audio']['url']
            elif r['type'] == 'image':
                url = r['image']['full']['url']
            else:
                url = ''
            content = r['text'] if 'text' in r else ''
            reply = ','.join(r['replies']) if 'replies' in r else ''

            new_live_content = LiveContent(
                message_id=int(r['id']),
                sender=r['sender']['member']['name'],
                type=r['type'],
                content=content,
                url=url,
                reply=reply,
                likes=r['likes']['count'],
                created_at=datetime.fromtimestamp((r['created_at'])),
                live_title=live_id)
            new_live_content.save()

    def _iter_live_pages(self, live_id):
        """Yield every message page of a live, including the final one."""
        data = self.get_live_content(live_id)
        while True:
            yield data
            # Stop once nothing is left to load. An empty page also stops
            # the walk because there is no id to continue from.
            if data['unload_count'] <= 0 or not data['data']:
                return
            data = self.get_live_content(live_id, data['data'][0]['id'])

    def live_content_work(self, id):
        """Store all messages of one live and download its images.

        The page whose ``unload_count`` is 0 is stored as well; the previous
        loop stopped before saving it, so the last page was always lost.

        :param id: database id of the ``MyLive`` document.
        """
        live = MyLive.objects(id=id).first()
        # Pages are fetched with the Zhihu live id but stored under the
        # database id.
        for data in self._iter_live_pages(live.live_id):
            self.save_live_content(live.id, data)
        print('success')

        image_contents = LiveContent.objects(live_title=live.id, type='image')
        for item in image_contents:
            self.save_live_content_image(item.id, item.url)
Beispiel #29
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import logging

from logging import StreamHandler
from flask import Flask, jsonify, redirect
from flask_cache import Cache
from zhihu_oauth import ZhihuClient

# Authenticate once at import time from a previously saved token file.
client = ZhihuClient()
client.load_token('token.pkl')

# Object for the logged-in account.
me = client.me()

# Cache configured with the 'simple' backend type.
cache = Cache(config={'CACHE_TYPE': 'simple'})

app = Flask(__name__)
cache.init_app(app)

# Despite its name this is a StreamHandler: created without arguments it
# writes to stderr, not to a file.
file_handler = StreamHandler()
app.logger.setLevel(logging.DEBUG)
app.logger.addHandler(file_handler)


@app.route('/', methods=['GET'])
def index_route():
    return jsonify({
        'author': 'knarfeh',
        'author_url': 'http://www.knarfeh.com',
        'people': 'http://zhihu-api.knarfeh.com/people/<people_id>',
Beispiel #30
0
class ZhiHu(object):
    """Collects question data from Zhihu topics into the 'zhihu.db' store."""

    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """Log in, then open the database."""
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """Create ``self.client`` and authenticate it.

        The token file is loaded when it exists; otherwise an interactive
        terminal login is done and the token is saved.
        :return:
        """
        self.client = ZhihuClient()
        token_path = self.TOKEN_FILE
        if not os.path.isfile(token_path):
            self.client.login_in_terminal()
            self.client.save_token(token_path)
            return
        self.client.load_token(token_path)

    def save_quesions(self, topic_id):
        """Store the unanswered questions of a topic.

        Questions with fewer than 10 answers are ignored. One status line is
        printed for each stored row.
        :param topic_id:
        :return:
        """
        topic = self.client.topic(topic_id)
        print(topic)
        insert_sql = 'replace into questions values(?,?,?,?,?,?)'
        for question in topic.unanswered_questions:
            if question.answer_count >= 10:
                row = [question.id, question.title, question.follower_count,
                       question.answer_count, question.comment_count,
                       topic_id]
                print(row)
                stored = self.db.update(insert_sql, args=row)
                print('insert success!' if stored else 'insert error!')

    def save_answer_info(self, question_id):
        """Print a summary of the first answer of a question and save it.

        Only the first answer is handled.
        :param question_id:
        :return:
        """
        question = self.client.question(question_id)
        print(question.title)
        for answer in question.answers:
            print(answer.comment_count, answer.excerpt, answer.question, answer.thanks_count,
                  answer.voteup_count)
            answer.save()
            return

    def to_md(self, topic, file_name):
        """Write the stored questions of a topic as a numbered Markdown list.

        At most 1000 rows are requested, ordered by follower count
        (descending).

        NOTE(review): ``topic`` is interpolated into the SQL text; switch to
        a parameterized query if ``self.db.query`` supports one.

        :param topic: topic id used in the WHERE clause.
        :param file_name: output file, written as utf8.
        """
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        rows = self.db.query(sql)
        line_tmp = "%s. [%s](https://www.zhihu.com/question/%s) 关注数:%s 回答数:%s 评论数:%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as out:
            for rank, item in enumerate(rows, 1):
                fields = (rank, item['title'], item['id'],
                          item['follower_count'], item['answer_count'],
                          item['comment_count'])
                out.write(line_tmp % fields)
Beispiel #31
0
import json
import datetime

from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException

# Authenticate from a previously saved token file (absolute path).
client = ZhihuClient()
client.load_token('/home/wz/ZheProject/Zhihu/Samples/token.pkl')
# A disabled password/captcha login that embedded account credentials used to
# be kept here as commented-out code; it was removed from the comments.

# Fetch the hot list through the authenticated session.
res = client.test_api(
    method="GET",
    url="https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total")
data = res.json()['data']
# The ISO timestamp (local time, contains ':' characters) becomes part of the
# output file name.
now = datetime.datetime.now().isoformat()
with open('/home/wz/ZheProject/Zhihu/Data/Hotlist_%s.json' % now, 'w') as f:
    json.dump(data, f)
Beispiel #32
0
from zhihu_oauth import ZhihuClient

# Authenticate from a previously saved token file.
client = ZhihuClient()
client.load_token('token.pkl')
# Hard-coded user id; meant to be replaced by user input.
user = client.people('SakuraNekoq')

# Print the name of the user's business and of the first location.
# NOTE(review): the second line indexes locations[0] without checking that
# any location exists - confirm how an empty profile should be handled.
print('business', user.business.name)
print('locations', user.locations[0].name)
Beispiel #33
0
# coding=utf-8
# https://pypi.org/project/zhihu-oauth

from __future__ import unicode_literals, print_function

import os

from zhihu_oauth import ZhihuClient
from zhihu_oauth import SearchType

# File in which the login token is kept between runs.
TOKEN_FILE = 'token.pkl'

client = ZhihuClient()

# Reuse the saved token when there is one; otherwise log in interactively
# and remember the token for the next run.
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

me = client.me()

print('name', me.name)
print('headline', me.headline)
print('description', me.description)

print('following topic count', me.following_topic_count)
# This line used to print following_topic_count a second time under the
# "following people" label.
print('following people count', me.following_count)
print('followers count', me.follower_count)

print('voteup count', me.voteup_count)
class zhihuspider(basespider):
    """Spider that collects a Zhihu user's activities through zhihu_oauth.

    NOTE(review): ``self.allConfig`` and ``self.socialRoot`` are read but
    never assigned in this class; presumably ``basespider`` provides them -
    confirm.
    """

    def __init__(self):
        """Run the base-class and own config/prepare steps, then log in."""
        # The base-class versions are invoked explicitly before this class's
        # overrides of the same methods.
        super().loadConfig()
        super().prepare()
        self.loadConfig()
        self.prepare()
        self.login()

    def loadConfig(self):
        """Read the 'zhihu' config section and derive paths and url templates."""
        self.config = self.allConfig['zhihu']

        # All data files live below socialRoot + data_path.
        self.data_path = self.socialRoot + self.config['data_path']
        self.TOKEN_FILE = self.data_path + self.config['TOKEN_FILE']
        self.friends_file = self.data_path + self.config['friends_file']

        # Templates for the source_url of an activity target.
        self.url_template_question = "https://www.zhihu.com/question/%s"
        self.url_template_answer = "https://www.zhihu.com/question/%s/answer/%s"
        self.url_template_article = "https://zhuanlan.zhihu.com/p/%s"

    def prepare(self):
        """Create the data directory, load the name map, build the client."""
        if not os.path.isdir(self.data_path): os.makedirs(self.data_path)

        # name_map maps a followed user's display name to their id (it is
        # filled by followings2name_map) and is persisted with pickle.
        if os.path.isfile(self.friends_file):
            with open(self.friends_file, "rb") as f:
                self.name_map = pickle.load(f)
        else:
            self.name_map = dict()

        self.client = ZhihuClient()

    def login(self):
        """Authenticate ``self.client`` and store the account in ``self.me``.

        The token file is loaded when present; otherwise an interactive
        terminal login is done and the token saved. When ``self.me.over`` is
        truthy afterwards, an error is logged and the interactive login is
        repeated.
        """
        if os.path.isfile(self.TOKEN_FILE):
            self.client.load_token(self.TOKEN_FILE)
        else:
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)

        self.me = self.client.me()
        # NOTE(review): self.me is not refreshed after this second login.
        if self.me.over:
            logging.error("login failed! Reason is " + self.me.over_reason)
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)

    def followings2name_map(self, me):
        """Record name -> id for everyone ``me`` follows and persist the map."""
        for peo in me.followings:
            self.name_map[peo.name] = peo.id
        with open(self.friends_file, "wb") as f:
            pickle.dump(self.name_map, f)

    def getActivities(self,
                      userid,
                      count=10,
                      timeOldest=None,
                      timeLatest=None):
        """Return up to ``count`` activities of a user as a list of dicts.

        ``userid`` is first tried as given (an int is converted to str).
        When the resulting profile has ``over`` set, it is looked up as a
        display name in ``self.name_map`` (refreshing the map from the
        followings of ``self.me`` if the name is unknown) and the mapped id
        is tried instead. An empty list is returned when the profile still
        has ``over`` set.

        Each entry holds: username, avatar_url, headline, time (a
        ``time.localtime`` struct), actionType, summary, targetText, topics,
        source_url and, when images were found, imgs.

        The first six items of ``timeOldest`` / ``timeLatest`` are passed to
        ``datetime.datetime``. Activities later than ``timeLatest`` are
        skipped; iteration stops at the first activity earlier than
        ``timeOldest`` (this presumably relies on activities arriving newest
        first - confirm).

        An exception raised while handling one activity is logged and that
        activity is skipped.

        Action types named by the original note: CREATE_ANSWER,
        CREATE_ARTICLE, CREATE_QUESTION, FOLLOW_QUESTION, VOTEUP_ANSWER.
        """
        def getTargetText_Topic(target, actType):
            # Returns (text, topics, url) for the target of an activity;
            # targets of any other type yield empty values. actType is not
            # used.
            if isinstance(target, zhihu_oauth.Answer):
                return (target.content, target.question.topics,
                        self.url_template_answer %
                        (target.question.id, target.id))
            elif isinstance(target, zhihu_oauth.Question):
                return (target.detail, target.topics,
                        self.url_template_question % (target.id))
            elif isinstance(target, zhihu_oauth.Article):
                return (target.content, [],
                        self.url_template_article % (target.id))
            else:
                return ("", [], "")

        if isinstance(userid, int): userid = str(userid)
        # Kept for the error message, since userid may be replaced below.
        backuserid = userid
        dtLatest = datetime.datetime(*timeLatest[0:6]) if timeLatest else None
        dtOldest = datetime.datetime(*timeOldest[0:6]) if timeOldest else None

        pp = self.client.people(userid)
        if pp.over:
            # Treat userid as a display name and resolve it via name_map.
            if userid not in self.name_map:
                try:
                    self.followings2name_map(self.me)
                except Exception as e:
                    logging.error(str(e))
            if userid in self.name_map:
                userid = self.name_map[userid]
                pp = self.client.people(userid)
            if pp.over: return []

        activityList = []

        cnt = 0  # number of entries accepted so far
        for act in pp.activities:
            try:
                targetInfo = getTargetText_Topic(act.target, act.type)
                entry = {
                    'username': pp.name,
                    'avatar_url': pp.avatar_url,
                    'headline': pp.headline,
                    'time': time.localtime(act.created_time),
                    'actionType': act.type,
                    'summary': act2str(act),
                    'targetText': targetInfo[0],
                    'topics': list(map(lambda topic: topic.name,
                                       targetInfo[1])),
                    'source_url': targetInfo[2]
                }

                # Collect the src of every <img> tag in the target text.
                imglist = re.findall(r'(?<=<img src=")(.*?)(?=")',
                                     entry['targetText'])
                # An article's own image_url goes first in the list.
                if isinstance(act.target,
                              zhihu_oauth.Article) and act.target.image_url:
                    imglist[0:0] = [act.target.image_url]
                if imglist: entry['imgs'] = imglist

                # Apply the time window only after the entry has been built.
                dt = datetime.datetime(*entry['time'][0:6])
                if dtLatest and dtLatest < dt: continue
                if dtOldest and dtOldest > dt: break
                activityList.append(entry)

                cnt += 1
                if cnt >= count: break
            except Exception as e:
                logging.error("getActivities of " + backuserid + " failed")
                traceback.print_exc()

        return activityList