Beispiel #1
0
def LoginZhihuClient(token_name):
    """Log in to Zhihu and return the ``me`` object of the client.

    A token cached in ``liuximing.pkl`` is loaded when that file exists;
    otherwise an interactive terminal login is run and the resulting token
    is saved to the same file.

    NOTE(review): ``token_name`` is accepted but never read; the token file
    name is fixed.
    """
    token_path = 'liuximing.pkl'
    zhihu = ZhihuClient()
    if not os.path.isfile(token_path):
        # No cached token yet: authenticate once and remember the result.
        zhihu.login_in_terminal()
        zhihu.save_token(token_path)
    else:
        zhihu.load_token(token_path)
    return zhihu.me()
Beispiel #2
0
def zhihu_login():
    """Return a logged-in ``ZhihuClient`` and print the account's name.

    The token stored at ``TOKEN_FILE`` is reused when the file exists;
    otherwise the client logs in with credentials and saves a new token.
    """
    zhihu = ZhihuClient()

    if not os.path.isfile(TOKEN_FILE):
        # NOTE(review): credentials are hard-coded here; consider reading
        # them from configuration or the environment instead.
        zhihu.login('*****@*****.**', 'a4906639')
        zhihu.save_token(TOKEN_FILE)
    else:
        zhihu.load_token(TOKEN_FILE)

    # Printing the name doubles as a check that the session works.
    print(zhihu.me().name)
    return zhihu
Beispiel #3
0
def zhihu_login():
    """Return a logged-in ``ZhihuClient`` and print the account's name.

    The token stored at ``TOKEN_FILE`` is reused when the file exists;
    otherwise the client logs in with credentials and saves a new token.
    """
    zhihu = ZhihuClient()

    if not os.path.isfile(TOKEN_FILE):
        # NOTE(review): credentials are hard-coded here; consider reading
        # them from configuration or the environment instead.
        zhihu.login('*****@*****.**', 'a4906639')
        zhihu.save_token(TOKEN_FILE)
    else:
        zhihu.load_token(TOKEN_FILE)

    # Printing the name doubles as a check that the session works.
    print(zhihu.me().name)
    return zhihu
Beispiel #4
0
import os

from zhihu_oauth import ZhihuClient
from zhihu_oauth import SearchType

# File in which the OAuth token is cached between runs.
TOKEN_FILE = 'token.pkl'

client = ZhihuClient()

# Reuse the cached token when it exists; otherwise log in interactively
# and cache the new token for the next run.
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

me = client.me()

# Basic profile information.
print('name', me.name)
print('headline', me.headline)
print('description', me.description)

# Follow statistics. The "following people" line previously printed
# following_topic_count a second time; following_count is the number of
# people the account follows.
print('following topic count', me.following_topic_count)
print('following people count', me.following_count)
print('followers count', me.follower_count)

# Feedback received.
print('voteup count', me.voteup_count)
print('get thanks count', me.thanked_count)

# Content counters.
print('answered question', me.answer_count)
print('question asked', me.question_count)
print('collection count', me.collection_count)
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # timestamp taken at script start

# ========================Login========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
# Load a previously saved token from a fixed, user-specific path.
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ========================Me========================
me = client.me()
# Disabled probes of the attributes read from `me`:
# print('activities', me.activities)
# print('answer count', me.answer_count)
# print('answers', me.answers)
# print('articles', me.articles)
# print('article count', me.articles_count)
# print('avatar URL', me.avatar_url)
# print('business (industry) of the user', me.business)
# print('collected count', me.collected_count)
# print('collection count', me.collection_count)
# print('collections', me.collections)
# print('column count', me.column_count)
# print('columns', me.columns)
# print('column count', me.columns_count)
# created_at = time.localtime(me.created_at)
# print('created at', time.strftime("%Y-%m-%d %H:%M:%S", created_at))
# print('description', me.description)
# print('draft count', me.draft_count)
# print('educations', me.educations)
Beispiel #6
0
class Crawl:
    """Crawler for the Zhihu Lives of a logged-in account.

    Wraps a ``ZhihuClient`` and persists results through the ``MyLive`` and
    ``LiveContent`` models; tokens and images go under ``app/Resource/``.
    """

    def __init__(self):
        self.client = ZhihuClient()

    def login(self, username, password):
        """Authenticate the client, reusing a cached token when available.

        The token is cached per user in ``app/Resource/<username>.token``.
        If the login asks for a captcha, the captcha image is written to
        ``a.gif`` and the answer is read from standard input.
        """
        token_file = 'app/Resource/' + username + '.token'
        if os.path.isfile(token_file):
            self.client.load_token(token_file)
            return

        try:
            self.client.login(username, password)
        except NeedCaptchaException:
            # Save the captcha image, ask the user for it and log in again.
            with open('a.gif', 'wb') as f:
                f.write(self.client.get_captcha())
            captcha = input('please input captcha:')
            self.client.login(username, password, captcha)
        self.client.save_token(token_file)

    def get_live_list(self):
        """Return the ``lives`` of the logged-in user."""
        return self.client.me().lives

    @staticmethod
    def save_live_list(livedata):
        """Store one live object as a new ``MyLive`` document."""
        new_live = MyLive(live_id=livedata.id,
                          title=livedata.title,
                          speaker=livedata.speaker.name,
                          speaker_description=livedata.speaker.description,
                          live_description=livedata.description,
                          seats_count=livedata.seat_taken,
                          price=livedata.fee)
        new_live.save()

    def live_list_work(self):
        """Save every live of the user that is not stored yet."""
        for live in self.get_live_list():
            if not MyLive.objects(live_id=live.id):
                self.save_live_list(live)

    def get_live_content(self, live_id, before_id=''):
        """Fetch one page of messages of a live and return the parsed JSON.

        ``live_id`` and ``before_id`` are substituted, in this order, into
        ``LIVECONTENT_URL``.
        """
        res = self.client._session.get(
            LIVECONTENT_URL.format(live_id, before_id))
        return json.loads(res.content)

    def save_live_content_image(self, id, url):
        """Download ``url`` and write it to ``app/Resource/<id>.png``."""
        content = self.client._session.get(url).content
        file = 'app/Resource/' + str(id) + '.png'
        with open(file, 'wb') as f:
            f.write(content)

    @staticmethod
    def save_live_content(live_id, livedata):
        """Store the messages in ``livedata['data']`` as ``LiveContent``.

        Messages whose id is already stored are skipped. ``live_id`` is
        written to the ``live_title`` field of each new document.
        """
        for r in livedata['data']:
            if LiveContent.objects(message_id=r['id']):
                continue

            # Only audio and image messages carry a media URL.
            kind = r['type']
            if kind == 'audio':
                url = r['audio']['url']
            elif kind == 'image':
                url = r['image']['full']['url']
            else:
                url = ''
            content = r['text'] if 'text' in r else ''
            reply = ','.join(r['replies']) if 'replies' in r else ''

            new_live_content = LiveContent(
                message_id=int(r['id']),
                sender=r['sender']['member']['name'],
                type=kind,
                content=content,
                url=url,
                reply=reply,
                likes=r['likes']['count'],
                created_at=datetime.fromtimestamp((r['created_at'])),
                live_title=live_id)
            new_live_content.save()

    def _save_all_pages(self, live):
        """Fetch and store every message page of ``live``.

        Each fetched page is saved, including the one that reports
        ``unload_count`` of zero (presumably "nothing left to load" --
        confirm against the API); previously that last page was fetched but
        never stored. An empty page ends the crawl instead of raising
        ``IndexError``.
        """
        # The Zhihu live id is used to fetch the details ...
        data = self.get_live_content(live.live_id)
        while data['data']:
            # ... while the database id is used when storing them.
            self.save_live_content(live.id, data)
            if data['unload_count'] <= 0:
                break
            # Continue from the first message of the current page.
            data = self.get_live_content(live.live_id, data['data'][0]['id'])

    def live_content_work(self, id):
        """Crawl all messages of the stored live ``id`` and its images.

        ``id`` is the database id of a ``MyLive`` document. After all pages
        are stored, the image of every stored image message of that live is
        downloaded.
        """
        live = MyLive.objects(id=id).first()
        self._save_all_pages(live)
        print('success')

        image_contents = LiveContent.objects(live_title=live.id, type='image')
        for item in image_contents:
            self.save_live_content_image(item.id, item.url)
class zhihuspider(basespider):
    """Spider that collects Zhihu user activities through ``ZhihuClient``.

    Settings are read from ``self.allConfig['zhihu']`` and paths are built
    on ``self.socialRoot``; neither is defined in this class (presumably
    provided by ``basespider`` -- confirm).
    """

    def __init__(self):
        # Run the base-class hooks first, then this class's own versions.
        super().loadConfig()
        super().prepare()
        self.loadConfig()
        self.prepare()
        self.login()

    def loadConfig(self):
        """Read the ``zhihu`` config section and derive paths and URLs."""
        self.config = self.allConfig['zhihu']

        self.data_path = self.socialRoot + self.config['data_path']
        self.TOKEN_FILE = self.data_path + self.config['TOKEN_FILE']
        self.friends_file = self.data_path + self.config['friends_file']

        self.url_template_question = "https://www.zhihu.com/question/%s"
        self.url_template_answer = "https://www.zhihu.com/question/%s/answer/%s"
        self.url_template_article = "https://zhuanlan.zhihu.com/p/%s"

    def prepare(self):
        """Create the data directory, load the name map, build the client."""
        os.makedirs(self.data_path, exist_ok=True)

        if os.path.isfile(self.friends_file):
            # NOTE: pickle must only be used on trusted files; this one is
            # written by followings2name_map().
            with open(self.friends_file, "rb") as f:
                self.name_map = pickle.load(f)
        else:
            self.name_map = {}

        self.client = ZhihuClient()

    def _login_in_terminal(self):
        """Log in interactively and cache the resulting token."""
        self.client.login_in_terminal()
        self.client.save_token(self.TOKEN_FILE)

    def login(self):
        """Authenticate the client and set ``self.me``.

        A cached token is used when ``self.TOKEN_FILE`` exists, otherwise an
        interactive login is run. If the resulting ``me`` object reports
        ``over``, the failure is logged, an interactive login is run again
        and ``self.me`` is refreshed so it no longer refers to the failed
        session.
        """
        if os.path.isfile(self.TOKEN_FILE):
            self.client.load_token(self.TOKEN_FILE)
        else:
            self._login_in_terminal()

        self.me = self.client.me()
        if self.me.over:
            # %-style arguments keep logging safe if over_reason is not a str.
            logging.error("login failed! Reason is %s", self.me.over_reason)
            self._login_in_terminal()
            self.me = self.client.me()

    def followings2name_map(self, me):
        """Map the name of each of ``me.followings`` to its id and save it.

        The updated ``self.name_map`` is pickled to ``self.friends_file``.
        """
        for peo in me.followings:
            self.name_map[peo.name] = peo.id
        with open(self.friends_file, "wb") as f:
            pickle.dump(self.name_map, f)

    def getActivities(self,
                      userid,
                      count=10,
                      timeOldest=None,
                      timeLatest=None):
        """Return up to ``count`` activities of a user as a list of dicts.

        Args:
            userid: Id of the user (an int is converted to str). If the
                lookup reports ``over``, ``userid`` is treated as a display
                name and resolved through ``self.name_map``, which is first
                refreshed from the followings of ``self.me`` when the name
                is unknown.
            count: Collection stops once this many entries were gathered.
            timeOldest: Optional time tuple (first six items are used);
                iteration stops at the first older activity.
            timeLatest: Optional time tuple (first six items are used);
                newer activities are skipped.

        Returns:
            A list of dicts with the keys ``username``, ``avatar_url``,
            ``headline``, ``time``, ``actionType``, ``summary``,
            ``targetText``, ``topics``, ``source_url`` and, when images were
            found, ``imgs``. An empty list if the user cannot be resolved.

        About actionType, values include:
            CREATE_ANSWER
            CREATE_ARTICLE
            CREATE_QUESTION
            FOLLOW_QUESTION
            VOTEUP_ANSWER
        """
        def target_details(target):
            """Return (text, topics, source URL) for an activity target."""
            if isinstance(target, zhihu_oauth.Answer):
                return (target.content, target.question.topics,
                        self.url_template_answer %
                        (target.question.id, target.id))
            if isinstance(target, zhihu_oauth.Question):
                return (target.detail, target.topics,
                        self.url_template_question % (target.id))
            if isinstance(target, zhihu_oauth.Article):
                return (target.content, [],
                        self.url_template_article % (target.id))
            return ("", [], "")

        if isinstance(userid, int):
            userid = str(userid)
        backuserid = userid  # what the caller asked for, used in log output
        dtLatest = datetime.datetime(*timeLatest[0:6]) if timeLatest else None
        dtOldest = datetime.datetime(*timeOldest[0:6]) if timeOldest else None

        pp = self.client.people(userid)
        if pp.over:
            if userid not in self.name_map:
                # Best effort: a failed refresh only means the name stays
                # unresolved.
                try:
                    self.followings2name_map(self.me)
                except Exception as e:
                    logging.error(str(e))
            if userid in self.name_map:
                userid = self.name_map[userid]
                pp = self.client.people(userid)
            if pp.over:
                return []

        activityList = []
        for act in pp.activities:
            try:
                # Check the time window before touching the target, so that
                # out-of-window activities are not fetched and the stop
                # condition cannot be skipped by an error on such an item.
                created = time.localtime(act.created_time)
                dt = datetime.datetime(*created[0:6])
                if dtLatest and dtLatest < dt:
                    continue
                if dtOldest and dtOldest > dt:
                    break

                text, topics, source_url = target_details(act.target)
                entry = {
                    'username': pp.name,
                    'avatar_url': pp.avatar_url,
                    'headline': pp.headline,
                    'time': created,
                    'actionType': act.type,
                    'summary': act2str(act),
                    'targetText': text,
                    'topics': [topic.name for topic in topics],
                    'source_url': source_url
                }

                imglist = re.findall(r'(?<=<img src=")(.*?)(?=")', text)
                if isinstance(act.target,
                              zhihu_oauth.Article) and act.target.image_url:
                    # The article's own image goes first.
                    imglist.insert(0, act.target.image_url)
                if imglist:
                    entry['imgs'] = imglist

                activityList.append(entry)
                if len(activityList) >= count:
                    break
            except Exception:
                # One broken activity must not abort the whole listing.
                logging.error("getActivities of " + backuserid + " failed")
                traceback.print_exc()

        return activityList