zhihuclient.py (forked from lyyyuna/zhihu_lundaiguang)

from httpclient import HttpClient
import asyncio
from bs4 import BeautifulSoup
import keywords
import config
import html
import os

ZHIHU_URL = 'https://www.zhihu.com'
LOGIN_URL = ZHIHU_URL + '/login/email'
VCZH_URL = ZHIHU_URL + '/people/excited-vczh'


class ZhihuClient():
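    """Crawl one Zhihu user's upvote activity, keep the answers whose comment
    threads match the configured keywords, and download the images found in
    those answers.
    """
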
    def __init__(self, client, email, password):
        self._session = client
        self._client = HttpClient(client)
        self._email = email
        self._password = password
        self._finish = False
        self._commenttime = '1970'
        self._imgurl = asyncio.Queue()
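
    # Log in with email/password, then request the home page so the _xsrf
    # cookie is available; the token is used by the activity-feed POSTs below.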
    async def _login(self):
        data = {'email': self._email, 'password': self._password, 'remember_me': 'true'}
        dic = await self._client.post_json(LOGIN_URL, data=data)
        await self._client.get(ZHIHU_URL)
        self._xsrf = self._session.cookies['_xsrf'].value
        print(dic['msg'])
        print(self._xsrf)
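
    # Page through the profile's activity feed (20 entries per request), keep
    # only "member_voteup_answer" items, and queue the image URLs of answers
    # whose comments pass the keyword check; a 'the end' sentinel marks the end
    # of the feed for the downloader.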
    async def crawl_voteup_answer(self):
        await self._login()
        feed_num = 20
        start = '0'
        api_url = VCZH_URL + '/activities'
        while feed_num == 20:
            data = {'_xsrf': self._xsrf, 'start': start}
            dic = await self._client.post_json(api_url, data=data)
            if dic is None:
                print('Network error while fetching more activities')
                continue
            feed_num = dic['msg'][0]
            soup = BeautifulSoup(dic['msg'][1], 'html.parser')
            acts = soup.find_all('div', class_='zm-profile-section-item zm-item clearfix')
            start = acts[-1]['data-time'] if len(acts) > 0 else 0
            for act in acts:
                # Only look at upvotes; ignore other activities such as "answered" or "followed"
                if act.attrs['data-type-detail'] != "member_voteup_answer":
                    continue
                # Build the URL of the answer's comment list
                comment_div = act.find_all('div', class_='zm-item-answer')
                if not comment_div:
                    continue
                comment_link = comment_div[0]['data-aid']
                comment_url = ZHIHU_URL + '/r/answers/' + comment_link + '/comments'
                # Use the keywords in the comments to decide whether this answer is a target
                HIT = await self._analyze_comments(comment_url)
                await asyncio.sleep(config.comment_interval)
                if not HIT:
                    continue
                # Grab the answer body
                answer = act.find_all('textarea', class_='content hidden')
                if not answer:
                    continue
                answer[0] = html.unescape(answer[0].get_text())
                # Pull the image links out of the answer
                soup2 = BeautifulSoup(answer[0], 'html.parser')
                img_urls = soup2.find_all('img')
                for img in img_urls:
                    await self._imgurl.put(img.attrs['src'])
            await asyncio.sleep(config.more_interval)
            print('more...')
        await self._imgurl.put('the end')
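
    # Fetch one answer's comment list and report a hit when at least two
    # keyword matches are found; the timestamp of the scanned comments is
    # recorded for the progress monitor.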
    async def _analyze_comments(self, url):
        dic = await self._client.get_json(url)
        if dic is None:
            print('Network error while fetching comments')
            return False
        data = dic['data']
        count = 0
        for comment in data:
            self._commenttime = comment['createdTime']
            for keyword in keywords.keywords:
                if comment['content'].find(keyword) != -1:
                    count += 1
                    if count >= 2:
                        return True
        return False
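
    # Consume image URLs from the queue and save them as ./img/<n>.jpg, sleeping
    # config.img_interval seconds between downloads; the sentinel ends the loop.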
    async def download_image(self):
        if not os.path.exists('img'):
            os.makedirs('img')
        self._count = 1
        while True:
            url = await self._imgurl.get()
            if url == 'the end':
                print('Download finished')
                self._finish = True
                break
            print('Downloading image %s ...' % self._count)
            await self._client.downloadfile(url, './img/' + str(self._count) + '.jpg')
            await asyncio.sleep(config.img_interval)
            self._count += 1
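
    # Every 20 seconds, report how many images are still queued and roughly how
    # far along the upvote timeline the crawler has analyzed.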
    async def monitor(self):
        while True:
            if self._finish:
                break
            print()
            print('Images still queued for download: %s.' % self._imgurl.qsize())
            print('Approximate time of the upvotes analyzed so far: ' + self._commenttime)
            print()
            await asyncio.sleep(20)