-
Notifications
You must be signed in to change notification settings - Fork 0
/
says.py
67 lines (57 loc) · 1.96 KB
/
says.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# encoding: utf-8
from zhihu_oauth import ZhihuClient
import jieba
import sys
import csv
import re
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back before the
# process-wide default string encoding is switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Load the user dictionary file dict.txt (current directory) into jieba.
jieba.load_userdict("dict.txt")
# Output CSV: says.csv is opened for binary writing at import time, uses '|'
# as the field delimiter, and is never explicitly closed by this module.
writer = csv.writer(file('says.csv', 'wb'), delimiter='|')
# Header row; every later writerow() call emits fields in this order.
writer.writerow(['sayid', 'ans', 'uid', 'content'])
# Matches any single character that is NOT an ASCII letter or digit.
pattern = re.compile(r'[^a-zA-Z0-9]')
# Stop-word collection; stays None until load_stopwords() assigns it.
stopwords = None
def load_stopwords():
    """Load stop words from ``stopword.txt`` into the global ``stopwords``.

    The file is read from the current working directory, one stop word per
    line.  Only the line terminator (LF or CRLF) is removed from each line;
    other whitespace is preserved, so a line consisting of a single space
    remains a usable stop word.

    Returns:
        None.  The result is stored in the module-level ``stopwords`` list.

    Raises:
        IOError: if ``stopword.txt`` cannot be opened or read.
    """
    global stopwords
    # ``with`` guarantees the handle is closed; previously it was leaked.
    with open('stopword.txt', 'r') as handle:
        # Stripping '\r' as well as '\n' keeps entries from CRLF files
        # comparable to tokens.  The result deliberately stays a list so
        # membership tests keep using plain ``==`` comparison as before.
        stopwords = [line.rstrip('\r\n') for line in handle]
def delete_stopword(tokenizedContent):
    """Return the tokens worth keeping from an iterable of tokens.

    A token is kept when it contains at least one character that is not an
    ASCII letter or digit (as decided by the module-level ``pattern``) and
    it is not listed in the module-level ``stopwords``.  Input order is
    preserved and the result is always a new list.
    """
    kept = []
    for token in tokenizedContent:
        # Purely ASCII-alphanumeric tokens are dropped before the
        # stop-word lookup is even attempted.
        if not pattern.search(token):
            continue
        if token in stopwords:
            continue
        kept.append(token)
    return kept
def login(username, password):
    """Build a ``ZhihuClient`` and log it in via ``login_in_terminal``.

    Args:
        username: account name forwarded to ``login_in_terminal``.
        password: password forwarded to ``login_in_terminal``.

    Returns:
        The ``ZhihuClient`` instance.  The value returned by
        ``login_in_terminal`` is not inspected here.
    """
    session = ZhihuClient()
    # NOTE(review): the outcome of the login call is discarded, so callers
    # cannot tell from the return value whether authentication succeeded.
    session.login_in_terminal(username, password)
    return session
def get_says(item, ans):
    """Write one CSV row describing *item* through the module-level writer.

    Args:
        item: object exposing ``id``, ``author.id`` and ``content``.
        ans: value stored in the ``ans`` column of the row.

    The content is tokenised with jieba, filtered by ``delete_stopword``,
    re-joined with single spaces and encoded as UTF-8 before being written.
    """
    row_id = item.id
    author_id = item.author.id
    words = delete_stopword(jieba.cut(item.content))
    text = ' '.join(words).encode('utf-8')
    # Column order follows the header: sayid, ans, uid, content.
    writer.writerow([row_id, ans, author_id, text])
def get_says_from_comments(item):
    """Write one CSV row per comment attached to *item*.

    Each comment is passed to ``get_says`` with the id of *item* as the
    ``ans`` value, which links the comment row to the item it belongs to.
    """
    for reply in item.comments:
        get_says(reply, item.id)
def get_says_from_answers(item):
    """Write CSV rows for every answer of *item* and for each answer's comments.

    For each answer, the answer's own row is written first (with ``ans`` set
    to 0), immediately followed by the rows for that answer's comments.
    """
    for reply in item.answers:
        get_says(reply, 0)
        get_says_from_comments(reply)
def get_says_from_question(question):
    """Write CSV rows for a question, then its comments, then its answers.

    The question row uses the question id as ``sayid`` and 0 for both the
    ``ans`` and ``uid`` columns.  Its content is the filtered title tokens
    followed by the filtered detail tokens, space-separated and UTF-8
    encoded.
    """
    question_id = question.id
    title_words = delete_stopword(jieba.cut(question.title))
    detail_words = delete_stopword(jieba.cut(question.detail))
    text = (' '.join(title_words) + ' ' + ' '.join(detail_words)).encode('utf-8')
    # Column order follows the header: sayid, ans, uid, content.
    writer.writerow([question_id, 0, 0, text])
    get_says_from_comments(question)
    get_says_from_answers(question)
if __name__ == '__main__':
    # Command line: <script> USERNAME PASSWORD QUESTION_ID
    usage = 'usage: %s USERNAME PASSWORD QUESTION_ID' % sys.argv[0]
    # Exit with a readable message instead of an IndexError traceback when
    # arguments are missing; extra arguments are still ignored as before.
    if len(sys.argv) < 4:
        sys.exit(usage)
    username = sys.argv[1]
    password = sys.argv[2]
    try:
        questionID = int(sys.argv[3])
    except ValueError:
        # The question id must be an integer.
        sys.exit(usage)
    client = login(username, password)
    if client:
        # Stop words must be loaded before any text is filtered.
        load_stopwords()
        question = client.question(questionID)
        get_says_from_question(question)