Beispiel #1
0
def crawling(id):
    """Download every image embedded in the answers of one Zhihu question.

    Images are written into a directory named after the question title
    (created in the current working directory when missing) as
    ``<author>_<n>.<ext>``, where ``n`` is a counter running across all
    answers.  A file that already exists is not downloaded again.

    Args:
        id: Zhihu question id, passed to ``client.question``.
    """
    client = ZhihuClient()
    # Log in by loading a previously saved token file.
    client.load_token('token.pkl')
    question = client.question(id)
    print(u"问题:", question.title)
    print(u"回答数量:", question.answer_count)
    path = question.title
    if not os.path.exists(path):
        os.mkdir(path)
    # Compiled once instead of once per answer.  Group 1 is the image URL,
    # group 2 is its extension.
    img_pattern = re.compile(
        r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
    index = 1  # running image number across all answers
    for i, answer in enumerate(question.answers):
        content = answer.content
        author = answer.author.name
        for img_url, extension in img_pattern.findall(content):
            # Keep the real extension; PNG files used to be saved as ".jpg".
            image_name = '%s_%d.%s' % (author, index, extension)
            target = os.path.join(path, image_name)
            if not os.path.exists(target):
                urllib.request.urlretrieve(img_url, target)
                # While answer i (0-based) is being processed, i answers are done.
                print(u"成功保存第%d张图片:%s,当前总进度%.2f%%" %
                      (index, image_name, i / question.answer_count * 100))
            index += 1
        # Answer i (0-based) is finished, so i + 1 answers are complete.
        print('第%d个答案爬取完成,当前总进度%.2f%%' %
              (i + 1, (i + 1) / question.answer_count * 100))
Beispiel #2
0
    except NeedCaptchaException:
        # 保存验证码并提示输入,重新登录
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
response_file_uri = './question_response.html'  # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构
question_id = 35005800
question = client.question(question_id)
data = question.pure_data
response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

response_file_uri = './people_response.html'  # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构
people_id = '404-Page-Not-found'
people = client.people(people_id)
for i in people.answers:
    data = i.pure_data
    response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"
Beispiel #3
0
# Log in with a previously saved token.
client.load_token('token.pkl')

me = client.me()

# answer = client.answer(94150403)

# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)

# for voter in answer.voters:
#     print(voter.name, voter.headline)

question_number = [20787350]

for q in question_number:
    index = 0
    question = client.question(q)
    print(question.title)

    # Save answers until the position counter exceeds 666 (at most 667
    # answers per question); enumerate replaces the manual counter.
    for index, answer in enumerate(question.answers):
        if index > 666:
            break
        print(answer.author.name, answer.voteup_count)
        answer.save(question.title)
Beispiel #4
0
class Crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname, email, key):
        """Open the SQLite database and sign in to Zhihu.

        Args:
            dbname: Database path handed to ``sqlite3.connect``.
            email: Account name passed to ``ZhihuClient.login``.
            key: Password passed to ``ZhihuClient.login``.
        """
        connection = sqlite3.connect(dbname)
        self.con = connection
        self.cursor = connection.cursor()
        # Only referenced by the disabled token-caching flow kept below.
        TOKEN_FILE = 'token.pkl'
        zhclient = ZhihuClient()
        self.zhclient = zhclient
        try:
            # zhclient.login_in_terminal(email, key)
            zhclient.login(email, key)
        except NeedCaptchaException:
            # A captcha request is only reported; construction still succeeds.
            print("需要输入验证码,账号 %s 可能已失效" % email)
        # if os.path.isfile(TOKEN_FILE):
        #     self.zhclient.load_token(TOKEN_FILE)
        # else:
        #     self.zhclient.login_in_terminal(email, key)
        #     self.zhclient.save_token(TOKEN_FILE)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    #建立数据表
    def createindextables(self):
        self.cursor.execute('create table userinfo(id primary key NOT NULL ,name text,headline text,gender int,address text,business text,school_name text,job text,company text,answer_count int ,question_count int ,voteup_count int ,thanked_count int ,following_count int ,follower_count int ,following_question_count int ,following_topic_count,collected_count int,identity text,best_topics text,is_organization int,org_name text,org_home_page text,org_industry text,record_time text)')
        self.cursor.execute('create table answerinfo(id primary key NOT NULL,content text,author_id int ,voteup_count int,thanks_count int, created_time text,comment_count int,updated_time text,record_time text)')
        self.cursor.execute('create table questioninfo(id primary key NOT NULL,title text,follower_count int ,answer_count int,created_time text,updated_time text,record_time text)')
        self.cursor.execute('create table topicinfo(id primary key NOT NULL,title text,best_answer_count int ,follower_count int ,question_count int,record_time text)')

        self.cursor.execute('create table topic_questions(topic_id ,topic_name text,question_id ,question_title text,record_time text)')
        self.cursor.execute('create table topic_users(topic_id,topic_name text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_users(question_id,question_title text,user_id,user_name text,record_time text)')
        self.cursor.execute('create table question_answers(question_id,question_title text,answer_id,author_id,record_time text)')
        self.cursor.execute('create table user_users(user_id,user_follower_id)')
        self.cursor.execute('create table question_topics(question_id,topic_id,topic_name text,record_time text)')
        self.cursor.execute('create table user_topics(user_id,user_name text,topic_id,topic_name text,record_time text)')

        self.cursor.execute('create index userinfoidx on userinfo(id)')
        self.cursor.execute('create index answerinfoidx on answerinfo(id)')
        self.cursor.execute('create index questioninfoidx on questioninfo(id)')
        self.cursor.execute('create index topicinfoidx on topicinfo(id)')

        self.cursor.execute('create index topic_questionsidx on topic_questions(topic_id,question_id)')
        self.cursor.execute('create index topic_usersidx on topic_users(topic_id,user_id)')
        self.cursor.execute('create index question_usersidx on question_users(question_id,user_id)')
        self.cursor.execute('create index question_answersidx on question_answers(question_id,answer_id)')
        self.cursor.execute('create index user_usersidx on user_users(user_id,user_follower_id)')
        self.cursor.execute('create index question_topicsidx on question_topics(question_id,topic_id)')
        self.cursor.execute('create index user_topicsidx on user_topics(user_id,topic_id)')

        self.dbcommit()

    # #多线程尝试
    # def crawl_data(self,work_set,table1,field1,table2,field2):
    #     if table2 == "userinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.userinfo(subid)
    #     elif table2 == "answerinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.answerinfo(subid)
    #             # time.sleep(0.8)
    #             # time.sleep(0.5)
    #     elif table2 == "questioninfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.questioninfo(subid)
    #     elif table2 == "topicinfo":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topicinfo(subid)
    #     elif table2 == "question_answers":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_answers(subid)
    #     elif table2 == "question_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_topics(subid)
    #     elif table2 == "question_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.question_users(subid)
    #     elif table2 == "topic_questions":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_questions(subid)
    #     elif table2 == "topic_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.topic_users(subid)
    #     elif table2 == "user_users":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_users(subid)
    #     elif table2 == "user_topics":
    #         for subid in work_set:
    #             subid = subid[0]
    #             self.user_topics(subid)
    #     return None


    def justdoit(self,table1,field1,table2,field2):
        set2 =set(self.cursor.execute("select DISTINCT  {} from {}".format(field2,table2)).fetchall())
        set1 = set(self.cursor.execute("select DISTINCT {} from {}".format(field1,table1)).fetchall())
        work_set = set1-set2
        # work_set = list(set1 - set2)
        # splitlen = int(len(work_set) / 2)
        # subwork_set = [work_set[i:i + splitlen] for i in range(0, len(work_set), splitlen)]
        # threads = []
        # for i in range(0,len(subwork_set)):
        #     t = multiprocessing.Process(target=self.crawl_data,args=(subwork_set[i],table1,field1,table2,field2))
        #     threads.append(t)
        # for t in threads:
        #     t.start()
        #     t.join()
        if table2 == "userinfo":
            for subid in work_set:
                subid = subid[0]
                self.userinfo(subid)
        elif table2 == "answerinfo":
            for subid in work_set:
                subid = subid[0]
                self.answerinfo(subid)
                # time.sleep(1.0)
                time.sleep(0.1)
        elif table2 == "questioninfo":
            for subid in work_set:
                subid = subid[0]
                self.questioninfo(subid)
        elif table2 == "topicinfo":
            for subid in work_set:
                subid = subid[0]
                self.topicinfo(subid)
        elif table2 == "question_answers":
            for subid in work_set:
                subid = subid[0]
                self.question_answers(subid)
        elif table2 == "question_topics":
            for subid in work_set:
                subid = subid[0]
                self.question_topics(subid)
        elif table2 == "question_users":
            for subid in work_set:
                subid = subid[0]
                self.question_users(subid)
        elif table2 == "topic_questions":
            for subid in work_set:
                subid = subid[0]
                self.topic_questions(subid)
        elif table2 == "topic_users":
            for subid in work_set:
                subid = subid[0]
                self.topic_users(subid)
        elif table2 == "user_users":
            for subid in work_set:
                subid = subid[0]
                self.user_users(subid)
        elif table2 == "user_topics":
            for subid in work_set:
                subid = subid[0]
                self.user_topics(subid)
        return None

    #话题-(精华)问题关系
    def topic_questions(self,topic_id):
        """Store the questions behind a topic's best answers in ``topic_questions``.

        Each (topic, question) pair is inserted and committed once; pairs
        already in the table are reported and skipped.

        Args:
            topic_id: Id passed to ``self.zhclient.topic``.

        Raises:
            GetDataErrorException: re-raised after being logged.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen_questions = set()
            for best_answer in shield(topic.best_answers,action=SHIELD_ACTION.PASS):
                existing_rowid = self.isdupicaterel("topic_questions", "topic_id", "question_id", topic.id, best_answer.question.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if best_answer.question.id in seen_questions:
                    continue
                seen_questions.add(best_answer.question.id)
                row = (topic.id,topic.name,best_answer.question.id,best_answer.question.title,record_time)
                self.cursor.execute("insert into topic_questions(topic_id,topic_name,question_id,question_title,record_time) VALUES (?,?,?,?,?)" ,row)
                self.dbcommit()
                print("正在处理", best_answer.question.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            # NOTE(review): the sibling relation methods swallow this exception
            # while this one re-raises despite the "Pass" message - confirm intent.
            raise
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #话题-关注者关系
    def topic_users(self,topic_id,start_at = 0):
        """Store the followers of a topic in ``topic_users``.

        Each (topic, follower) pair is inserted and committed once; pairs
        already in the table are reported and skipped.  Both Zhihu exception
        types handled here are logged and swallowed.

        Args:
            topic_id: Id passed to ``self.zhclient.topic``.
            start_at: Forwarded to ``shield`` as ``start_at``.
        """
        try:
            topic = self.zhclient.topic(topic_id)
            record_time = self.logtime()
            seen_users = set()
            for follower in shield(topic.followers,start_at=start_at,action=SHIELD_ACTION.PASS):
                existing_rowid = self.isdupicaterel("topic_users", "topic_id", "user_id", topic.id, follower.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in seen_users:
                    continue
                seen_users.add(follower.id)
                row = (topic.id,topic.name,follower.id,follower.name,record_time)
                self.cursor.execute("insert into topic_users(topic_id,topic_name,user_id,user_name,record_time) VALUES (?,?,?,?,?)" ,row)
                self.dbcommit()
                print("正在处理",topic.name,follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-关注者关系
    def question_users(self, question_id):
        """Store the followers of a question in ``question_users``.

        Each (question, follower) pair is inserted and committed once; pairs
        already in the table are reported and skipped.  Both Zhihu exception
        types handled here are logged and swallowed.

        Args:
            question_id: Id passed to ``self.zhclient.question``.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen_users = set()
            for follower in shield(question.followers,action=SHIELD_ACTION.PASS):
                existing_rowid = self.isdupicaterel("question_users", "question_id", "user_id", question.id, follower.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in seen_users:
                    continue
                seen_users.add(follower.id)
                row = (question.id, question.title, follower.id, follower.name,record_time)
                self.cursor.execute(
                    "insert into question_users(question_id,question_title,user_id,user_name,record_time) VALUES (?,?,?,?,?)", row)
                self.dbcommit()
                print("正在处理",follower.name,question.title)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    # 问题-回答关系
    def question_answers(self, question_id):
        """Store the answers of a question in ``question_answers``.

        Each (question, answer) pair is inserted and committed once; pairs
        already in the table are reported and skipped.  The three Zhihu
        exception types handled here are logged and swallowed.

        Args:
            question_id: Id passed to ``self.zhclient.question``.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            answer_set = set()
            for answer in shield(question.answers):
                status = self.isdupicaterel("question_answers", "question_id", "answer_id", question.id, answer.id)
                if status is not None:
                    print("已存在,正在跳过")
                    continue
                if answer.id in answer_set:
                    continue
                answer_set.add(answer.id)
                values = (question.id, question.title, answer.id, answer.author.id,record_time)
                self.cursor.execute("insert into question_answers(question_id,question_title,answer_id,author_id,record_time) VALUES (?,?,?,?,?)", values)
                self.dbcommit()
                print("正在处理", question.id, question.title, answer.id, answer.author.id)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except ZhihuWarning:
            # The message used to name UnexpectedResponseException (copy-paste),
            # which made the log point at the wrong failure.
            print("Pass the ZhihuWarning")

    #获取用户-用户关注关系,知乎有5020限制,api限制最多获取一个用户5020粉丝
    def user_users(self,user_id):
        """Store the followers of a user in ``user_users``.

        Each (user, follower) pair is inserted and committed once; pairs
        already in the table are reported and skipped.  Both Zhihu exception
        types handled here are logged and swallowed.

        Args:
            user_id: Id passed to ``self.zhclient.people``.
        """
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen_followers = set()
            for follower in shield(people.followers,action=SHIELD_ACTION.PASS):
                existing_rowid = self.isdupicaterel("user_users", "user_id", "user_follower_id", people.id, follower.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if follower.id in seen_followers:
                    continue
                seen_followers.add(follower.id)
                row = (people.id,follower.id,record_time)
                # NOTE(review): this insert needs a record_time column on
                # user_users - confirm the table schema provides one.
                self.cursor.execute("insert into user_users(user_id,user_follower_id,record_time) VALUES (?,?,?)",row)
                self.dbcommit()
                print("正在处理",follower.name)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
    #获取问题-话题关系
    def question_topics(self,question_id):
        """Store the topics of a question in ``question_topics``.

        Each (question, topic) pair is inserted and committed once; pairs
        already in the table are reported and skipped.  Both Zhihu exception
        types handled here are logged and swallowed.

        Args:
            question_id: Id passed to ``self.zhclient.question``.
        """
        try:
            question = self.zhclient.question(question_id)
            record_time = self.logtime()
            seen_topics = set()
            for topic in shield(question.topics):
                existing_rowid = self.isdupicaterel("question_topics", "question_id", "topic_id", question.id, topic.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in seen_topics:
                    continue
                seen_topics.add(topic.id)
                row = (question.id,topic.id,topic.name,record_time)
                self.cursor.execute("insert into question_topics(question_id,topic_id,topic_name,record_time) VALUES (?,?,?,?)",row)
                self.dbcommit()
                print("正在处理", topic.name,question.title)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 获取用户-话题关系
    def user_topics(self, user_id):
        """Store the topics a user follows in ``user_topics``.

        Each (user, topic) pair is inserted and committed once; pairs already
        in the table are reported and skipped.  Both Zhihu exception types
        handled here are logged and swallowed.

        Args:
            user_id: Id passed to ``self.zhclient.people``.
        """
        try:
            people = self.zhclient.people(user_id)
            record_time = self.logtime()
            seen_topics = set()
            for topic in shield(people.following_topics):
                existing_rowid = self.isdupicaterel("user_topics", "user_id", "topic_id", people.id, topic.id)
                if existing_rowid is not None:
                    print("已存在,正在跳过")
                    continue
                if topic.id in seen_topics:
                    continue
                seen_topics.add(topic.id)
                row = (people.id, people.name, topic.id,topic.name, record_time)
                self.cursor.execute(
                    "insert into user_topics(user_id,user_name,topic_id,topic_name,record_time) VALUES (?,?,?,?,?)",
                    row)
                self.dbcommit()
                print("正在处理", people.name ,topic.name)
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
        except GetDataErrorException:
            print("Pass the GetDataErrorException")

    # 判断数据重复
    def isdupicateid(self, table, id):
        cur = self.cursor.execute(
            "select rowid from {} where id = ?".format(table), (id,))
        self.dbcommit()
        res = cur.fetchone()
        res = None if res == None else res[0]
        return res

    def isdupicaterel(self,table,field1,field2, id1,id2):
        cur = self.cursor.execute(
            "select rowid from {} where {}= ? And {} = ?".format(table,field1,field2), (id1,id2))
        res = cur.fetchone()
        self.dbcommit()
        res = None if res == None else res[0]
        return res

    #个人信息
    def userinfo(self,user_id):
        try:
            status = self.isdupicateid("userinfo",user_id)
            if status==None:
                people = self.zhclient.people(user_id)
                record_time = self.logtime()
                address = "|".join([location.name for location in people.locations])
                school_name = "|".join([education.school.name for education in people.educations if "school" in education])
                job = "|".join([employment.job.name for employment in people.employments if "job" in employment])
                company = "|".join([employment.company.name for employment in people.employments if "company" in employment])
                business = people.business.name if people.business else None
                #勋章判断
                if people.badge.has_identity:
                    identity = people.badge.identity
                else:
                    identity = None
                if people.badge.is_best_answerer:
                    best_topics = "".join([topic.name for topic in people.badge.topics])
                else:
                    best_topics = None
                if people.badge.is_organization:
                    is_organization = 1
                    org_name = people.badge.org_name
                    org_home_page = people.badge.org_home_page
                    org_industry = people.badge.org_industry
                else:
                    is_organization = 0
                    org_name = None
                    org_home_page = None
                    org_industry = None
                values = (
                people.id, people.name, people.headline, people.gender, address, business, school_name, job,company,
                people.answer_count, people.question_count, people.voteup_count, people.thanked_count,
                people.following_count, people.follower_count, people.following_question_count,
                people.following_topic_count, people.collected_count, identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time)
                self.cursor.execute(
                    "insert into userinfo(id,name,headline,gender,address,business,school_name,job,company,answer_count,question_count,voteup_count,thanked_count,following_count,follower_count,following_question_count,following_topic_count,collected_count,identity,best_topics,is_organization,org_name,org_home_page,org_industry,record_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                    values)
                self.dbcommit()
                print("正在处理", people.name)
            else:
                print("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass

    def answerinfo(self,answer_id):
        try:
            status = self.isdupicateid("answerinfo", answer_id)
            if status == None:
                answer = self.zhclient.answer(answer_id)
                record_time = self.logtime()
                values = (answer.id,answer.content,answer.author.id,answer.voteup_count,answer.thanks_count,answer.comment_count,answer.created_time,answer.updated_time,record_time)
                self.cursor.execute("insert into answerinfo(id,content,author_id,voteup_count,thanks_count,comment_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理",answer.id)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            self.cursor.execute("delete from question_answers where answer_id = ?",(answer_id,))##在从question_answer表中获取及时删除无效问题,方式切换帐号后反复爬去无效问题。
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #问题信息
    def questioninfo(self,question_id):
        try:
            status = self.isdupicateid("questioninfo", question_id)
            if status == None:
                question = self.zhclient.question(question_id)
                record_time = self.logtime()
                values = (question.id,question.title,question.follower_count,question.answer_count,question.created_time,question.updated_time,record_time)
                self.cursor.execute("insert into questioninfo(id,title,follower_count,answer_count,created_time,updated_time,record_time) VALUES (?,?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理" ,question.title)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #话题信息
    def topicinfo(self,topic_id):
        try:
            status = self.isdupicateid("topicinfo", topic_id)
            if status == None:
                topic = self.zhclient.topic(topic_id)
                record_time = self.logtime()
                values=(topic.id,topic.name,topic.best_answer_count,topic.follower_count,topic.question_count,record_time)
                self.cursor.execute("insert into topicinfo(id,title,best_answer_count,follower_count,question_count,record_time) VALUES (?,?,?,?,?,?)",values)
                self.dbcommit()
                print("正在处理", topic.name)
            else:
                return ("重复,rowid",status)
        except GetDataErrorException:
            print("Pass the GetDataErrorException")
            pass
        except UnexpectedResponseException:
            print("Pass the UnexpectedResponseException")
            pass
    #时间戳
    def logtime(self):
        fmt = '%Y-%m-%d'  # 定义时间显示格式
        Date = time.strftime(fmt, time.localtime(time.time()))
        return Date


    def add_counts(self,filepath = "logincounts.txt"):
        counts = []
        for line in open(filepath):
            count = {}
            count["count"], count["key"] = line.split("----")
            count["key"] = count["key"].strip("\n")
            counts.append(count)
        return counts

    def get_proxy(self):
        """Ask the local proxy pool for a proxy address.

        Returns:
            The response body of ``http://localhost:5000/get`` when the pool
            answers with HTTP 200, otherwise ``None`` (also when the pool
            cannot be reached).
        """
        PROXY_POOL_URL = 'http://localhost:5000/get'
        try:
            response = requests.get(PROXY_POOL_URL)
        except (ConnectionError, requests.exceptions.ConnectionError):
            # requests raises its own ConnectionError, which a builtin
            # ConnectionError does not match; naming both keeps the handler
            # effective whichever name is in scope.
            return None
        if response.status_code == 200:
            return response.text
        return None
import pymysql
from zhihu_oauth import ZhihuClient
from getAnswer import getAnswer
from getUser import getUser

# Log in: reuse the saved token when present, otherwise log in interactively
# and save the token for the next run.
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()
if not os.path.isfile(TOKEN_FILE):
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

# Question object for the given question id.
question = client.question(67079761)

if __name__ == '__main__':
    connection = pymysql.connect(host='localhost',
                                 port=3306,
                                 user='******',
                                 password='******',
                                 db='zhihu_live',
                                 charset='utf8mb4')
    # One user table and one answer table per question, named after its id.
    u_table = '{}_user'.format(question.id)
    a_table = '{}_ans'.format(question.id)
    # DDL and matching positional insert statement for the user table.
    cre_utable = 'create table IF NOT EXISTS %s (uid VARCHAR (50),name VARCHAR (20),gender VARCHAR (10),headline VARCHAR (400),description VARCHAR (1000),que_count INT ,ans_count INT ,art_count INT ,column_ INT ,column_fol_sum INT ,collection INT ,coll_ans_sum INT ,coll_fol_sum INT ,voteup INT ,thanks INT ,collected INT ,shared INT ,art_vote_sum INT ,following INT ,follower INT ,fol_column INT ,fol_topic INT ,fol_topic_name MEDIUMTEXT,fol_ques INT ,location VARCHAR (200),business VARCHAR (50),school VARCHAR (200),major VARCHAR (200),company VARCHAR (200),job VARCHAR (200), avatar VARCHAR (10),avatar_url VARCHAR (100),weibo VARCHAR (10),weibo_name VARCHAR (50),weibo_url VARCHAR (50), give_ans_vote INT, give_art_vote INT, ans_id INT ,que_title VARCHAR (200))' % u_table
    ins_utable = 'insert into ' + u_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    # DDL and matching positional insert statement for the answer table.
    cre_atable = 'create TABLE IF NOT EXISTS %s (ans_ques VARCHAR (200),que_id INT ,ans_id INT ,ans_auth VARCHAR (20),ans_cont MEDIUMTEXT ,ans_vote INT ,ans_than INT ,ans_comm INT ,com_perm VARCHAR (20) ,cre_timestamp VARCHAR (30),upd_timestamp VARCHAR (30),cre_time VARCHAR (30),upd_time VARCHAR (30))' % a_table
    ins_atable = 'insert into ' + a_table + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
Beispiel #6
0
# Alternative that samples random question ids instead of the fixed list:
# for _ in range(1000):
#     num = random.randint(20000000, 39999999)
for num in [
        40023941, 36582119, 23434853, 37027323, 39124944, 22345285, 26992616,
        28066166, 41035200, 21396519, 35947787, 36851579, 21372989, 47955389,
        37236484, 19861023, 25877081, 27063206, 29166103, 23246914, 38540397,
        36543921, 32158092, 41207814, 41404094, 36734444, 31819473, 29336768,
        32171411, 37184080, 20468104, 36238122, 36573907, 23415802, 30605806,
        37737298, 37059032, 48837193, 48296279, 41053015, 22978737, 22621327,
        42082026, 30470093, 41038770, 21155222, 28489148, 32081129, 32369239,
        30830614, 29213441, 41113819, 36770197, 48831736, 35990525, 48779414,
        22364486, 33032798, 29604768, 21900376, 26500277
]:
    n = 0
    # "except Exception" replaces the bare "except:" clauses so Ctrl-C and
    # SystemExit are no longer swallowed; a failing question is still only
    # reported and skipped.
    try:
        question = client.question(num)
        # question = client.from_url('https://www.zhihu.com/question/35166763')
        print(question.title)
        with open("name.txt", 'a', encoding='utf-8') as f:
            # n counts the answers seen for the current question.
            for n, answer in enumerate(question.answers, 1):
                try:
                    # Skip anonymous and reset accounts.
                    if answer.author.name != "匿名用户" and answer.author.name != "[已重置]":
                        print(answer.author.name)
                        f.write(answer.author.name + '\n')
                except Exception:
                    # Best effort: one unreadable author must not stop the run.
                    pass
                print(n)
    except Exception:
        print("空")
Beispiel #7
0
    35367500, 35210878, 35134422, 35062190, 35012924, 35004585, 34893663,
    34370944, 34225657, 33488763, 33259890, 32207070, 31592568, 31365240,
    31337752, 30966406, 30158223, 29735498, 29582607, 29550579, 29525971,
    29519716, 29518811, 29511036, 29508808, 29448162, 27255630, 25951351,
    23863606, 19930380
]
# Only the question ids from position 200 onwards are crawled in this run.
qid_run = qid_remaining[200:]

rows = []       # (answer id, question id, voteup count, comment count)
fail_qid = []   # question ids whose crawl raised

counter = 1     # 1-based position of the question being processed

for qid in qid_run:
    try:
        ans = client.question(int(qid)).answers
        for v in ans:
            rows.append((v._id, qid, v.voteup_count, v.comment_count))
    except Exception:
        # "except Exception" instead of a bare "except:" so Ctrl-C still
        # stops the run.  A failed question id is remembered for a retry.
        fail_qid.append(qid)
        print("fail {0}".format(counter))
        print(fail_qid)
    else:
        print("success {0}".format(counter))
    # Incremented once per question on both paths.
    counter += 1

output_file = "./answers.csv"
headers = ["aid", "qid", "voteup_count", "comment_count"]
with open(output_file, 'a') as f:
Beispiel #8
0
    except NeedCaptchaException:
        # 保存验证码并提示输入,重新登录
        print u'登录失败,需要输入验证码'
        with open('a.gif', 'wb') as f:
            f.write(client.get_captcha())
        captcha = raw_input(u'please input captcha:')
        login_result = client.login(test_email, test_password, captcha)
    print 'login result => '
    print login_result
    client.save_token(token_file)
    print 'save token success'

# question
response_file_uri = './question_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构
question_id = 35005800
question = client.question(question_id)
data = question.pure_data
response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"

response_file_uri = './people_response.html' # 将json输出到网页中,chrome下按F12选preview能看见浏览器渲染出的json数据结构
people_id = '404-Page-Not-found'
people = client.people(people_id)
for i in people.answers:
    data = i.pure_data
    response_json = json.dumps(data)
response_file = open(response_file_uri, 'w+')
response_file.write(response_json)
print u"数据保存完成"
Beispiel #9
0
# Reuse the saved token when present, otherwise log in interactively and
# save the token for the next run.
if os.path.isfile(TOKEN_FILE):
    client.load_token(TOKEN_FILE)
    print('login success!')
else:
    client.login_in_terminal()
    client.save_token(TOKEN_FILE)

# # Answer details
# answer = client.answer(94150403)
#
# print(answer.question.title)
# print(answer.author.name)
# print(answer.voteup_count)
# print(answer.thanks_count)
# print(answer.created_time)
# print(answer.updated_time)
#
# for voter in answer.voters:
#     print(voter.name, voter.headline)

question = client.question(35166763)

print(question.title)

# Build the directory with os.path.join; the hard-coded backslashes only
# formed a nested path on Windows.
save_dir = os.path.join('Data', 'Answers', question.title)

# Save the first ten answers only.
count = 0
for count, answer in enumerate(question.answers, 1):
    answer.save(save_dir)
    if count == 10:
        break
Beispiel #10
0
class ZhiHu(object):
    """Helper that pulls Zhihu data through ``ZhihuClient`` and stores it in SQLite.

    Construction logs in to Zhihu and opens the ``zhihu.db`` database via
    ``EasySqlite``.
    """

    # Token file reused across runs so the interactive login is only needed once.
    TOKEN_FILE = 'token.pkl'

    def __init__(self):
        """
        Log in to Zhihu, then open the ``zhihu.db`` database.
        """
        self.login_zhihu()
        self.db = EasySqlite('zhihu.db')

    def login_zhihu(self):
        """
        Create ``self.client`` and authenticate it.

        Loads the token from ``TOKEN_FILE`` when that file exists; otherwise
        logs in through the terminal and saves the new token to ``TOKEN_FILE``.
        :return: None
        """
        self.client = ZhihuClient()
        if os.path.isfile(self.TOKEN_FILE):
            self.client.load_token(self.TOKEN_FILE)
        else:
            self.client.login_in_terminal()
            self.client.save_token(self.TOKEN_FILE)

    def save_quesions(self, topic_id):
        """
        Store the unanswered questions of a topic in the ``questions`` table.

        Questions with fewer than 10 answers are skipped. Each stored row is
        printed, followed by the outcome of the write.
        :param topic_id: id of the topic whose questions are stored
        :return: None
        """
        topic = self.client.topic(topic_id)
        print(topic)
        questions = topic.unanswered_questions
        sql_tmp = 'replace into questions values(?,?,?,?,?,?)'
        for question in questions:
            if question.answer_count < 10:
                continue
            row = [question.id, question.title, question.follower_count,
                   question.answer_count, question.comment_count, topic_id]
            print(row)
            # A falsy result from the database helper is reported as a failure.
            if self.db.update(sql_tmp, args=row):
                print('insert success!')
            else:
                print('insert error!')

    def save_answer_info(self, question_id):
        """
        Print a summary of the first answer of a question and save that answer.

        :param question_id: id of the question to inspect
        :return: None
        """
        question = self.client.question(question_id)
        print(question.title)
        answers = question.answers
        for answer in answers:
            print(answer.comment_count, answer.excerpt, answer.question, answer.thanks_count,
                  answer.voteup_count)
            answer.save()
            # NOTE(review): only the first answer is handled; nothing is
            # written to the database here yet.
            break

    def to_md(self, topic, file_name):
        """
        Write the stored questions of a topic to a Markdown file.

        Up to 1000 questions are listed, ordered by follower count
        (descending), one numbered link per line.
        :param topic: topic id used to select rows from ``questions``
        :param file_name: path of the Markdown file to (over)write
        :return: None
        """
        # NOTE(review): `topic` is interpolated straight into the SQL text.
        # If it can ever come from untrusted input, switch to a parameterised
        # query once the signature of `EasySqlite.query` is confirmed.
        sql = "select * from questions where topic_id = '%s' order by follower_count desc limit 1000" % topic
        ret = self.db.query(sql)
        line_tmp = "%s. [%s](https://www.zhihu.com/question/%s) 关注数:%s 回答数:%s 评论数:%s<br>\n"
        with open(file_name, 'w', encoding='utf8') as f:
            # enumerate() supplies the 1-based list number.
            for i, item in enumerate(ret, start=1):
                f.write(line_tmp % (i, item['title'], item['id'], item['follower_count'],
                                    item['answer_count'], item['comment_count']))
Beispiel #11
0
# Ids of the questions that already have a folder on disk: every entry name
# is parsed as "<question id>#<rest>".
path = r"D:\Kuangyichen\Repository_py3\Zhihu\Data\Gene"
download = [int(entry.partition('#')[0]) for entry in os.listdir(path)]

# Question ids that are wanted but not downloaded yet.
out = set(questions).difference(download)

print(len(out))
# NOTE(review): the login and the whole download below sit inside this loop,
# so they run once per missing id — looks like an indentation slip; confirm.
for i in out:
    print(i)

    TOKEN_FILE = 'token.pkl'
    client = ZhihuClient()

    # Reuse the cached token, or log in interactively and cache the new one.
    if not os.path.isfile(TOKEN_FILE):
        client.login_in_terminal()
        client.save_token(TOKEN_FILE)
    else:
        client.load_token(TOKEN_FILE)

    # One question id per line in the "lefted" file.
    path = r'D:\Kuangyichen\Repository_py3\Zhihu\Data\lefted'
    with open(path, 'r', encoding='UTF8') as Reader:
        questions = [int(line) for line in Reader]

    for q in questions:
        question_t = client.question(q)
        print("{}start".format(q))
        for answer in question_t.answers:
            print(answer.author.id, answer.author.name)
            # Target folder is "<question id>#<title>", file name is
            # "<author id>#<author name>".
            target_dir = 'Data\\Gene\\' + str(question_t.id) + '#' + question_t.title
            target_name = str(answer.author.id) + '#' + answer.author.name
            answer.save(target_dir, target_name)
        print("{}end".format(q))
Beispiel #12
0


client = ZhihuClient()
# Login section: try a plain login first and fall back to a captcha login.
try:
    client.login(ZHIHU_ID, ZHIHU_KEY)
except NeedCaptchaException:
    # Save the captcha image to a.gif, ask the user to type it in, then log in again.
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(ZHIHU_ID, ZHIHU_KEY, captcha)


# Question whose answers are exported below.
the_question = client.question(QUESTION_ID)


print(the_question.title)



# NOTE(review): `a` is not used in the visible part of this script — confirm it is needed.
a = 0


# Export to all_answers.txt: first an '&'-separated header line naming the columns.
# NOTE(review): the file is opened with the platform default encoding although the
# header is Chinese text, and the header has no trailing newline — confirm both.
with open('all_answers.txt', 'w') as f_txt:
    f_txt.write("昵称&用户关注数&用户粉丝数&用户回答数&用户获赞数&用户获得感谢数&用户性别&用户学校&用户学院&回答内容(去标点符号)&回答日期&最后修改日期&赞数&感谢数&评论数&是否允许评论&是否被建议修改&回答可信度指数&情感分析积极性&情感分析消极性&情感倾向&回答内容")
    # Walk every answer of the question and read its author's details.
    for the_answer in the_question.answers:

        the_author = the_answer.author
        author_name = the_author.name
from lxml import html
import requests, time, zhihu_oauth

start_time = time.time()  # initial timestamp

# ======================== Login ========================
from zhihu_oauth import ZhihuClient

client = ZhihuClient()
client.load_token('/Users/alicewish/我的坚果云/token.pkl')

# ======================== Query question ========================
qid = 48217184
question = client.question(qid)

# (label, attribute name) pairs; each attribute is read and printed in this
# exact order.
for label, attr_name in (
    ('允许删除', 'allow_delete'),
    ('答案数', 'answer_count'),
    ('答案', 'answers'),
    ('评论数', 'comment_count'),
    ('评论', 'comments'),
    ('细节', 'detail'),
    ('摘录', 'excerpt'),
    ('关注数', 'follower_count'),
    ('关注人', 'followers'),
    ('问题ID', 'id'),
    ('重定向', 'redirection'),
    ('状态', 'status'),
    ('建议修改', 'suggest_edit'),
    ('标题', 'title'),
    ('话题', 'topics'),
):
    print(label, getattr(question, attr_name))

# The update time is a timestamp; show it as a local date-time string.
updated_local = time.localtime(question.updated_time)
print('更新时间', time.strftime("%Y-%m-%d %H:%M:%S", updated_local))
Beispiel #14
0
TOKEN_FILE = 'token.pkl'
client = ZhihuClient()

# Authenticate: without a cached token do a password login (with a captcha
# retry) and cache the token; otherwise just load the cached token.
if not os.path.isfile(TOKEN_FILE):
    try:
        client.login('email_or_phone', 'password')
    except NeedCaptchaException:
        # Write the captcha image to disk so the user can read it, then retry.
        with open('a.gif', 'wb') as captcha_image:
            captcha_image.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login('email_or_phone', 'password', captcha)
    client.save_token(TOKEN_FILE)
else:
    client.load_token(TOKEN_FILE)

question = client.question(int(question_id))
print(question.title)

# Workbook with a single sheet; row 1 holds the column titles.
wb = Workbook()
sheet = wb.active
sheet.title = "知乎"
item_name = [
    'time_now',
    'content',
    'author',
    'gender',
    'loc',
    'business',
    'company',
    'job',
    'created_time',
    'updated_time',
    'voteup_count',
    'comment_count',
    'thanks_count',
]
for j, title in enumerate(item_name):
    header_cell = sheet.cell(row=1, column=j + 1)
    header_cell.value = title

# One record per answer, starting with the moment it was collected.
num = 0
for answer in question.answers:
    num += 1
    collected_at = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    item_data = [collected_at]
Beispiel #15
0
# @Email  : [email protected]
# @File   : save_images.py
'''
@Description:保存知乎某个问题下所有答案的图片
'''
from __future__ import print_function  # 使用python3的print方法
from zhihu_oauth import ZhihuClient
import re
import os
import urllib.request

client = ZhihuClient()
# Log in by loading a previously saved token file.
client.load_token('token.pkl')
id = 24400664  # https://www.zhihu.com/question/24400664 ("What is it like to be good-looking?")
question = client.question(id)
print(u"问题:", question.title)
print(u"回答数量:", question.answer_count)
# Folder that receives the images. Creation is skipped when it already exists
# so that re-running the script does not abort in os.mkdir().
path = question.title + u"(图片)"
if not os.path.isdir(path):
    os.mkdir(path)
index = 1  # image sequence number, used as the file name
# Compiled once up front: the pattern is the same for every answer.
re_compile = re.compile(
    r'<img src="(https://pic\d\.zhimg\.com/.*?\.(jpg|png))".*?>')
for answer in question.answers:
    content = answer.content  # answer content
    # The pattern has two groups, so findall() yields (url, extension) tuples.
    img_lists = re.findall(re_compile, content)
    if (img_lists):
        for img in img_lists:
            img_url = img[0]  # image url
            urllib.request.urlretrieve(img_url, path + u"/%d.jpg" % index)
            # Advance the counter so each image gets its own file instead of
            # overwriting the previous download.
            index += 1
Beispiel #16
0
# One collection per question inside the 'Zhihu' database, named after the question id.
# NOTE(review): presumably dbClient is a MongoDB client — confirm.
Zhihu = dbClient['Zhihu']
ZhihuData = Zhihu[str(questionID)]
# Clear whatever an earlier run stored in this collection.
# NOTE(review): if find() returns a cursor object, this condition is likely
# always true — confirm.
if ZhihuData.find():
        ZhihuData.remove({})
# Log in to the Zhihu account
client = ZhihuClient()
try:
    client.login(account, passwd)
except NeedCaptchaException:
    # Save the captcha image, ask the user to type it in, then log in again
    with open('a.gif', 'wb') as f:
        f.write(client.get_captcha())
    captcha = input('please input captcha:')
    client.login(account, passwd, captcha)
# Create the question object
question = client.question(questionID)
# Read every answer of the question and store it
print(question.title)
count = 0
for answer in question.answers:
    count+=1
    try:
        # Record built for the current answer.
        data = {
            'title':question.title,
            'author':answer.author.name,
            'description':answer.author.description,
            'content':answer.content,
            'voteup':answer.voteup_count,
            'thanks':answer.thanks_count
        }
        print("正在保存第%s个回答" %count)
Beispiel #17
0
# tableNum = mysql.cur.execute("SELECT concat('DROP TABLE IF EXISTS ', table_name, ';') ""FROM information_schema.tables WHERE table_schema = 'urldata';")
# if tableNum == 0:
#     print("该数据库下所有表删除完毕\n---------------------------------------")
# else:
#     print("删除失败")
#
# Create the tables in the database (urldata)
try:
    # Table holding one row per answer.
    mysql.cur.execute(
        "create table answer(answer_id int(4),author_name varchar(40),author_healine varchar(40),agree_num int(4),comment_num int(4),thanks_count int(4), url varchar(100))"
    )
    # Table holding the comments attached to an answer.
    mysql.cur.execute(
        "create table comments(currentanswer_id int(4),commentID int(4), commentpersonName varchar(40), words varchar(300))"
    )
    # Crawl all answers of the question at the top of the Zhihu hot list // 5G licence issuance
    question = client.question(328058110)
    for answer in question.answers:
        try:
            # NOTE(review): the statement is assembled with % formatting, so a
            # quote character in the author name or headline breaks it;
            # consider a parameterised execute() if the cursor supports it.
            mysql.cur.execute(
                "insert into answer values(%d,'%s','%s',%d,%d,%d,'%s')" %
                (answer.id, answer.author.name, answer.author.headline,
                 answer.voteup_count, answer.comment_count,
                 answer.thanks_count, answer._build_url()))
        except BaseException:
            # Any failure just skips this answer. BaseException also covers
            # KeyboardInterrupt and SystemExit.
            print("a answer is nelected")
    # Second pass: print the raw data of every answer.
    question = client.question(328058110)
    for answer in question.answers:
        print(answer.pure_data)
    # Crawl all comments of the Zhihu hot-list question
    question = client.question(328058110)
    for answer in question.answers:
Beispiel #18
0
def time2str(timeStamp):
    """Format a Unix timestamp as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timeStamp))


def ucps2str(ucpstr):
    '''Convert unicode code point (in hex) ascii string to unicode string.

    The input is read as consecutive groups of four hex digits, each group
    being one code point (e.g. '4e2d6587' -> two characters). A trailing
    group shorter than four characters is ignored.

    :param ucpstr: string of concatenated 4-digit hex code points
    :return: the decoded string ('' for empty input)
    :raises ValueError: if a group is not valid hexadecimal
    '''
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() handles every code point
    # Floor division keeps the group count an int on Python 3, where `/`
    # would give a float and make range() raise TypeError.
    complete = len(ucpstr) // 4 * 4
    return ''.join(
        to_char(int(ucpstr[pos:pos + 4], 16))
        for pos in range(0, complete, 4))


question = client.question(20840874)  # "What purchases improved your quality of life and happiness so much that you wish you had found them sooner?"

# Iterating the question's `answers` generator attribute gives each answer; its `author` (answer.author) then gives access to the answerer's profile information
# for answer in question.answers:
#     print(answer.author.name+ str(answer.author.answer_count) + ' ' + str(answer.voteup_count))
#     print(answer.author.id)  # why does the id look like this? 10fc5f92b8f7f7cd1a058d10a0f36ce0

# for answer in question.answers:
#     # print('id',ucps2str(answer.author.id),answer.author.id.decode('hex'),binascii.unhexlify(answer.author.id))
#     print('id', answer.author.i)
#     print('uid',answer.author.uid)
#     print('name', answer.author.name)
#     print('gender', answer.author.gender)
#     print('headline', answer.author.headline)
#     print('description', answer.author.description)
#     print('\n')