Beispiel #1
0
def main():
    """Build a ``User`` for a hard-coded id and call its ``get_user_update()``.

    The unused locals ``name`` and ``type`` (the latter shadowed the builtin)
    were only referenced by the disabled search call noted below, so they
    have been dropped.
    """
    # Disabled alternative kept for reference:
    #   Search().get_person_detail('people', 'xxx')
    userid = 'dong-yuan-18'
    user = User(userid)
    user.get_user_update()
Beispiel #2
0
def user_test(user_url):
    """Call every read accessor of ``User`` for ``user_url`` and print the results.

    All values are fetched first and printed afterwards, one per line, in
    the order in which they were fetched.
    """
    profile = User(user_url)

    # Scalar profile fields. Trailing comments show the sample output that
    # was recorded for the demo account.
    scalar_values = [
        profile.get_user_id(),          # user id (sample: a display name)
        profile.get_followers_num(),    # number of followers, 614840
        profile.get_followees_num(),    # number of users followed, 8408
        profile.get_asks_num(),         # number of questions asked, 1323
        profile.get_answers_num(),      # number of answers written, 786
        profile.get_collections_num(),  # number of collections, 44
        profile.get_agree_num(),        # number of upvotes received, 46387
        profile.get_thanks_num(),       # number of thanks received, 11477
    ]

    # Iterable results. The recorded sample output shows generator objects,
    # e.g. <generator object get_followee at 0x7ffcac3af050>.
    iterable_values = [
        profile.get_followees(),    # users this user follows
        profile.get_followers(),    # users following this user
        profile.get_asks(),         # questions asked by this user
        profile.get_answers(),      # answers written by this user
        profile.get_collections(),  # collections owned by this user
    ]

    for value in scalar_values:
        print(value)

    # Printing the iterables shows their repr; nothing is consumed here.
    for value in iterable_values:
        print(value)
Beispiel #3
0
 def __init__(self, user_uuid, layer):
     """Load the profile for ``user_uuid`` and store its headline numbers.

     Followee ids are only collected while ``layer`` is below 3; otherwise
     an empty list is stored.
     """
     profile = User(prefix_people + user_uuid)

     def strip_prefix(followee):
         # Reduce a followee's profile URL to the part after prefix_people.
         return followee.user_url.replace(prefix_people, "")

     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()
     if layer < 3:
         self.followees = map(strip_prefix, profile.get_followees())
     else:
         self.followees = []
     # Followers are intentionally not collected:
     # self.followers = map(lambda x: x.user_url.replace(prefix_people, ""), user.get_followers())
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Beispiel #4
0
def crawler(user_info):
    """Crawl the activities of one user and pass the record to the writer.

    Args:
        user_info: dict with an 'id' entry (appended to the profile URL) and
            an 'activity' list. Crawled activities are appended to that
            list and 'error' is set before the dict is handed to
            ``pass_to_writer``.

    A ``ConnectionError`` is reported and the crawl is retried, resuming from
    the last activity already collected. Any other exception is stored in
    ``user_info['error']``, the record is still passed to the writer, and
    the user is skipped.
    """
    url = "http://www.zhihu.com/people/" + user_info['id']

    # Retry in a loop rather than by recursion, so that a long run of
    # connection failures cannot exhaust the call stack.
    while True:
        try:
            user = User(url)

            proxy = 'None'
            #proxy = proxy_apply()
            user_agent = random.choice(user_agent_pool)

            # Resume after the most recent activity when some were already
            # collected (e.g. by an earlier, interrupted attempt).
            if user_info['activity']:
                start_time = user_info['activity'][-1]
            else:
                start_time = "beginning"

            print(
                termcolor.colored("start crawling ", "green") +
                termcolor.colored(url, "blue") + termcolor.colored(
                    "\nproxy:" + proxy + '    start from ' + start_time +
                    '\n', "green"))

            # Crawl the user's activity feed.
            for activity in user.get_activities(proxy, user_agent, start_time):
                user_info['activity'].append(activity)

            user_info['error'] = ''
            #print(url + "finished crawling")
            pass_to_writer(user_info)
            #proxy_recycle(proxy)
            amount_of_finished_users.value += 1
            print(
                termcolor.colored(
                    url + ' finished    ' +
                    str(amount_of_finished_users.value) + "/" +
                    str(amount_of_users.value) + "\n", "green"))
            return

        except ConnectionError as e:
            print(termcolor.colored("Warning: " + str(e), "yellow"))
            print("Reconnecting for " + termcolor.colored(url, "blue") + "\n")
            continue

        except Exception as e:
            # Best effort: record the failure and move on to the next user.
            user_info['error'] = str(e)
            pass_to_writer(user_info)

            traceback.print_exc()
            print(termcolor.colored("Error: " + str(e), "red"))
            print("skip " + termcolor.colored(url, "blue"))
            print('\n')
            return
Beispiel #5
0
 def __init__(self, user_uuid, layer):
     """Load the profile for ``user_uuid`` and store its headline numbers.

     Args:
         user_uuid: the unique id of the user
         layer: the number of hops to reach this user from the seed user;
             followee ids are only collected while it is below 3
     """
     profile = User(prefix_people + user_uuid)

     def strip_prefixes(followee):
         # Remove both URL prefixes so only the followee's id remains.
         without_prefix = followee.user_url.replace(prefix_people, "")
         return without_prefix.replace(prefix_people_http, "")

     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()
     if layer < 3:
         self.followees = map(strip_prefixes, profile.get_followees())
     else:
         self.followees = []
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
def crawl_id():
    """Look up the user id of every user in the input file and save the map.

    Each line of ``data/available_users`` is evaluated to a dict with 'id'
    and 'index' entries. The result of ``get_user_id()`` is stored under
    the user's index, and the whole mapping is written as ``str(dict)`` to
    ``data/user_name``. A failed lookup is reported and skipped.
    """
    input_path = 'data/available_users'
    output_path = 'data/user_name'
    zhihu_url = 'http://www.zhihu.com/people/'

    z_id = {}

    with open(input_path, 'r') as users:
        for line in users:
            # NOTE(review): eval() executes whatever the file contains. If
            # the lines are plain dict literals, ast.literal_eval would be
            # the safe replacement -- confirm the file format first.
            user = eval(line)
            z_user = User(zhihu_url + user['id'])
            try:
                z_id[user['index']] = z_user.get_user_id()
            except Exception as e:
                # The previous handler evaluated z_user[user['index']],
                # i.e. subscripted the User object, which looks unintended
                # and would raise inside the handler unless User supports
                # indexing. Report the failing index and the error instead.
                print(user['index'], e)
            else:
                print(user['index'])

    with open(output_path, 'w') as out:
        out.write(str(z_id))
Beispiel #7
0
    user_collections = user.get_collections()
    for collection in user_collections:
        # 输出每一个收藏夹的名字
        print collection.get_name()
        # 得到该收藏夹下的前十个回答
        top_answers = collection.get_top_i_answers(10)
        # 把答案内容转成txt,markdown
        for answer in top_answers:
            answer.to_txt()
            answer.to_md()


def main():
    """Run each demo function in turn against a hard-coded Zhihu URL."""
    question_url = "http://www.zhihu.com/question/24269892"
    answer_url = "http://www.zhihu.com/question/24269892/answer/29960616"
    user_url = "http://www.zhihu.com/people/jixin"
    collection_url = "http://www.zhihu.com/collection/36750683"

    # Same call order as before: question, answer, user, collection, test.
    question_test(question_url)
    answer_test(answer_url)
    user_test(user_url)
    collection_test(collection_url)
    test()


if __name__ == '__main__':
    # Script entry point: print the user id of every follower of one
    # hard-coded profile. Note that main() above is not called here.
    user_url = "https://www.zhihu.com/people/BravoMaooo"
    user = User(user_url)
    # The followers are iterated one by one below; each item is expected to
    # expose get_user_id().
    followers = user.get_followers()
    for follower in followers:
        print(follower.get_user_id())
Beispiel #8
0
def user_test(user_url):
    """Call every read accessor of ``User`` for ``user_url`` and print the results.

    All values are fetched first. The scalar fields are then printed one
    per line, followed by the iterable results; for followees and followers
    the user ids of at most the first 41 entries are printed as well.
    """
    profile = User(user_url)

    # Scalar profile fields. Trailing comments show the sample output that
    # was recorded for the demo account.
    scalar_values = [
        profile.get_user_id(),          # user id (sample: a display name)
        profile.get_gender(),           # gender, sample: male
        profile.get_followers_num(),    # number of followers, 614840
        profile.get_followees_num(),    # number of users followed, 8408
        profile.get_asks_num(),         # number of questions asked, 1323
        profile.get_answers_num(),      # number of answers written, 786
        profile.get_collections_num(),  # number of collections, 44
        profile.get_agree_num(),        # number of upvotes received, 46387
        profile.get_thanks_num(),       # number of thanks received, 11477
        # avatar URL, sample:
        # https://pic2.zhimg.com/0626f4164009f291b26a79d96c6962c5_l.jpg
        profile.get_head_img_url(),
    ]

    # Iterable results. The recorded sample output shows generator objects.
    followees = profile.get_followees()      # users this user follows
    followers = profile.get_followers()      # users following this user
    asks = profile.get_asks()                # questions asked by this user
    answers = profile.get_answers()          # answers written by this user
    collections = profile.get_collections()  # collections owned by this user

    for value in scalar_values:
        print(value)

    print(followees)
    # sample: <generator object get_followee at 0x7ffcac3af050>
    for shown, followee in enumerate(followees, 1):
        print(followee.get_user_id())
        if shown == 41:
            break

    print(followers)
    # sample: <generator object get_follower at 0x7ffcac3af0f0>
    for shown, follower in enumerate(followers, 1):
        print(follower.get_user_id())
        if shown == 41:
            break

    print(asks)
    # sample: <generator object get_ask at 0x7ffcab9db780>
    print(answers)
    # sample: <generator object get_answer at 0x7ffcab9db7d0>
    print(collections)
Beispiel #9
0

def insert_data(user):
    """Sleep for a random time, then process and print each topic of ``user``.

    The pause is a random duration below ten seconds. For every topic
    returned by ``user.get_topics()``, ``ml.lock(increment, topic)`` is
    called, followed by ``ml.unlock()``, and the topic is printed.
    """
    delay = random.random() * 10
    time.sleep(delay)
    for topic in user.get_topics():
        ml.lock(increment, topic)
        ml.unlock()
        print(topic)


if __name__ == '__main__':
    #main()
    #user_url = 'https://www.zhihu.com/people/excited-vczh'
    user_url = 'https://www.zhihu.com/people/li-tao-40-73'
    user = User(user_url, u'李涛')
    #user = User(user_url, u'vczh')
    #print user.get_topics_num()
    #for i in user.get_topics():
    #    print i.encode('utf-8')

    #'''
    followees = user.get_followees()
    count = 0
    topics = user.get_topics()
    for t in topics:
        if t not in topics_map: topics_map[t] = 1
        else: topics_map[t] += 1
        print t
    trs = []
    for i in followees:
Beispiel #10
0
def main():
    # read wanted user url from users.txt
    lines = [line.rstrip('\n') for line in open("users_example.txt")]

    # get (users)
    users = [User(user_url) for user_url in lines]
    user_ids = [user.get_user_id() for user in users]
    for user_id in user_ids:
        print "user node: " + user_id

    # get (user)-[follow]->(user) relationships
    # here I use followers, since followers are usually fewer than followees
    # following = []
    # for user in users:
    #     print "processing followers of user: "******"follow relationship: " + src + " follows " + dst

    # get (user)-[answer]->(question) relationships
    answers = []
    answerings = []
    questions = defaultdict(int)
    for user in users:
        print "processing answers of user: "******"number of answers: " + str(len(answers))
    print "number of answerings: " + str(len(answerings))

    # get (user)-[ask]->(question) relationships
    asking = []
    for user in users:
        print "processing questions of user: "******"number of asking: " + str(len(asking))

    # filter by intervel
    # a question node should have at least two relaionships( answering or asking)
    questions = {k: v for (k, v) in questions.iteritems() if v > 1}
    for question, num in questions.iteritems():
        print "question: " + question + " is mentioned " + str(num) + " times."

    # prepare data for writing
    id_map = {}
    index = 0
    usersOut = []
    for user in users:
        id_map[user.get_user_id()] = index
        usero = {}
        usero['id'] = index
        usero['Year'] = index
        usero['cYear'] = index
        usero['Type'] = 'User'
        usero['label'] = user.get_user_id()
        follower_num = user.get_followers_num()
        usero['follower_num'] = follower_num
        if follower_num < 1:
            usero['size'] = 1
        else:
            usero['size'] = math.ceil(math.log(follower_num))
            usersOut.append(usero)
            index += 1

    questionOut = []
    for question, v in questions.iteritems():
        id_map[question] = index
        questiono = {}
        questiono['id'] = index
        questiono['Year'] = randint(1, index)
        questiono['cYear'] = questiono['Year']
        questiono['Type'] = 'Question'
        questiono['label'] = question
        questionOut.append(questiono)
        index += 1

    askOut = []
    for ask in asking:
        if ask[1] in questions:
            asko = {}
            asko['Edge Id'] = str(index)
            asko['target'] = id_map[ask[1]]
            asko['source'] = id_map[ask[0]]
            asko['Year'] = id_map[ask[1]]
            askOut.append(asko)
            index += 1

    answerOut = []
    for answering in answerings:
        title = answering[1].get_question().get_title()
        if title in questions:
            answero = {}
            answero['Edge Id'] = str(index)
            answero['target'] = id_map[title]
            answero['source'] = id_map[answering[0]]
            answero['Year'] = id_map[title]
            answerOut.append(answero)
            index += 1

    write_file(usersOut, questionOut, askOut, answerOut)
Beispiel #11
0
def test():
    """Print the user id of every profile URL listed in users_example.txt.

    The file holds one URL per line. It is read inside a ``with`` block so
    the handle is closed before any ``User`` is created.
    """
    with open("users_example.txt") as url_file:
        lines = [line.rstrip('\n') for line in url_file]
    for line in lines:
        u = User(line)
        print(u.get_user_id())
Beispiel #12
0
followees_num =user.get_followees_num()
asks_num = user.get_asks_num()
answers_num = user.get_answers_num()
collections_num = user.get_collections_num()
agree_num = user.get_agree_num()
thanks_num = user.get_thanks_num()
followees = user.get_followees()
followers = user.get_followers()
asks = user.get_asks()
answers = user.get_answers()
collections = user.get_collections()
"""
# Seed the crawl: one User per start URL goes into the queue, and its user
# id is recorded under the URL with the first 28 characters removed
# (presumably the "http://www.zhihu.com/people/" prefix -- confirm).
recorder = {}
userque = deque()
for suser in start_url:
    tuser = User(suser)
    userque.append(tuser)
    recorder[suser[28:]] = tuser.get_user_id()
# Number of seed users; num counts how many queued users are still to be
# processed by the loop below.
total = len(userque)
num = total
flag = False
#DirectedGraph="{"
while num>=1:
	num-=1;
	user=userque.popleft()
	print user.get_user_id()
	followees_num =user.get_followees_num()
	followees = user.get_followees()
	for i in range(1,followees_num+1):
		cuser=followees.next()
		# when total<=MAXTOT,then cuser will be in userque.
Beispiel #13
0
from sqlalchemy.orm import sessionmaker

if __name__ == "__main__":
    userid = "wonderful-vczh"

    # create db engine
    engine = create_engine("sqlite:///zhihu.db", echo=False)
    dbmodel.Base.metadata.create_all(engine)

    # create a session
    Session = sessionmaker(bind=engine)
    session = Session()
    url_base = "http://www.zhihu.com/people/"
    url = url_base + userid

    zhihu_user = User(url)
    username = zhihu_user.get_user_id().decode("utf8")
    #print username

    db_user = dbmodel.Dbuser(id=userid, name=username)

    # add user
    session.add(db_user)
    session.commit()

    # add answers
    for i, answer in enumerate(zhihu_user.get_answers()):
        print i
        session.add(
            dbmodel.Dbanswer(id=answer.answer_url,
                             upvote=answer.get_upvote(),
Beispiel #14
0
# Python 3.6.1

import requests.utils
import pickle
from http.cookies import SimpleCookie

from zhihu import User
from zhihu import Answer
from zhihu import Account

# Demo script: create a client, show its cookies, log in, then build an
# Answer object for one hard-coded answer URL.
zhihu = User()
print(zhihu.cookies)

# Log in
account = Account()
result = account.login()
print(result)

# View a user's profile (worked)
# profile = zhihu.profile(user_slug="xiaoxiaodouzi")
# print(profile)

# Send a private message (worked)
# response = zhihu.send_message(content='TESTMESSAGE', user_slug="xiaoxiaodouzi")
# print(response)

# Follow a user (worked)
# response = zhihu.follow(user_slug='SemitLee')
# print(response)

answer = Answer(url="https://www.zhihu.com/question/34401174/answer/389502954")
Beispiel #15
0
def main():

    initial_user_url = "http://www.zhihu.com/people/BigMing"

    url_queue = Queue.Queue()
    url_queue.put(initial_user_url)

    save_pic_dir0 = sys.path[0] + "/pic_female/"
    save_pic_dir1 = sys.path[0] + "/pic_male/"
    save_pic_dir2 = sys.path[0] + "/pic_emale/"

    saved_count_female = 0
    saved_count_male = 0
    saved_count_emale = 0
    visited_url_count = 0
    tried_url_count = 0

    IO_error_count = 0

    limit_count = 1000000000
    count = 0

    flag = True

    least_follower = 1000

    while (flag):

        if url_queue.qsize() > 0:
            current_url = url_queue.get()
            user = User(current_url)

            try:
                print current_url,
                print "     queue_size: ",
                print url_queue.qsize(),
                print "     Saved_size: ",
                print saved_count_male + saved_count_female
                followees = user.get_followees_with_condition(least_follower)

                for followee in followees:

                    tried_url_count += 1
                    print "tried_url_count: " + str(tried_url_count)

                    visited_url_count += 1
                    print "visited_url_count: " + str(visited_url_count)

                    url_queue.put(followee.user_url)

                    try:
                        req = urllib2.Request(followee.user_pic_url)
                        res = urllib2.urlopen(followee.user_pic_url,
                                              timeout=10)
                        pic = res.read()
                        pextention = os.path.splitext(followee.user_pic_url)

                        if platform.system() == 'Windows':
                            pname = followee.user_id.decode(
                                'utf-8', 'ignore').encode('gbk', 'ignore')
                        else:
                            pname = followee.user_id

                        followee_count = followee.user_followers_num

                        if followee.user_gender == 0:
                            p_full_path = save_pic_dir0 + str(
                                saved_count_female +
                                1) + "_" + pname + "_" + str(
                                    followee_count) + pextention[1]
                            saved_count_female += 1

                        if followee.user_gender == 1:
                            p_full_path = save_pic_dir1 + str(
                                saved_count_male +
                                1) + "_" + pname + "_" + str(
                                    followee_count) + pextention[1]
                            saved_count_male += 1

                        if followee.user_gender == 2:
                            p_full_path = save_pic_dir2 + str(
                                saved_count_emale +
                                1) + "_" + pname + "_" + str(
                                    followee_count) + pextention[1]
                            saved_count_emale += 1

                        if followee.user_gender == 3:

                            if followee.get_user_gender() == 0:
                                p_full_path = save_pic_dir0 + str(
                                    saved_count_female +
                                    1) + "_" + pname + "_" + str(
                                        followee_count) + pextention[1]
                                saved_count_female += 1

                            if followee.get_user_gender() == 1:
                                p_full_path = save_pic_dir1 + str(
                                    saved_count_male +
                                    1) + "_" + pname + "_" + str(
                                        followee_count) + pextention[1]
                                saved_count_male += 1

                            if followee.get_user_gender() == 2:
                                p_full_path = save_pic_dir2 + str(
                                    saved_count_emale +
                                    1) + "_" + pname + "_" + str(
                                        followee_count) + pextention[1]
                                saved_count_emale += 1

                        p = open(p_full_path, "wb")
                        p.write(pic)
                        p.close()

                        count += 1
                        print "female: " + str(
                            saved_count_female) + "  " + "male: " + str(
                                saved_count_male) + "  " + "emale: " + str(
                                    saved_count_emale)
                        if count > limit_count:
                            flag = False
                            break
                    except:
                        IO_error_count += 1
                        print "IO error"
                print " "
            except:
                print "why????????????????????"
        else:
            break