def user_test(user_url):
    """Print a user's answers and the question id behind each answer.

    Args:
        user_url: value passed to ``User``.
    """
    profile = User(user_url)
    # The returned id is not used; the call is kept for parity.
    profile.get_user_id()
    answer_items = profile.get_answers()
    print(answer_items)
    for item in answer_items:
        question = item.get_question()
        print(question.get_question_id())
Example #2
0
def main():
    """Create a ``User`` for a hard-coded id and call ``get_user_update()``."""
    # Only referenced by the disabled search lookup below.
    person_name = 'xxx'
    search_kind = 'people'
    # search = Search()
    # search.get_person_detail(search_kind, person_name)
    target = User('dong-yuan-18')
    target.get_user_update()
Example #3
0
def main():
    """Create a ``User`` for a hard-coded id and call ``get_user_update()``."""
    # Only referenced by the disabled search lookup below.
    person_name = 'xxx'
    search_kind = 'people'
    # search = Search()
    # search.get_person_detail(search_kind, person_name)
    target = User('dong-yuan-18')
    target.get_user_update()
Example #4
0
def crawler(user_info):
    """Crawl one user's activities and pass the result to the writer.

    Args:
        user_info: dict providing ``'id'`` (appended to the people URL) and
            ``'activity'`` (list that newly crawled activities are appended
            to in place). ``'error'`` is set before the dict is handed to
            ``pass_to_writer``.

    A ``ConnectionError`` triggers another attempt, which starts from the
    last activity already stored in ``user_info['activity']``. Retries run in
    a loop instead of by recursion, so a long streak of connection failures
    cannot exhaust the interpreter's recursion limit. Any other exception is
    recorded in ``user_info['error']`` and the user is skipped.
    """
    url = "http://www.zhihu.com/people/" + user_info['id']

    while True:
        try:
            user = User(url)

            proxy = 'None'
            #proxy = proxy_apply()
            user_agent = random.choice(user_agent_pool)

            # Resume from the most recently stored activity, if there is one.
            start_time = "beginning"
            if len(user_info['activity']) != 0:
                start_time = user_info['activity'][-1]

            print(
                termcolor.colored("start crawling ", "green") +
                termcolor.colored(url, "blue") + termcolor.colored(
                    "\nproxy:" + proxy + '    start from ' + start_time +
                    '\n', "green"))

            # Crawl the user's activity feed.
            for activity in user.get_activities(proxy, user_agent, start_time):
                user_info['activity'].append(activity)

            user_info['error'] = ''
            pass_to_writer(user_info)
            #proxy_recycle(proxy)
            amount_of_finished_users.value += 1
            print(
                termcolor.colored(
                    url + ' finished    ' +
                    str(amount_of_finished_users.value) + "/" +
                    str(amount_of_users.value) + "\n", "green"))
            return

        except ConnectionError as e:
            # Fall through to the next loop iteration to reconnect.
            print(termcolor.colored("Warning: " + str(e), "yellow"))
            print("Reconnecting for " + termcolor.colored(url, "blue") + "\n")

        except Exception as e:
            user_info['error'] = str(e)
            pass_to_writer(user_info)

            traceback.print_exc()
            print(termcolor.colored("Error: " + str(e), "red"))
            print("skip " + termcolor.colored(url, "blue"))
            print('\n')
            return
Example #5
0
 def __init__(self, user_uuid, layer):
     """Load the profile counters (and, for shallow layers, followees) of a user.

     Args:
         user_uuid: appended to ``prefix_people`` to form the URL given to ``User``.
         layer: followees are collected only when this is below 3.
     """
     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()

     def _strip_prefix(followee):
         # Reduce a followee URL to the part after ``prefix_people``.
         return followee.user_url.replace(prefix_people, "")

     if layer < 3:
         self.followees = map(_strip_prefix, profile.get_followees())
     else:
         self.followees = []
     # Followers are intentionally not collected (disabled in the original).
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Example #6
0
 def __init__(self, user_uuid, layer):
     """Load the profile counters (and, for shallow layers, followees) of a user.

     Args:
         user_uuid: appended to ``prefix_people`` to form the URL given to ``User``.
         layer: followees are collected only when this is below 3.
     """
     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()

     def _strip_prefix(followee):
         # Reduce a followee URL to the part after ``prefix_people``.
         return followee.user_url.replace(prefix_people, "")

     if layer < 3:
         self.followees = map(_strip_prefix, profile.get_followees())
     else:
         self.followees = []
     # Followers are intentionally not collected (disabled in the original).
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
def crawl_id():
    """Map each listed user's index to the value of ``User.get_user_id()``.

    Reads ``data/available_users`` line by line; every line is evaluated to a
    dict that must provide ``'id'`` and ``'index'``. The resulting
    ``{index: user_id}`` dict is written as ``str(dict)`` to
    ``data/user_name``. Users whose lookup fails are reported and left out.
    """
    input_path = 'data/available_users'
    output_path = 'data/user_name'

    z_id = {}

    zhihu_url = 'http://www.zhihu.com/people/'

    with open(input_path, 'r') as users:
        for line in users:
            # NOTE(security): eval() executes whatever the line contains.
            # If the input file is not fully trusted, parse it with
            # ast.literal_eval (or store it as JSON) instead.
            user = eval(line)
            z_user = User(zhihu_url + user['id'])
            try:
                z_id[user['index']] = z_user.get_user_id()
                print(user['index'])
            except Exception as e:
                # The previous handler indexed the User object itself
                # (z_user[...]); report the failing entry and the error.
                print(user['index'], user['id'], str(e))

    with open(output_path, 'w') as out:
        out.write(str(z_id))
def crawler(user_info):
    """Crawl one user's activities and pass the result to the writer.

    Args:
        user_info: dict providing ``'id'`` (appended to the people URL) and
            ``'activity'`` (list that newly crawled activities are appended
            to in place). ``'error'`` is set before the dict is handed to
            ``pass_to_writer``.

    A ``ConnectionError`` triggers another attempt, which starts from the
    last activity already stored in ``user_info['activity']``. Retries run in
    a loop instead of by recursion, so a long streak of connection failures
    cannot exhaust the interpreter's recursion limit. Any other exception is
    recorded in ``user_info['error']`` and the user is skipped.
    """
    url = "http://www.zhihu.com/people/" + user_info['id']

    while True:
        try:
            user = User(url)

            proxy = 'None'
            #proxy = proxy_apply()
            user_agent = random.choice(user_agent_pool)

            # Resume from the most recently stored activity, if there is one.
            start_time = "beginning"
            if len(user_info['activity']) != 0:
                start_time = user_info['activity'][-1]

            print(termcolor.colored("start crawling ", "green") + termcolor.colored(url, "blue") + termcolor.colored("\nproxy:" + proxy + '    start from ' + start_time + '\n', "green"))

            # Crawl the user's activity feed.
            for activity in user.get_activities(proxy, user_agent, start_time):
                user_info['activity'].append(activity)

            user_info['error'] = ''
            pass_to_writer(user_info)
            #proxy_recycle(proxy)
            amount_of_finished_users.value += 1
            print(termcolor.colored(url + ' finished    ' + str(amount_of_finished_users.value) + "/" + str(amount_of_users.value) + "\n", "green"))
            return

        except ConnectionError as e:
            # Fall through to the next loop iteration to reconnect.
            print(termcolor.colored("Warning: "  + str(e), "yellow"))
            print("Reconnecting for " + termcolor.colored(url, "blue") + "\n")

        except Exception as e:
            user_info['error'] = str(e)
            pass_to_writer(user_info)

            traceback.print_exc()
            print(termcolor.colored("Error: " + str(e), "red"))
            print("skip " + termcolor.colored(url, "blue"))
            print('\n')
            return
def main():
    """Write "question_id answer_id" pairs of listed users to per-user files.

    User names are read from ``susp/list2.txt`` (one per line). Processing
    starts at the entry ``"zhang-xiao-chuan-16"`` (``ValueError`` if absent).
    For every user from there on, answers whose question id compares greater
    than ``"30000000"`` are written to
    ``susp/ans2/<index>-<name>.txt``; iteration over a user's answers stops
    after more than 15 consecutive answers fail that comparison.
    """
    viplist = []
    with open("susp/list2.txt") as f1:
        for msg in f1:
            print(msg)
            # Strip only the line terminator, so a final line without a
            # trailing newline keeps its last character.
            viplist.append(msg.rstrip('\n'))

    offset = viplist.index("zhang-xiao-chuan-16")
    # enumerate() yields each entry's real position; looking it up again
    # with list.index() was quadratic and wrong for duplicated names.
    for user_index, user_name in enumerate(viplist[offset:], offset):
        user_url = "http://www.zhihu.com/people/" + user_name
        user = User(user_url)
        print(user_url)
        print(user_name)
        count = 0
        stopcount = 0
        answers = user.get_answers()
        out_path = "susp/ans2/" + str(user_index) + "-" + user_name + ".txt"
        with open(out_path, "w") as f2:
            for answer in answers:
                qid = answer.get_question_id()
                aid = answer.get_answer_id()
                # NOTE(review): this is a string comparison, so ids of a
                # different length are ordered lexicographically, not
                # numerically - confirm this is intended.
                if qid > "30000000":
                    stopcount = 0
                    count = count + 1
                    print(count)
                    print(qid + " " + aid)
                    f2.write(qid + " " + aid + "\n")
                else:
                    stopcount = stopcount + 1
                if stopcount > 15:
                    break
        print("finish " + user_name)
Example #10
0
 def __init__(self, user_uuid, layer):
     """Load the profile counters (and, for shallow layers, followees) of a user.

     Args:
         user_uuid: the unique id of the user
         layer: the number of hops to reach to this user from the seed user;
             followees are collected only when this is below 3
     """
     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()

     def _strip_prefixes(followee):
         # Remove both known URL prefixes, leaving the remainder of the URL.
         trimmed = followee.user_url.replace(prefix_people, "")
         return trimmed.replace(prefix_people_http, "")

     if layer < 3:
         self.followees = map(_strip_prefixes, profile.get_followees())
     else:
         self.followees = []
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Example #11
0
 def __init__(self, user_uuid, layer):
     """Load the profile counters (and, for shallow layers, followees) of a user.

     Args:
         user_uuid: the unique id of the user
         layer: the number of hops to reach to this user from the seed user;
             followees are collected only when this is below 3
     """
     profile = User(prefix_people + user_uuid)
     self.user = profile
     self.uuid = user_uuid
     self.user_id = profile.get_user_id()

     def _strip_prefixes(followee):
         # Remove both known URL prefixes, leaving the remainder of the URL.
         trimmed = followee.user_url.replace(prefix_people, "")
         return trimmed.replace(prefix_people_http, "")

     if layer < 3:
         self.followees = map(_strip_prefixes, profile.get_followees())
     else:
         self.followees = []
     self.answer_num = profile.get_answers_num()
     self.following_num = profile.get_followees_num()
     self.follower_num = profile.get_followers_num()
     self.upvote_num = profile.get_agree_num()
     self.thank_num = profile.get_thanks_num()
     self.layer = layer
Example #12
0
# Python 3.6.1

import requests.utils
import pickle
from http.cookies import SimpleCookie

from zhihu import User
from zhihu import Answer
from zhihu import Account

# Create a client object and show the cookies it currently holds.
zhihu = User()
print(zhihu.cookies)

# Log the user in and show what login() returned.
account = Account()
result = account.login()
print(result)

# View a user's profile (worked)
# profile = zhihu.profile(user_slug="xiaoxiaodouzi")
# print(profile)

# Send a private message (worked)
# response = zhihu.send_message(content='TESTMESSAGE', user_slug="xiaoxiaodouzi")
# print(response)

# Follow a user (worked)
# response = zhihu.follow(user_slug='SemitLee')
# print(response)

# Build an Answer object for a specific answer URL.
answer = Answer(url="https://www.zhihu.com/question/34401174/answer/389502954")
Example #13
0
def main():
    """Build user/question node and edge records and pass them to ``write_file``.

    NOTE(review): several statements below are garbled in this copy of the
    source (string literals fused together by ``******``), so the function
    does not parse as written, and the loops that should fill ``answers``,
    ``answerings``, ``questions`` and ``asking`` are missing. Restore them
    from the original project before relying on this code.
    """
    # read wanted user urls from users_example.txt
    lines = [line.rstrip('\n') for line in open("users_example.txt")]

    # get (users)
    users = [User(user_url) for user_url in lines]
    user_ids = [user.get_user_id() for user in users]
    for user_id in user_ids:
        print "user node: " + user_id

    # get (user)-[follow]->(user) relationships
    # here I use followers, since followers are usually fewer than followees
    # following = []
    # for user in users:
    #     print "processing followers of user: "******"follow relationship: " + src + " follows " + dst

    # get (user)-[answer]->(question) relationships
    answers = []
    answerings = []
    # question -> number of times it was mentioned
    questions = defaultdict(int)
    for user in users:
        print "processing answers of user: "******"number of answers: " + str(len(answers))
    print "number of answerings: " + str(len(answerings))

    # get (user)-[ask]->(question) relationships
    asking = []
    for user in users:
        print "processing questions of user: "******"number of asking: " + str(len(asking))

    # filter by interval
    # a question node should have at least two relationships (answering or asking)
    questions = {k: v for (k, v) in questions.iteritems() if v > 1}
    for question, num in questions.iteritems():
        print "question: " + question + " is mentioned " + str(num) + " times."

    # prepare data for writing
    # id_map: user id / question label -> numeric node id
    id_map = {}
    index = 0
    usersOut = []
    for user in users:
        id_map[user.get_user_id()] = index
        usero = {}
        usero['id'] = index
        usero['Year'] = index
        usero['cYear'] = index
        usero['Type'] = 'User'
        usero['label'] = user.get_user_id()
        follower_num = user.get_followers_num()
        usero['follower_num'] = follower_num
        if follower_num < 1:
            usero['size'] = 1
        else:
            usero['size'] = math.ceil(math.log(follower_num))
            # NOTE(review): the append and the index increment sit inside the
            # else branch, so users with fewer than one follower are never
            # emitted and the next user reuses the same index - looks like an
            # indentation slip; confirm.
            usersOut.append(usero)
            index += 1

    questionOut = []
    for question, v in questions.iteritems():
        id_map[question] = index
        questiono = {}
        questiono['id'] = index
        questiono['Year'] = randint(1, index)
        questiono['cYear'] = questiono['Year']
        questiono['Type'] = 'Question'
        questiono['label'] = question
        questionOut.append(questiono)
        index += 1

    # Edges reuse the running index as their edge id.
    askOut = []
    for ask in asking:
        if ask[1] in questions:
            asko = {}
            asko['Edge Id'] = str(index)
            asko['target'] = id_map[ask[1]]
            asko['source'] = id_map[ask[0]]
            asko['Year'] = id_map[ask[1]]
            askOut.append(asko)
            index += 1

    answerOut = []
    for answering in answerings:
        title = answering[1].get_question().get_title()
        if title in questions:
            answero = {}
            answero['Edge Id'] = str(index)
            answero['target'] = id_map[title]
            answero['source'] = id_map[answering[0]]
            answero['Year'] = id_map[title]
            answerOut.append(answero)
            index += 1

    write_file(usersOut, questionOut, askOut, answerOut)
Example #14
0
def test():
    """Print ``get_user_id()`` for every line of ``users_example.txt``."""
    # Close the file deterministically instead of leaving the handle to the
    # garbage collector.
    with open("users_example.txt") as url_file:
        lines = [line.rstrip('\n') for line in url_file]
    for line in lines:
        u = User(line)
        print(u.get_user_id())
Example #15
0
def user_spider(user_url):
    """Store one user and that user's followers in the ``user_info`` table.

    Args:
        user_url: value passed to ``User``.

    A Redis key per user (built by ``get_user_redis_key``) marks users that
    were already inserted, so they are not inserted again. Insert failures are
    printed and do not stop the crawl. The cursor and the database connection
    are closed when the function exits, also on error.

    NOTE(review): the SQL text comes from ``prepare_insert_sql`` and is
    executed without bound parameters; the connection credentials are
    hard-coded. Both are worth revisiting.
    """
    database_name = 'wjw_zhihu'
    table_name = 'user_info'

    # Set up the database connection.
    conn = pymysql.connect(host='localhost', user='******', passwd='root', port=3306)
    cur = conn.cursor()
    try:
        # Select the database.
        conn.select_db(database_name)
        # Set the encoding; otherwise inserted text ends up garbled.
        cur.execute('set names utf8')

        # Redis connection that records which user_unique values were crawled.
        redis_conn = redis.Redis(host='127.0.0.1', port=6379, db=0)

        # Fetch the current user's information.
        user = User(user_url)
        user_unique = user.get_user_unique()
        if redis_conn.get(get_user_redis_key(user_unique)) is None:
            user_info = user.get_user_info()

            # Insert the user's data into the database.
            try:
                insert_sql = prepare_insert_sql(table_name, user_info)
                res = cur.execute(insert_sql)
                conn.commit()  # the insert only takes effect after commit
                # Mark the user in Redis to prevent re-crawling.
                redis_conn.set(get_user_redis_key(user_unique), 1)
                print(user_info['user_unique'] + '  ------  ' + str(res))
            except Exception as e:
                # Log the failure and carry on.
                exceptMsg = str(e)
                print(exceptMsg)

        # People who follow this user.
        followers = user.get_followers()

        i = 0
        for follower in followers:
            i = i + 1
            if i % 10 == 0:
                redis_conn.save()  # write Redis data to disk; blocks while saving
                time.sleep(0.3)

            follower_user_unique = follower.get_user_unique()
            if redis_conn.get(get_user_redis_key(follower_user_unique)) is None:
                try:
                    follower_info = follower.get_user_info()
                    follower_insert_sql = prepare_insert_sql(table_name, follower_info)
                    res = cur.execute(follower_insert_sql)
                    conn.commit()
                    # Mark the follower in Redis to prevent re-crawling.
                    redis_conn.set(get_user_redis_key(follower_user_unique), 1)
                    print(follower_info['user_unique'] + '  ------  ' + str(res))
                except Exception as e:
                    # Log the failure and carry on with the next follower.
                    exceptMsg = str(e)
                    print(exceptMsg)
    finally:
        cur.close()
        conn.close()
Example #16
0
def user_test(user_url):
    """Call the ``User`` accessors for one profile and print what they return.

    Args:
        user_url: value passed to ``User``.

    Every accessor is called before anything is printed. At most 41 followees
    and 41 followers are printed.
    """
    profile = User(user_url)

    # Scalar profile fields: id, gender, follower/followee counts, number of
    # questions, answers and collections, agree and thanks counts, avatar URL.
    name = profile.get_user_id()
    gender = profile.get_gender()
    follower_total = profile.get_followers_num()
    followee_total = profile.get_followees_num()
    ask_total = profile.get_asks_num()
    answer_total = profile.get_answers_num()
    collection_total = profile.get_collections_num()
    agree_total = profile.get_agree_num()
    thanks_total = profile.get_thanks_num()
    avatar_url = profile.get_head_img_url()

    # Related objects: followees, followers, topics, questions asked,
    # answers given and collections.
    followee_iter = profile.get_followees()
    follower_iter = profile.get_followers()
    topic_iter = profile.get_topics()
    ask_iter = profile.get_asks()
    answer_iter = profile.get_answers()
    collection_iter = profile.get_collections()

    scalars = (name, gender, follower_total, followee_total, ask_total,
               answer_total, collection_total, agree_total, thanks_total,
               avatar_url)
    for value in scalars:
        print(value)

    # Per the original notes this prints a generator object.
    print(followee_iter)
    for shown, followee in enumerate(followee_iter, 1):
        print(followee.get_user_id())
        if shown == 41:
            break

    # Per the original notes this prints a generator object.
    print(follower_iter)
    for shown, follower in enumerate(follower_iter, 1):
        print(follower.get_user_id())
        if shown == 41:
            break

    for topic in topic_iter:
        print(topic)

    # The remaining collections are printed as objects, not iterated.
    print(ask_iter)
    print(answer_iter)
    print(collection_iter)
Example #17
0
def test():
    """Print ``get_user_id()`` for every line of ``users_example.txt``."""
    # Close the file deterministically instead of leaving the handle to the
    # garbage collector.
    with open("users_example.txt") as url_file:
        lines = [line.rstrip('\n') for line in url_file]
    for line in lines:
        u = User(line)
        print(u.get_user_id())
Example #18
0
def main():
    """Crawl Zhihu users using a MongoDB collection as the work queue.

    ``zhihu_user.urllist`` holds one document per profile URL with a status
    field ``jlzt``. As used here: ``"1"`` marks a URL waiting to be
    processed, ``"3"`` one being processed and ``"2"`` one that is finished.
    Seed URLs are added if missing, then URLs with status ``"1"`` are taken
    one at a time; each user's data is inserted into ``zhihu_user.userlist``
    and the user's followers are queued. A failing user is put back to
    status ``"1"`` after a 10 second pause.

    Only ``Exception`` is caught, so Ctrl-C / ``SystemExit`` stop the loop
    (the URL in flight then stays at status ``"3"``).
    """
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu_user
    urllist = db.urllist
    userlist = db.userlist

    origin_users = [
        "https://www.zhihu.com/people/jixin",
        "https://www.zhihu.com/people/zhang-jia-wei",
        "https://www.zhihu.com/people/zhu-xuan-86",
        "https://www.zhihu.com/people/kaifulee",
        "https://www.zhihu.com/people/e-miao-de-nai-ba",
    ]

    urls = urllist.distinct("user_url")

    for u in origin_users:
        if u not in urls:
            urllist.insert({"user_url": u,
                            "jlzt": "1"})

    while 1:
        item = urllist.find_one({'jlzt': '1'})
        if item is None:
            print(u'已全部处理完成')
            break
        user_url = item["user_url"]

        starttime = datetime.datetime.now()
        urllist.update({"user_url": user_url}, {"$set": {"jlzt": "3"}})

        try:
            user = User(user_url)

            # Unique id of the user.
            zhihu_id = user.get_data_id()
            # User name.
            zhihu_name = user.get_user_id()
            # Number of people the user follows.
            followees_num = user.get_followees_num()
            # Number of people following the user.
            followers_num = user.get_followers_num()
            # Gender.
            gender = user.get_gender()

            # Number of questions asked.
            asks_num = user.get_asks_num()
            # Number of answers given.
            answers_num = user.get_answers_num()
            # Number of collections.
            collections_num = user.get_collections_num()
            # Number of agrees received.
            agree_num = user.get_agree_num()
            # Number of thanks received.
            thanks_num = user.get_thanks_num()
            # URL of the avatar image.
            head_img_url = user.get_head_img_url()
            # Number of followed topics.
            topics_num = user.get_topics_num()

            # People the user follows.
            followees = user.get_followees()
            # People following the user.
            followers = user.get_followers()
            # Questions asked.
            questions = user.get_asks()
            # Answers given.
            answers = user.get_answers()
            # Topics.
            topics = user.get_topics()

            print("start process " + zhihu_name + ";\n")

            followee_l = []
            follower_l = []
            questions_l = []
            answers_l = []
            topics_l = []

            print(u'开始处理关注的人')
            for followee in followees:
                followee_l.append(followee.user_url.split('/')[4])
                if len(followee_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait...")
                time.sleep(0.05)
            print(u'添加完成')

            print(u'开始添加关注者至处理队列')
            for follower in followers:
                follower_l.append(follower.user_url.split('/')[4])
                # Look up just this URL instead of loading every known URL
                # with distinct() for each follower.
                if urllist.find_one({"user_url": follower.user_url}) is None:
                    urllist.insert({"user_url": follower.user_url, "jlzt": "1"})
                if len(follower_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait...")
                time.sleep(0.05)

            print(u'添加完成')

            for q in questions:
                questions_l.append("url=" + q.url + "|title=" + q.get_title())
                time.sleep(0.01)
            for a in answers:
                answers_l.append(a.answer_url)
                time.sleep(0.01)
            for t in topics:
                topics_l.append(t)
                time.sleep(0.01)

            # NOTE(review): the key "ansers" looks like a typo for "answers";
            # it is kept because stored documents already use it.
            user_data = {"zhihu_id": zhihu_id,
                         "zhihu_name": zhihu_name,
                         "followees_num": followees_num,
                         "followers_num": followers_num,
                         "followees": followee_l,
                         "followers": follower_l,
                         "questions": questions_l,
                         "gender": gender,
                         "asks_num": asks_num,
                         "answers_num": answers_num,
                         "ansers": answers_l,
                         "collections_num": collections_num,
                         "agree_num": agree_num,
                         "thanks_num": thanks_num,
                         "topics_num": topics_num,
                         "topics": topics_l,
                         "head_img_url": head_img_url
                         }

            print("user_data prepared:")

            # NOTE(review): user_data has no "user_url" field, so this
            # duplicate check can only match documents written by some other
            # code path - confirm which field should be used.
            urls = userlist.distinct("user_url")

            if user_url not in urls:
                userlist.insert(user_data)
                print("user_data inserted: \n")

            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "2"}})
            endtime = datetime.datetime.now()
            interval = (endtime - starttime).seconds
            print(zhihu_name + "finnished. spent " + str(interval) + "seconds.")
        except Exception:
            traceback.print_exc()
            time.sleep(10)
            # Put the URL back in the queue so it is retried later.
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})
            continue

    print("处理完毕")
Example #19
0
def user_test(user_url):
    """Call the ``User`` accessors for one profile and print what they return.

    Args:
        user_url: value passed to ``User``.

    Every accessor is called before anything is printed. At most 41 followees
    and 41 followers are printed.
    """
    profile = User(user_url)

    # Scalar profile fields: id, gender, follower/followee counts, number of
    # questions, answers and collections, agree and thanks counts.
    name = profile.get_user_id()
    gender = profile.get_gender()
    follower_total = profile.get_followers_num()
    followee_total = profile.get_followees_num()
    ask_total = profile.get_asks_num()
    answer_total = profile.get_answers_num()
    collection_total = profile.get_collections_num()
    agree_total = profile.get_agree_num()
    thanks_total = profile.get_thanks_num()

    # Related objects: followees, followers, questions asked, answers given
    # and collections.
    followee_iter = profile.get_followees()
    follower_iter = profile.get_followers()
    ask_iter = profile.get_asks()
    answer_iter = profile.get_answers()
    collection_iter = profile.get_collections()

    scalars = (name, gender, follower_total, followee_total, ask_total,
               answer_total, collection_total, agree_total, thanks_total)
    for value in scalars:
        print(value)

    # Per the original notes this prints a generator object.
    print(followee_iter)
    for shown, followee in enumerate(followee_iter, 1):
        print(followee.get_user_id())
        if shown == 41:
            break

    # Per the original notes this prints a generator object.
    print(follower_iter)
    for shown, follower in enumerate(follower_iter, 1):
        print(follower.get_user_id())
        if shown == 41:
            break

    # The remaining collections are printed as objects, not iterated.
    print(ask_iter)
    print(answer_iter)
    print(collection_iter)
Example #20
0
import os
import Cookie
import browsercookie
import re
import urllib2
import requests,cookielib
import json
import pickle
from zhihu import Question


from zhihu import User

# Fetch the answers of one hard-coded user and call to_txt() and to_md()
# on each of them.
user_url = "http://www.zhihu.com/people/wu.chen"
user = User(user_url)
answers = user.get_answers()

for answer in answers:
    answer.to_txt()
    answer.to_md()

#
#url = "https://www.zhihu.com/question/24269892"
#question = Question(url)
#answers = question.get_all_answers()
#for answer in answers:
#    answer.to_txt()
#    answer.to_md()

def save_obj(obj, name ):
Example #21
0
from sqlalchemy.orm import sessionmaker

if __name__ == "__main__":
    userid = "wonderful-vczh"

    # create db engine
    engine = create_engine("sqlite:///zhihu.db", echo=False)
    dbmodel.Base.metadata.create_all(engine)

    # create a session
    Session = sessionmaker(bind=engine)
    session = Session()
    url_base = "http://www.zhihu.com/people/"
    url = url_base + userid

    zhihu_user = User(url)
    username = zhihu_user.get_user_id().decode("utf8")
    #print username

    db_user = dbmodel.Dbuser(id=userid, name=username)

    # add user
    session.add(db_user)
    session.commit()

    # add answers
    for i, answer in enumerate(zhihu_user.get_answers()):
        print i
        session.add(
            dbmodel.Dbanswer(id=answer.answer_url,
                             upvote=answer.get_upvote(),
Example #22
0
        str=follower.get_work()
        print(str)
        if allWork.has_key(str):
            allWork[str]+=1
        else:
            allWork[str]=1
        print json.dumps(allWork, encoding="UTF-8", ensure_ascii=False)
    #to delete bias
    if 'unknown' in allWork:
        del allWork['unknown']

    top5Cities = dict(sorted(allWork.iteritems(), key=operator.itemgetter(1), reverse=True)[:5])
    return top5Cities

def getTop5Relations(followers):
    """Return the five followers with the highest vote/thank relation value.

    Args:
        followers: iterable of objects providing ``get_user_id()`` and
            ``get_vote_thank_relation()``.

    Returns:
        dict mapping user id to relation value for the (at most) five
        largest values. The running dict is printed after every follower.
    """
    allFollowers = {}
    for follower in followers:
        allFollowers[follower.get_user_id()] = follower.get_vote_thank_relation()
        print(allFollowers)

    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    ranked = sorted(allFollowers.items(), key=operator.itemgetter(1), reverse=True)
    superFriends = dict(ranked[:5])
    return superFriends

if __name__ == '__main__':
    # Example: feed a user's followers to getTop5Works, print the result and
    # pass it to v.plotPie4Top5.
    user_url = "https://www.zhihu.com/people/BravoMaooo"
    user = User(user_url)
    followers = user.get_followers()
    dics=getTop5Works(followers)
    print(dics)
    v.plotPie4Top5(dics)
Example #23
0
def grab(url, threadID):
    """Copy the profile counters of every followee of ``url`` onto ``Users`` objects.

    Args:
        url: value passed to ``User``; also logged at info level.
        threadID: not used in this function.
    """
    logging.info(url)
    root = User(url)
    followees = root.get_followees()

    for user_grab in followees:
        user = Users()
        flag = True
        # Single attempt: reuse the stored document when one with the same
        # data_id exists. On failure, log, wait 300 seconds and move on.
        try:
            if Users.objects(data_id=user_grab.get_data_id()).count():
                user = Users.objects(data_id=user_grab.get_data_id()).first()
        except Exception as e:
            flag = False
            logging.error("========error1")
            logging.error(e)
            time.sleep(300)
        try:
            user.user_id = user_grab.get_user_id()
            user.data_id = user_grab.get_data_id()
            user.followees_num = user_grab.get_followees_num()
            user.followers_num = user_grab.get_followers_num()
            user.asks_num = user_grab.get_asks_num()
            user.answers_num = user_grab.get_answers_num()
            user.collections_num = user_grab.get_collections_num()
            user.agree_num = user_grab.get_agree_num()
            user.thanks_num = user_grab.get_thanks_num()
            user.url = user_grab.get_user_url()
            user.modify_time = datetime.utcnow()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            logging.debug(user_grab.get_user_url())
Example #24
0
def grab(url, threadID):
    """Copy the profile counters of every follower of ``url`` onto ``Users`` objects.

    Args:
        url: value passed to ``User``; also printed.
        threadID: not used in this function.
    """
    print(url)
    root = User(url)
    followers = root.get_followers()

    for user_grab in followers:
        user = Users()
        # Keep retrying the lookup (300 seconds apart) until it succeeds;
        # reuse the stored document when one with the same data_id exists.
        looked_up = False
        while not looked_up:
            try:
                if Users.objects(data_id=user_grab.get_data_id()).count():
                    user = Users.objects(data_id=user_grab.get_data_id()).first()
                looked_up = True
            except Exception as e:
                logging.error("========error1")
                logging.error(e)
                time.sleep(300)
        user.user_id = user_grab.get_user_id()
        user.data_id = user_grab.get_data_id()
        user.followees_num = user_grab.get_followees_num()
        user.followers_num = user_grab.get_followers_num()
        try:
            user.asks_num = user_grab.get_asks_num()
            user.answers_num = user_grab.get_answers_num()
            user.collections_num = user_grab.get_collections_num()
        except Exception as e:
            logging.error("========error2")
            logging.error(e)
            logging.debug(user_grab.get_user_url())
Example #25
0
def user_test(user_url):
    """Call the ``User`` accessors for one profile and print what they return.

    Args:
        user_url: value passed to ``User``.

    Every accessor is called before anything is printed. At most 41 followees
    and 41 followers are printed.
    """
    profile = User(user_url)

    # Scalar profile fields: id, gender, follower/followee counts, number of
    # questions, answers and collections, agree and thanks counts, avatar URL.
    name = profile.get_user_id()
    gender = profile.get_gender()
    follower_total = profile.get_followers_num()
    followee_total = profile.get_followees_num()
    ask_total = profile.get_asks_num()
    answer_total = profile.get_answers_num()
    collection_total = profile.get_collections_num()
    agree_total = profile.get_agree_num()
    thanks_total = profile.get_thanks_num()
    avatar_url = profile.get_head_img_url()

    # Related objects: followees, followers, topics, questions asked,
    # answers given and collections.
    followee_iter = profile.get_followees()
    follower_iter = profile.get_followers()
    topic_iter = profile.get_topics()
    ask_iter = profile.get_asks()
    answer_iter = profile.get_answers()
    collection_iter = profile.get_collections()

    scalars = (name, gender, follower_total, followee_total, ask_total,
               answer_total, collection_total, agree_total, thanks_total,
               avatar_url)
    for value in scalars:
        print(value)

    # Per the original notes this prints a generator object.
    print(followee_iter)
    for shown, followee in enumerate(followee_iter, 1):
        print(followee.get_user_id())
        if shown == 41:
            break

    # Per the original notes this prints a generator object.
    print(follower_iter)
    for shown, follower in enumerate(follower_iter, 1):
        print(follower.get_user_id())
        if shown == 41:
            break

    for topic in topic_iter:
        print(topic)

    # The remaining collections are printed as objects, not iterated.
    print(ask_iter)
    print(answer_iter)
    print(collection_iter)
Example #26
0
followees_num =user.get_followees_num()
asks_num = user.get_asks_num()
answers_num = user.get_answers_num()
collections_num = user.get_collections_num()
agree_num = user.get_agree_num()
thanks_num = user.get_thanks_num()
followees = user.get_followees()
followers = user.get_followers()
asks = user.get_asks()
answers = user.get_answers()
collections = user.get_collections()
"""
recorder=dict();
userque=deque();
for suser in start_url:
        tuser=User(suser);
        userque.append( tuser );
        recorder[suser[28:]]=tuser.get_user_id();
total=len(userque);
num=total;
flag=False;
#DirectedGraph="{"
while num>=1:
	num-=1;
	user=userque.popleft()
	print user.get_user_id()
	followees_num =user.get_followees_num()
	followees = user.get_followees()
	for i in range(1,followees_num+1):
		cuser=followees.next()
		# when total<=MAXTOT,then cuser will be in userque.
Example #27
0

def insert_data(user):
    """Pass each of the user's topics to ``ml.lock(increment, ...)`` and print it.

    Sleeps for a random time of up to 10 seconds before fetching the topics.

    Args:
        user: object providing ``get_topics()``.
    """
    delay = random.random() * 10
    time.sleep(delay)
    for topic in user.get_topics():
        ml.lock(increment, topic)
        ml.unlock()
        print(topic)


if __name__ == '__main__':
    #main()
    #user_url = 'https://www.zhihu.com/people/excited-vczh'
    user_url = 'https://www.zhihu.com/people/li-tao-40-73'
    user = User(user_url, u'李涛')
    #user = User(user_url, u'vczh')
    #print user.get_topics_num()
    #for i in user.get_topics():
    #    print i.encode('utf-8')

    #'''
    followees = user.get_followees()
    count = 0
    topics = user.get_topics()
    for t in topics:
        if t not in topics_map: topics_map[t] = 1
        else: topics_map[t] += 1
        print t
    trs = []
    for i in followees:
Example #28
0
def main():
    """Crawl Zhihu followees breadth-first and save their avatars by gender.

    Starting from one hard-coded profile, every followee returned by
    ``get_followees_with_condition(least_follower)`` is queued for a later
    visit and its avatar is downloaded into ``pic_female/``, ``pic_male/``
    or ``pic_emale/`` (gender codes 0, 1 and 2) under ``sys.path[0]``.
    Files are named ``<n>_<user id>_<follower count><ext>`` where ``n`` is
    the per-gender running number.  A profile URL is queued and downloaded
    only the first time it is encountered.

    The crawl stops when the queue runs dry or more than ``limit_count``
    pictures have been saved.  A failure on one followee or one profile is
    reported and skipped; KeyboardInterrupt still stops the crawl.

    Raises:
        OSError: if an output directory cannot be created.
    """
    initial_user_url = "http://www.zhihu.com/people/BigMing"

    url_queue = Queue.Queue()
    url_queue.put(initial_user_url)
    # Profiles that were already queued; without this the same users are
    # re-queued and re-downloaded every time they show up as a followee.
    seen_urls = set([initial_user_url])

    # Gender code -> output directory.
    save_dirs = {
        0: sys.path[0] + "/pic_female/",
        1: sys.path[0] + "/pic_male/",
        2: sys.path[0] + "/pic_emale/",
    }
    for directory in save_dirs.values():
        if not os.path.isdir(directory):
            os.makedirs(directory)
    # Gender code -> number of pictures written successfully.
    saved_counts = {0: 0, 1: 0, 2: 0}

    visited_url_count = 0   # distinct followee profiles queued
    tried_url_count = 0     # every followee encountered, duplicates included
    IO_error_count = 0

    limit_count = 1000000000
    count = 0
    least_follower = 1000
    flag = True

    while flag and url_queue.qsize() > 0:
        current_url = url_queue.get()

        try:
            user = User(current_url)
            # Same text the original emitted with chained ``print x,``
            # statements (single spaces joined the pieces).
            print("%s      queue_size:  %s      Saved_size:  %s" % (
                current_url, url_queue.qsize(),
                saved_counts[1] + saved_counts[0]))
            followees = user.get_followees_with_condition(least_follower)

            for followee in followees:
                tried_url_count += 1
                print("tried_url_count: " + str(tried_url_count))

                if followee.user_url in seen_urls:
                    continue
                seen_urls.add(followee.user_url)

                visited_url_count += 1
                print("visited_url_count: " + str(visited_url_count))

                url_queue.put(followee.user_url)

                try:
                    # Code 3 means the listing did not carry a usable gender,
                    # so ask the profile itself -- once.
                    gender = followee.user_gender
                    if gender == 3:
                        gender = followee.get_user_gender()
                    if gender not in save_dirs:
                        # No target directory: skip instead of reusing the
                        # path computed for a previous followee.
                        continue

                    res = urllib2.urlopen(followee.user_pic_url, timeout=10)
                    try:
                        pic = res.read()
                    finally:
                        res.close()
                    pextention = os.path.splitext(followee.user_pic_url)

                    if platform.system() == 'Windows':
                        pname = followee.user_id.decode(
                            'utf-8', 'ignore').encode('gbk', 'ignore')
                    else:
                        pname = followee.user_id

                    followee_count = followee.user_followers_num

                    p_full_path = (save_dirs[gender] +
                                   str(saved_counts[gender] + 1) + "_" +
                                   pname + "_" + str(followee_count) +
                                   pextention[1])

                    with open(p_full_path, "wb") as picture_file:
                        picture_file.write(pic)

                    # Count only pictures that actually reached the disk.
                    saved_counts[gender] += 1
                    count += 1
                    print("female: " + str(saved_counts[0]) + "  " +
                          "male: " + str(saved_counts[1]) + "  " +
                          "emale: " + str(saved_counts[2]))
                    if count > limit_count:
                        flag = False
                        break
                except Exception as error:
                    IO_error_count += 1
                    print("IO error")
                    print(repr(error))
            print(" ")
        except Exception as error:
            print("why????????????????????")
            print(repr(error))
Example #29
0
    user_collections = user.get_collections()
    for collection in user_collections:
        # 输出每一个收藏夹的名字
        print collection.get_name()
        # 得到该收藏夹下的前十个回答
        top_answers = collection.get_top_i_answers(10)
        # 把答案内容转成txt,markdown
        for answer in top_answers:
            answer.to_txt()
            answer.to_md()


def main():
    """Run every demo helper once, each against a fixed sample Zhihu URL.

    Order: question, answer, user, collection, then the argument-less
    ``test()``.
    """
    question_test("http://www.zhihu.com/question/24269892")
    answer_test("http://www.zhihu.com/question/24269892/answer/29960616")
    user_test("http://www.zhihu.com/people/jixin")
    collection_test("http://www.zhihu.com/collection/36750683")
    test()


# Script entry point: print the id of every follower of one hard-coded user.
# Note that this guard does not call main().
if __name__ == '__main__':
    user_url = "https://www.zhihu.com/people/BravoMaooo"
    user = User(user_url)
    followers = user.get_followers()
    # Iterates the full result of get_followers(); there is no cap here.
    for follower in followers:
        print(follower.get_user_id())
Example #30
0
# -*- coding: utf-8 -*-
from zhihu import User

# Demo: query every statistic and listing the User wrapper exposes for one
# hard-coded profile.
user_url = "http://www.zhihu.com/people/jixin"
user = User(user_url)
# Get the user's ID
user_id = user.get_user_id()
# Get the number of followers of this user
followers_num = user.get_followers_num()
# Get the number of users this user follows
followees_num =user.get_followees_num()
# Get the number of questions this user has asked
asks_num = user.get_asks_num()
# Get the number of answers this user has written
answers_num = user.get_answers_num()
# Get the number of collections this user has
collections_num = user.get_collections_num()
# Get the number of upvotes this user has received
agree_num = user.get_agree_num()
# Get the number of thanks this user has received
thanks_num = user.get_thanks_num()

# Get the users this user follows
followees = user.get_followees()
# Get the users who follow this user
followers = user.get_followers()
# Get the questions this user has asked
asks = user.get_asks()
# Get the answers this user has written
answers = user.get_answers()
# Get this user's collections
Example #31
0
def user_test(user_url):
    """Smoke-test the ``User`` wrapper against one profile URL.

    Every scalar statistic and every listing is requested first; only then
    is anything printed: the scalars, the followee and follower listings
    (the listing object itself, then at most 41 user ids each), and finally
    the ask / answer / collection listing objects.

    Args:
        user_url: profile URL handed straight to ``User``.
    """
    profile = User(user_url)

    # Scalar statistics, requested in the order they are printed below.
    # The original notes recorded sample values for the "jixin" profile:
    # 614840 followers, 8408 followees, 1323 questions, 786 answers,
    # 44 collections, 46387 upvotes and 11477 thanks.
    stats = [
        profile.get_user_id(),          # user ID
        profile.get_followers_num(),    # number of followers
        profile.get_followees_num(),    # number of users followed
        profile.get_asks_num(),         # number of questions asked
        profile.get_answers_num(),      # number of answers written
        profile.get_collections_num(),  # number of collections
        profile.get_agree_num(),        # upvotes received
        profile.get_thanks_num(),       # thanks received
    ]

    # Listings.  According to the original notes each of these prints as a
    # generator object.
    following = profile.get_followees()     # users this user follows
    fans = profile.get_followers()          # users following this user
    questions = profile.get_asks()          # questions this user asked
    replies = profile.get_answers()         # answers this user wrote
    favourites = profile.get_collections()  # this user's collections

    for value in stats:
        print(value)

    print(following)
    # Stop after the 41st followee.
    for position, person in enumerate(following, 1):
        print(person.get_user_id())
        if position == 41:
            break

    print(fans)
    # Stop after the 41st follower.
    for position, person in enumerate(fans, 1):
        print(person.get_user_id())
        if position == 41:
            break

    print(questions)
    print(replies)
    print(favourites)
Example #32
0
def main():
    """Crawl Zhihu followees breadth-first and save their avatars by gender.

    Starting from one hard-coded profile, every followee returned by
    ``get_followees_with_condition(least_follower)`` is queued for a later
    visit and its avatar is downloaded into ``pic_female/``, ``pic_male/``
    or ``pic_emale/`` (gender codes 0, 1 and 2) under ``sys.path[0]``.
    Files are named ``<n>_<user id>_<follower count><ext>`` where ``n`` is
    the per-gender running number.  A profile URL is queued and downloaded
    only the first time it is encountered.

    The crawl stops when the queue runs dry or more than ``limit_count``
    pictures have been saved.  A failure on one followee or one profile is
    reported and skipped; KeyboardInterrupt still stops the crawl.

    Raises:
        OSError: if an output directory cannot be created.
    """
    initial_user_url = "http://www.zhihu.com/people/BigMing"

    url_queue = Queue.Queue()
    url_queue.put(initial_user_url)
    # Profiles that were already queued; without this the same users are
    # re-queued and re-downloaded every time they show up as a followee.
    seen_urls = set([initial_user_url])

    # Gender code -> output directory.
    save_dirs = {
        0: sys.path[0] + "/pic_female/",
        1: sys.path[0] + "/pic_male/",
        2: sys.path[0] + "/pic_emale/",
    }
    for directory in save_dirs.values():
        if not os.path.isdir(directory):
            os.makedirs(directory)
    # Gender code -> number of pictures written successfully.
    saved_counts = {0: 0, 1: 0, 2: 0}

    visited_url_count = 0   # distinct followee profiles queued
    tried_url_count = 0     # every followee encountered, duplicates included
    IO_error_count = 0

    limit_count = 1000000000
    count = 0
    least_follower = 1000
    flag = True

    while flag and url_queue.qsize() > 0:
        current_url = url_queue.get()

        try:
            user = User(current_url)
            # Same text the original emitted with chained ``print x,``
            # statements (single spaces joined the pieces).
            print("%s      queue_size:  %s      Saved_size:  %s" % (
                current_url, url_queue.qsize(),
                saved_counts[1] + saved_counts[0]))
            followees = user.get_followees_with_condition(least_follower)

            for followee in followees:
                tried_url_count += 1
                print("tried_url_count: " + str(tried_url_count))

                if followee.user_url in seen_urls:
                    continue
                seen_urls.add(followee.user_url)

                visited_url_count += 1
                print("visited_url_count: " + str(visited_url_count))

                url_queue.put(followee.user_url)

                try:
                    # Code 3 means the listing did not carry a usable gender,
                    # so ask the profile itself -- once.
                    gender = followee.user_gender
                    if gender == 3:
                        gender = followee.get_user_gender()
                    if gender not in save_dirs:
                        # No target directory: skip instead of reusing the
                        # path computed for a previous followee.
                        continue

                    res = urllib2.urlopen(followee.user_pic_url, timeout=10)
                    try:
                        pic = res.read()
                    finally:
                        res.close()
                    pextention = os.path.splitext(followee.user_pic_url)

                    if platform.system() == 'Windows':
                        pname = followee.user_id.decode(
                            'utf-8', 'ignore').encode('gbk', 'ignore')
                    else:
                        pname = followee.user_id

                    followee_count = followee.user_followers_num

                    p_full_path = (save_dirs[gender] +
                                   str(saved_counts[gender] + 1) + "_" +
                                   pname + "_" + str(followee_count) +
                                   pextention[1])

                    with open(p_full_path, "wb") as picture_file:
                        picture_file.write(pic)

                    # Count only pictures that actually reached the disk.
                    saved_counts[gender] += 1
                    count += 1
                    print("female: " + str(saved_counts[0]) + "  " +
                          "male: " + str(saved_counts[1]) + "  " +
                          "emale: " + str(saved_counts[2]))
                    if count > limit_count:
                        flag = False
                        break
                except Exception as error:
                    IO_error_count += 1
                    print("IO error")
                    print(repr(error))
            print(" ")
        except Exception as error:
            print("why????????????????????")
            print(repr(error))