def user_test(user_url): user = User(user_url) user_id = user.get_user_id() answers = user.get_answers() print answers for answer in answers: print answer.get_question().get_question_id()
def main():
    """Fetch the update feed for one hard-coded zhihu account."""
    name = 'xxx'
    type = 'people'
    # search = Search()
    # search.get_person_detail(type, name)
    userid = 'dong-yuan-18'
    account = User(userid)
    account.get_user_update()
def main():
    """Entry point: pull the latest updates for a fixed zhihu user."""
    name = 'xxx'
    type = 'people'
    # An earlier version went through a Search lookup instead:
    #   search = Search()
    #   search.get_person_detail(type, name)
    userid = 'dong-yuan-18'
    User(userid).get_user_update()
# Crawl one user's zhihu activity feed, resuming from the newest activity
# already stored in user_info['activity']; the finished record is handed to
# pass_to_writer(). ConnectionError retries by recursing into crawler(); any
# other exception is recorded in user_info['error'] and the user is skipped.
# NOTE(review): the original line breaks were lost when this file was
# collapsed onto one line; code kept byte-identical below.
def crawler(user_info): url = "http://www.zhihu.com/people/" + user_info['id'] try: user = User(url) proxy = 'None' #proxy = proxy_apply() user_agent = random.choice(user_agent_pool) start_time = "beginning" if len(user_info['activity']) != 0: start_time = user_info['activity'][len(user_info['activity']) - 1] print( termcolor.colored("start crawling ", "green") + termcolor.colored(url, "blue") + termcolor.colored( "\nproxy:" + proxy + ' start from ' + start_time + '\n', "green")) #爬取知乎用户动态 for activity in user.get_activities(proxy, user_agent, start_time): user_info['activity'].append(activity) user_info['error'] = '' #print(url + "finished crawling") pass_to_writer(user_info) #proxy_recycle(proxy) amount_of_finished_users.value += 1 print( termcolor.colored( url + ' finished ' + str(amount_of_finished_users.value) + "/" + str(amount_of_users.value) + "\n", "green")) except ConnectionError as e: print(termcolor.colored("Warning: " + str(e), "yellow")) print("Reconnecting for " + termcolor.colored(url, "blue") + "\n") crawler(user_info) except Exception as e: user_info['error'] = str(e) pass_to_writer(user_info) traceback.print_exc() print(termcolor.colored("Error: " + str(e), "red")) print("skip " + termcolor.colored(url, "blue")) print('\n')
def __init__(self, user_uuid, layer):
    """Build a user node from its zhihu uuid.

    Args:
        user_uuid: unique id of the user (the path segment of the profile url).
        layer: number of hops from the seed user to this user.
    """
    user = User(prefix_people + user_uuid)
    self.user = user
    self.uuid = user_uuid
    self.user_id = user.get_user_id()
    # Only expand followees within 3 hops of the seed; strip the profile-url
    # prefix so followees are stored as bare uuids.
    self.followees = map(lambda x: x.user_url.replace(prefix_people, ""), user.get_followees()) if layer < 3 else []
    # self.followers = map(lambda x: x.user_url.replace(prefix_people, ""), user.get_followers())
    self.answer_num = user.get_answers_num()
    self.following_num = user.get_followees_num()
    self.follower_num = user.get_followers_num()
    self.upvote_num = user.get_agree_num()
    self.thank_num = user.get_thanks_num()
    self.layer = layer
def crawl_id():
    """Map each input user's 'index' to their zhihu display id.

    Reads one dict literal per line from data/available_users, resolves each
    user's display id from their profile page, and writes the resulting
    {index: display_id} dict to data/user_name.

    Fixes vs. original:
    - The bare ``except:`` handler did ``z_user[user['index']]`` — indexing a
      User object, which itself raises and aborts the whole crawl; it now
      catches Exception narrowly and reports the error instead.
    - ``eval`` on file contents replaced by ``ast.literal_eval``, which only
      accepts literals and cannot execute arbitrary code from the data file.
    """
    import ast  # local import: keeps this snippet self-contained
    input_path = 'data/available_users'
    output_path = 'data/user_name'
    z_id = {}
    zhihu_url = 'http://www.zhihu.com/people/'
    with open(input_path, 'r') as users:
        for user in users:
            user = ast.literal_eval(user)
            z_user = User(zhihu_url + user['id'])
            try:
                z_id[user['index']] = z_user.get_user_id()
                print(user['index'])
            except Exception as e:
                # Log and continue with the next user rather than dying here.
                print(user['index'], e)
    with open(output_path, 'w') as out:
        out.write(str(z_id))
# Duplicate of the crawler above with different wrapping: crawl one user's
# activity feed resuming from the last recorded activity, pass the record to
# pass_to_writer(), recurse to retry on ConnectionError, and record+skip on
# any other exception.
# NOTE(review): original line breaks were lost; code kept byte-identical.
def crawler(user_info): url = "http://www.zhihu.com/people/" + user_info['id'] try: user = User(url) proxy = 'None' #proxy = proxy_apply() user_agent = random.choice(user_agent_pool) start_time = "beginning" if len(user_info['activity']) != 0: start_time = user_info['activity'][len(user_info['activity']) - 1] print(termcolor.colored("start crawling ", "green") + termcolor.colored(url, "blue") + termcolor.colored("\nproxy:" + proxy + ' start from ' + start_time + '\n', "green")) #爬取知乎用户动态 for activity in user.get_activities(proxy, user_agent, start_time): user_info['activity'].append(activity) user_info['error'] = '' #print(url + "finished crawling") pass_to_writer(user_info) #proxy_recycle(proxy) amount_of_finished_users.value += 1 print(termcolor.colored(url + ' finished ' + str(amount_of_finished_users.value) + "/" + str(amount_of_users.value) + "\n", "green")) except ConnectionError as e: print(termcolor.colored("Warning: " + str(e), "yellow")) print("Reconnecting for " + termcolor.colored(url, "blue") + "\n") crawler(user_info) except Exception as e: user_info['error'] = str(e) pass_to_writer(user_info) traceback.print_exc() print(termcolor.colored("Error: " + str(e), "red")) print("skip " + termcolor.colored(url, "blue")) print('\n')
# Read user names from susp/list2.txt; starting from a fixed name, dump each
# user's (question_id, answer_id) pairs to a per-user file in susp/ans2/.
# A user is cut off after 15 consecutive answers whose qid compares <=
# "30000000" as a STRING — lexicographic, so this presumably assumes ids of
# equal width; verify before changing. NOTE(review): original line breaks were
# lost (comment extents are ambiguous), so code is kept byte-identical below.
def main(): viplist = [] f1=open("susp/list2.txt") # print f1.name msg = f1.readline() while msg : # msg = f1.readline(); print msg name = msg[0:len(msg)-1] viplist.append(name) msg=f1.readline() f1.close() offset = viplist.index("zhang-xiao-chuan-16", ) for user_name in viplist[offset:]: user_index = viplist.index(user_name, ) user_url = "http://www.zhihu.com/people/"+user_name user = User(user_url) print user_url print user_name count=0 stopcount=0 answers = user.get_answers() f2=open("susp/ans2/"+str(user_index)+"-"+user_name+".txt","w") for answer in answers: qid=answer.get_question_id() aid=answer.get_answer_id() if qid>"30000000": stopcount=0 count=count+1 print count print qid+" "+aid f2.write(qid+" "+aid+"\n") else: stopcount = stopcount+1 if stopcount>15: break f2.close() print "finish "+user_name
def __init__(self, user_uuid, layer):
    """Build a user node from its zhihu uuid.

    Args:
        user_uuid: the unique id of the user
        layer: the number of hops to reach to this user from the seed user
    """
    user = User(prefix_people + user_uuid)
    self.user = user
    self.uuid = user_uuid
    self.user_id = user.get_user_id()
    # Only expand followees within 3 hops of the seed; strip both profile-url
    # prefixes (with and without the scheme) down to bare uuids.
    self.followees = map(
        lambda x: x.user_url.replace(prefix_people, "").replace(
            prefix_people_http, ""), user.get_followees()) if layer < 3 else []
    self.answer_num = user.get_answers_num()
    self.following_num = user.get_followees_num()
    self.follower_num = user.get_followers_num()
    self.upvote_num = user.get_agree_num()
    self.thank_num = user.get_thanks_num()
    self.layer = layer
def __init__(self, user_uuid, layer):
    """Build a user node from its zhihu uuid.

    Args:
        user_uuid: the unique id of the user
        layer: the number of hops to reach to this user from the seed user
    """
    user = User(prefix_people + user_uuid)
    self.user = user
    self.uuid = user_uuid
    self.user_id = user.get_user_id()
    # Only expand followees within 3 hops of the seed; strip both profile-url
    # prefixes (with and without the scheme) down to bare uuids.
    self.followees = map(lambda x: x.user_url.replace(prefix_people, "").replace(prefix_people_http, ""), user.get_followees()) if layer < 3 else []
    self.answer_num = user.get_answers_num()
    self.following_num = user.get_followees_num()
    self.follower_num = user.get_followers_num()
    self.upvote_num = user.get_agree_num()
    self.thank_num = user.get_thanks_num()
    self.layer = layer
# Python 3 demo script for a zhihu client: prints the session cookies, logs in
# through Account, then constructs an Answer object for one answer url. The
# commented-out blocks are worked examples (profile lookup, private message,
# follow) each marked 成功 ("works"). NOTE(review): original line breaks were
# lost and the comment extents are ambiguous, so code is kept byte-identical.
# Python 3.6.1 import requests.utils import pickle from http.cookies import SimpleCookie from zhihu import User from zhihu import Answer from zhihu import Account zhihu = User() print(zhihu.cookies) # 用户登录 account = Account() result = account.login() print(result) # 查看用户profile 成功 # profile = zhihu.profile(user_slug="xiaoxiaodouzi") # print(profile) # 发送私信 成功 # response = zhihu.send_message(content='TESTMESSAGE', user_slug="xiaoxiaodouzi") # print(response) # 关注用户 成功 # response = zhihu.follow(user_slug='SemitLee') # print(response) answer = Answer(url="https://www.zhihu.com/question/34401174/answer/389502954")
# Build a (user, question) graph for Gephi-style export: read profile urls
# from users_example.txt, collect answer/ask relationships, keep questions
# mentioned more than once, then emit node/edge dicts via write_file().
# WARNING(review): several string literals were redacted to "******" in this
# source, which also breaks the syntax (e.g. `"..."******"..."`); the code is
# kept byte-identical rather than guessed at — restore from the upstream repo.
def main(): # read wanted user url from users.txt lines = [line.rstrip('\n') for line in open("users_example.txt")] # get (users) users = [User(user_url) for user_url in lines] user_ids = [user.get_user_id() for user in users] for user_id in user_ids: print "user node: " + user_id # get (user)-[follow]->(user) relationships # here I use followers, since followers are usually fewer than followees # following = [] # for user in users: # print "processing followers of user: "******"follow relationship: " + src + " follows " + dst # get (user)-[answer]->(question) relationships answers = [] answerings = [] questions = defaultdict(int) for user in users: print "processing answers of user: "******"number of answers: " + str(len(answers)) print "number of answerings: " + str(len(answerings)) # get (user)-[ask]->(question) relationships asking = [] for user in users: print "processing questions of user: "******"number of asking: " + str(len(asking)) # filter by intervel # a question node should have at least two relaionships( answering or asking) questions = {k: v for (k, v) in questions.iteritems() if v > 1} for question, num in questions.iteritems(): print "question: " + question + " is mentioned " + str(num) + " times." 
# Continuation of main(): assign integer ids to users and surviving questions,
# size user nodes by log(follower count), then build ask/answer edge dicts and
# hand everything to write_file().
# prepare data for writing id_map = {} index = 0 usersOut = [] for user in users: id_map[user.get_user_id()] = index usero = {} usero['id'] = index usero['Year'] = index usero['cYear'] = index usero['Type'] = 'User' usero['label'] = user.get_user_id() follower_num = user.get_followers_num() usero['follower_num'] = follower_num if follower_num < 1: usero['size'] = 1 else: usero['size'] = math.ceil(math.log(follower_num)) usersOut.append(usero) index += 1 questionOut = [] for question, v in questions.iteritems(): id_map[question] = index questiono = {} questiono['id'] = index questiono['Year'] = randint(1, index) questiono['cYear'] = questiono['Year'] questiono['Type'] = 'Question' questiono['label'] = question questionOut.append(questiono) index += 1 askOut = [] for ask in asking: if ask[1] in questions: asko = {} asko['Edge Id'] = str(index) asko['target'] = id_map[ask[1]] asko['source'] = id_map[ask[0]] asko['Year'] = id_map[ask[1]] askOut.append(asko) index += 1 answerOut = [] for answering in answerings: title = answering[1].get_question().get_title() if title in questions: answero = {} answero['Edge Id'] = str(index) answero['target'] = id_map[title] answero['source'] = id_map[answering[0]] answero['Year'] = id_map[title] answerOut.append(answero) index += 1 write_file(usersOut, questionOut, askOut, answerOut)
def test(): lines = [line.rstrip('\n') for line in open("users_example.txt")] for line in lines: u = User(line) print u.get_user_id()
# Crawl one user and then all of that user's followers into the MySQL table
# wjw_zhihu.user_info, using Redis keys as a "seen" set to avoid re-crawling;
# Redis is persisted and the loop sleeps every 10 followers. Insert failures
# are logged and skipped. The MySQL user is redacted ('******') in this
# source. NOTE(review): original line breaks were lost; kept byte-identical.
def user_spider(user_url): database_name = 'wjw_zhihu' table_name = 'user_info' # 设置数据库连接 conn=pymysql.connect(host='localhost',user='******',passwd='root',port=3306) cur=conn.cursor() # 选择数据库 conn.select_db(database_name) # 设置编码, 否则插入数据库乱码 cur.execute('set names utf8') # 设置Redis链接, 记录爬过的user_unique redis_conn = redis.Redis(host='127.0.0.1', port=6379, db=0) # 获取当前用户信息 user = User(user_url) user_unique = user.get_user_unique() if redis_conn.get(get_user_redis_key(user_unique)) == None: user_info = user.get_user_info() # print user_info; # sys.exit() # 将用户数据插入数据库 try: insert_sql = prepare_insert_sql(table_name, user_info) res=cur.execute(insert_sql) conn.commit() # commit之后才能真正提交到数据库 redis_conn.set(get_user_redis_key(user_unique), 1) #设置redis缓存, 防止重爬 print(user_info['user_unique'] + ' ------ ' + str(res)) except Exception as e: # 打印日志, 记录异常信息 exceptMsg = str(e) print(exceptMsg) # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 统计该用户关注的人 # i = 0 # for followee in followees: # print followee.user_url # print followee.get_user_id() # i = i + 1 # if i == 41: # break # print followers # <generator object get_follower at 0x7ffcac3af0f0> # 代表所有关注该用户的人的生成器对象 i = 0 for follower in followers: i = i + 1 if i % 10 == 0: redis_conn.save() # 将数据写回磁盘。保存时阻塞 time.sleep(0.3) follower_user_unique = follower.get_user_unique() if redis_conn.get(get_user_redis_key(follower_user_unique)) == None: try: follower_info = follower.get_user_info() follower_insert_sql = prepare_insert_sql(table_name, follower_info) res=cur.execute(follower_insert_sql) conn.commit() redis_conn.set(get_user_redis_key(follower_user_unique), 1) #设置redis缓存, 防止重爬 print(follower_info['user_unique'] + ' ------ ' + str(res)) except Exception as e: # 打印日志, 记录异常信息 exceptMsg = str(e) print(exceptMsg)
# API walkthrough: fetch every scalar stat of one user (id, gender, counts,
# avatar url), then the generators for followees/followers/topics/asks/
# answers/collections, printing each; only the first 41 followees/followers
# are consumed. Inline Chinese comments describe each getter; trailing
# comments on the prints are example values for user jixin. NOTE(review):
# original line breaks were lost; code kept byte-identical below.
def user_test(user_url): user = User(user_url) # 获取用户ID user_id = user.get_user_id() # 获取用户性别 user_gender = user.get_gender() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 followees_num = user.get_followees_num() # 获取该用户提问的个数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户的头像url head_img_url = user.get_head_img_url() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() topics = user.get_topics() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 answers = user.get_answers() # 获取该用户的收藏夹 collections = user.get_collections() print user_id # 黄继新 print user_gender #male print followers_num # 614840 print followees_num # 8408 print asks_num # 1323 print answers_num # 786 print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 print head_img_url # https://pic2.zhimg.com/0626f4164009f291b26a79d96c6962c5_l.jpg print followees # <generator object get_followee at 0x7ffcac3af050> # 代表所有该用户关注的人的生成器对象 i = 0 for followee in followees: print followee.get_user_id() i = i + 1 if i == 41: break print followers # <generator object get_follower at 0x7ffcac3af0f0> # 代表所有关注该用户的人的生成器对象 i = 0 for follower in followers: print follower.get_user_id() i = i + 1 if i == 41: break for topic in topics: print topic print asks # <generator object get_ask at 0x7ffcab9db780> # 代表该用户提的所有问题的生成器对象 print answers # <generator object get_answer at 0x7ffcab9db7d0> # 代表该用户回答的所有问题的答案的生成器对象 print collections
# MongoDB-driven breadth-first crawl: collection `urllist` holds profile urls
# with a state flag jlzt ("1" pending, "3" in progress, "2" done). Seeded from
# five hard-coded users, the loop claims a pending url, scrapes the full
# profile plus followee/follower/question/answer/topic lists into `userlist`,
# and enqueues every follower url. On any exception it sleeps 10s and resets
# the flag to pending so the url is retried. NOTE(review): original line
# breaks were lost (one statement is even split across these two source
# lines); code kept byte-identical below.
def main(): client = pymongo.MongoClient("localhost", 27017) db = client.zhihu_user urllist = db.urllist userlist = db.userlist origin_users = ["https://www.zhihu.com/people/jixin", "https://www.zhihu.com/people/zhang-jia-wei", "https://www.zhihu.com/people/zhu-xuan-86", "https://www.zhihu.com/people/kaifulee", "https://www.zhihu.com/people/e-miao-de-nai-ba"] urls = urllist.distinct("user_url") for u in origin_users: if u in urls: pass else: urllist.insert({"user_url": u, "jlzt": "1"}) while 1: item = urllist.find_one({'jlzt':'1'}) if item == None: print u'已全部处理完成' break else: user_url = item["user_url"] starttime = datetime.datetime.now() urllist.update({"user_url":user_url},{"$set":{"jlzt":"3"}}) try: user = User(user_url) zhihu_id = user.get_data_id() # 用户唯一id zhihu_name = user.get_user_id() # 用户名 followees_num = user.get_followees_num() # 用户关注人数 followers_num = user.get_followers_num() # 用户关注者人数 gender = user.get_gender() # 性别 # 提问数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户的头像url head_img_url = user.get_head_img_url() # 关注的话题数 topics_num = user.get_topics_num() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 获取提出的问题 questions = user.get_asks() # 获取回答的问题 answers = user.get_answers() # 获取话题 topics = user.get_topics() print "start process " + zhihu_name + ";\n" #print zhihu_id # 黄继新 #print zhihu_name # 614840 #print followees_num # 8408 #print followers_num # 1323 #print gender # 786 #print asks_num # 44 #print answers_num # 46387 #print collections_num # 11477 #print agree_num #print thanks_num #print head_img_url #print topics_num followee_l = [] follower_l = [] questions_l = [] answers_l = [] topics_l = [] print u'开始处理关注的人' for followee in followees: followee_l.append(followee.user_url.split('/')[4]) if len(followee_l) % 
100 == 0: print zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait..." time.sleep(0.05) followee_list = ','.join(followee_l) print u'添加完成' print u'开始添加关注者至处理队列' for follower in followers: follower_l.append(follower.user_url.split('/')[4]) urls = urllist.distinct("user_url") if follower.user_url in urls: pass # print "follower_url:'"+follower.user_url+"' passed" else: urllist.insert({"user_url": follower.user_url, "jlzt": "1"}) #print "follower_url:'" +follower.user_url + "' added" if len(follower_l) % 100 == 0: print zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait..." time.sleep(0.05) print u'添加完成' for q in questions: questions_l.append("url=" + q.url + "|title=" + q.get_title()) time.sleep(0.01) for a in answers: answers_l.append(a.answer_url) time.sleep(0.01) for t in topics: topics_l.append(t) time.sleep(0.01) user_data = {"zhihu_id":zhihu_id, "zhihu_name":zhihu_name, "followees_num":followees_num, "followers_num":followers_num, "followees":followee_l, "followers":follower_l, "questions":questions_l, "gender":gender, "asks_num":asks_num, "answers_num":answers_num, "ansers":answers_l, "collections_num":collections_num, "agree_num":agree_num, "thanks_num":thanks_num, "topics_num":topics_num, "topics":topics_l, "head_img_url":head_img_url } print "user_data prepared:" urls = userlist.distinct("user_url") if user_url in urls: pass else: userlist.insert(user_data) print "user_data inserted: \n" urllist.update({"user_url":user_url},{"$set":{"jlzt":"2"}}) endtime = datetime.datetime.now() interval=(endtime - starttime).seconds print zhihu_name + "finnished. spent " + str(interval) + "seconds." except: traceback.print_exc() time.sleep(10) urllist.update({"user_url":user_url},{"$set":{"jlzt":"1"}}) continue print "处理完毕"
# API walkthrough (variant without gender-image/topics of the one above):
# fetch one user's scalar stats, then the followee/follower/ask/answer/
# collection generators, printing each and consuming only the first 41
# followees/followers. Chinese inline comments describe each getter; the
# trailing comments are example values. NOTE(review): original line breaks
# were lost; code kept byte-identical below.
def user_test(user_url): user = User(user_url) # 获取用户ID user_id = user.get_user_id() # 获取用户性别 user_gender = user.get_gender() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 followees_num = user.get_followees_num() # 获取该用户提问的个数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 answers = user.get_answers() # 获取该用户的收藏夹 collections = user.get_collections() print user_id # 黄继新 print user_gender #male print followers_num # 614840 print followees_num # 8408 print asks_num # 1323 print answers_num # 786 print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 print followees # <generator object get_followee at 0x7ffcac3af050> # 代表所有该用户关注的人的生成器对象 i = 0 for followee in followees: print followee.get_user_id() i = i + 1 if i == 41: break print followers # <generator object get_follower at 0x7ffcac3af0f0> # 代表所有关注该用户的人的生成器对象 i = 0 for follower in followers: print follower.get_user_id() i = i + 1 if i == 41: break print asks # <generator object get_ask at 0x7ffcab9db780> # 代表该用户提的所有问题的生成器对象 print answers # <generator object get_answer at 0x7ffcab9db7d0> # 代表该用户回答的所有问题的答案的生成器对象 print collections
# Script: export every answer of one user to both txt and markdown via the
# zhihu library; the commented block does the same for all answers of a single
# question instead. The trailing `def save_obj(...)` is cut off in this chunk,
# so it is left untouched. NOTE(review): original line breaks were lost;
# code kept byte-identical below.
import os import Cookie import browsercookie import re import urllib2 import requests,cookielib import json import pickle from zhihu import Question from zhihu import User user_url = "http://www.zhihu.com/people/wu.chen" user = User(user_url) answers = user.get_answers() for answer in answers: answer.to_txt() answer.to_md() # #url = "https://www.zhihu.com/question/24269892" #question = Question(url) #answers = question.get_all_answers() #for answer in answers: # answer.to_txt() # answer.to_md() def save_obj(obj, name ):
# Persist one user's name and answers into zhihu.db (SQLite) via SQLAlchemy:
# create the schema, insert a Dbuser row, then add a Dbanswer per answer.
# NOTE(review): this chunk is TRUNCATED — the final Dbanswer(...) call ends
# mid-argument-list — so the code is kept byte-identical rather than guessed.
from sqlalchemy.orm import sessionmaker if __name__ == "__main__": userid = "wonderful-vczh" # create db engine engine = create_engine("sqlite:///zhihu.db", echo=False) dbmodel.Base.metadata.create_all(engine) # create a session Session = sessionmaker(bind=engine) session = Session() url_base = "http://www.zhihu.com/people/" url = url_base + userid zhihu_user = User(url) username = zhihu_user.get_user_id().decode("utf8") #print username db_user = dbmodel.Dbuser(id=userid, name=username) # add user session.add(db_user) session.commit() # add answers for i, answer in enumerate(zhihu_user.get_answers()): print i session.add( dbmodel.Dbanswer(id=answer.answer_url, upvote=answer.get_upvote(),
# Fragment: the first statements belong to a "top 5 works" tally whose def
# header is outside this chunk (tallies follower.get_work() into allWork,
# drops 'unknown', returns the 5 most common). Then getTop5Relations() ranks
# followers by get_vote_thank_relation(), and the __main__ block plots the
# top-5 work distribution of one user's followers. Note the local `str=`
# shadows the builtin — left as-is since the fragment head is not visible.
# NOTE(review): original line breaks were lost; code kept byte-identical.
str=follower.get_work() print(str) if allWork.has_key(str): allWork[str]+=1 else: allWork[str]=1 print json.dumps(allWork, encoding="UTF-8", ensure_ascii=False) #to delete bias if 'unknown' in allWork: del allWork['unknown'] top5Cities = dict(sorted(allWork.iteritems(), key=operator.itemgetter(1), reverse=True)[:5]) return top5Cities def getTop5Relations(followers): allFollowers={} for follower in followers: allFollowers[follower.get_user_id()]=follower.get_vote_thank_relation() print(allFollowers) superFriends = dict(sorted(allFollowers.iteritems(), key=operator.itemgetter(1), reverse=True)[:5]) return superFriends if __name__ == '__main__': #an example to get your friends' city location user_url = "https://www.zhihu.com/people/BravoMaooo" user = User(user_url) followers = user.get_followers() dics=getTop5Works(followers) print(dics) v.plotPie4Top5(dics)
# Worker: iterate the followees of the user at `url` and upsert each into the
# Users mongo-engine collection (matched by data_id); the DB lookup is retried
# inside a while loop, sleeping 300s on failure. Py2 `except Exception, e`
# syntax. NOTE(review): original line breaks were lost and the try/while/break
# nesting cannot be reconstructed with certainty, so code is kept
# byte-identical below.
def grab(url, threadID): logging.info(url) user_grab = User(url) followees = user_grab.get_followees() for i, user_grab in enumerate(followees): user = Users() flag = True while True: try: if Users.objects(data_id = user_grab.get_data_id()).count(): user = Users.objects(data_id = user_grab.get_data_id()).first() break except Exception, e: flag = False logging.error("========error1") logging.error(e) time.sleep(300) break try: user.user_id = user_grab.get_user_id() user.data_id = user_grab.get_data_id() user.followees_num = user_grab.get_followees_num() user.followers_num = user_grab.get_followers_num() user.asks_num = user_grab.get_asks_num() user.answers_num = user_grab.get_answers_num() user.collections_num = user_grab.get_collections_num() user.agree_num = user_grab.get_agree_num() user.thanks_num = user_grab.get_thanks_num() user.url = user_grab.get_user_url() user.modify_time = datetime.utcnow() except Exception, e: logging.error("========error2") logging.error(e) logging.debug(user_grab.get_user_url())
# Variant of grab() above that walks FOLLOWERS instead of followees and
# shields only the ask/answer/collection counters with try/except; same
# mongo-engine upsert keyed on data_id, same 300s retry sleep on lookup
# failure. NOTE(review): original line breaks were lost and the nesting is
# ambiguous, so code is kept byte-identical below.
def grab(url, threadID): print url user_grab = User(url) followers = user_grab.get_followers() for i, user_grab in enumerate(followers): user = Users() while True: try: if Users.objects(data_id = user_grab.get_data_id()).count(): user = Users.objects(data_id = user_grab.get_data_id()).first() break except Exception, e: logging.error("========error1") logging.error(e) time.sleep(300) user.user_id = user_grab.get_user_id() user.data_id = user_grab.get_data_id() user.followees_num = user_grab.get_followees_num() user.followers_num = user_grab.get_followers_num() try: user.asks_num = user_grab.get_asks_num() user.answers_num = user_grab.get_answers_num() user.collections_num = user_grab.get_collections_num() except Exception, e: logging.error("========error2") logging.error(e) logging.debug(user_grab.get_user_url())
# Fragment: starts mid-scope with straight-line User getters that turn out to
# be the tail of a triple-quoted (disabled) block; the live code seeds a deque
# from start_url, records each user's display id in `recorder` keyed by the
# url suffix, and begins a BFS over followees — the loop body is cut off at
# the end of this chunk. NOTE(review): original line breaks were lost; code
# kept byte-identical below.
followees_num =user.get_followees_num() asks_num = user.get_asks_num() answers_num = user.get_answers_num() collections_num = user.get_collections_num() agree_num = user.get_agree_num() thanks_num = user.get_thanks_num() followees = user.get_followees() followers = user.get_followers() asks = user.get_asks() answers = user.get_answers() collections = user.get_collections() """ recorder=dict(); userque=deque(); for suser in start_url: tuser=User(suser); userque.append( tuser ); recorder[suser[28:]]=tuser.get_user_id(); total=len(userque); num=total; flag=False; #DirectedGraph="{" while num>=1: num-=1; user=userque.popleft() print user.get_user_id() followees_num =user.get_followees_num() followees = user.get_followees() for i in range(1,followees_num+1): cuser=followees.next() # when total<=MAXTOT,then cuser will be in userque.
# insert_data(): after a random pause, tally each of a user's topics via a
# locked increment (ml.lock/unlock). The __main__ block repeats the tally
# inline into topics_map for one hard-coded user and is CUT OFF mid
# `for i in followees:` loop, so everything is kept byte-identical.
# NOTE(review): original line breaks were lost.
def insert_data(user): time.sleep(random.random() * 10) topics = user.get_topics() for t in topics: ml.lock(increment, t) ml.unlock() print t if __name__ == '__main__': #main() #user_url = 'https://www.zhihu.com/people/excited-vczh' user_url = 'https://www.zhihu.com/people/li-tao-40-73' user = User(user_url, u'李涛') #user = User(user_url, u'vczh') #print user.get_topics_num() #for i in user.get_topics(): # print i.encode('utf-8') #''' followees = user.get_followees() count = 0 topics = user.get_topics() for t in topics: if t not in topics_map: topics_map[t] = 1 else: topics_map[t] += 1 print t trs = [] for i in followees:
# BFS avatar crawler: starting from a seed profile, walk followees that have
# at least `least_follower` (1000) followers, download each followee's avatar
# into a per-gender directory (pic_female/pic_male/pic_emale; gender 3 falls
# back to get_user_gender()), and enqueue their urls. Bare excepts swallow
# download ("IO error") and page errors by design of the original.
# NOTE(review): original line breaks were lost (one statement is split across
# these two source lines); code kept byte-identical below.
def main(): initial_user_url = "http://www.zhihu.com/people/BigMing" url_queue=Queue.Queue() url_queue.put(initial_user_url) save_pic_dir0=sys.path[0]+"/pic_female/" save_pic_dir1=sys.path[0]+"/pic_male/" save_pic_dir2=sys.path[0]+"/pic_emale/" saved_count_female=0 saved_count_male=0 saved_count_emale=0 visited_url_count=0 tried_url_count=0 IO_error_count=0 limit_count=1000000000 count=0 flag=True least_follower=1000 while(flag): if url_queue.qsize()>0: current_url=url_queue.get() user = User(current_url) try: print current_url, print " queue_size: ", print url_queue.qsize(), print " Saved_size: ", print saved_count_male+saved_count_female followees = user.get_followees_with_condition(least_follower) for followee in followees: tried_url_count+=1 print "tried_url_count: " + str(tried_url_count) visited_url_count+=1 print "visited_url_count: " + str(visited_url_count) url_queue.put(followee.user_url) try: req = urllib2.Request(followee.user_pic_url) res = urllib2.urlopen(followee.user_pic_url,timeout=10) pic = res.read() pextention = os.path.splitext(followee.user_pic_url) if platform.system() == 'Windows': pname = followee.user_id.decode('utf-8','ignore').encode('gbk','ignore') else: pname=followee.user_id followee_count=followee.user_followers_num if followee.user_gender==0: p_full_path=save_pic_dir0+str(saved_count_female+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_female+=1 if followee.user_gender==1 : p_full_path=save_pic_dir1+str(saved_count_male+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_male+=1 if followee.user_gender==2 : p_full_path=save_pic_dir2+str(saved_count_emale+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_emale+=1 if followee.user_gender==3 : if followee.get_user_gender()==0: p_full_path=save_pic_dir0+str(saved_count_female+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_female+=1 if followee.get_user_gender()==1: 
p_full_path=save_pic_dir1+str(saved_count_male+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_male+=1 if followee.get_user_gender()==2: p_full_path=save_pic_dir2+str(saved_count_emale+1)+"_"+pname+"_"+str(followee_count)+pextention[1] saved_count_emale+=1 p = open(p_full_path, "wb"); p.write(pic) p.close() count+=1 print "female: "+str(saved_count_female)+" "+"male: "+str(saved_count_male)+" "+"emale: "+str(saved_count_emale) if count>limit_count: flag=False break except: IO_error_count+=1; print "IO error" print " " except: print "why????????????????????" else: break
# Fragment: the leading statements are the tail of a collection test whose def
# header is outside this chunk (dump the top-10 answers of each of a user's
# collections to txt and markdown). Then main() chains the question / answer /
# user / collection tests, and the __main__ block prints the user id of every
# follower of one profile. NOTE(review): original line breaks were lost;
# code kept byte-identical below.
user_collections = user.get_collections() for collection in user_collections: # 输出每一个收藏夹的名字 print collection.get_name() # 得到该收藏夹下的前十个回答 top_answers = collection.get_top_i_answers(10) # 把答案内容转成txt,markdown for answer in top_answers: answer.to_txt() answer.to_md() def main(): url = "http://www.zhihu.com/question/24269892" question_test(url) answer_url = "http://www.zhihu.com/question/24269892/answer/29960616" answer_test(answer_url) user_url = "http://www.zhihu.com/people/jixin" user_test(user_url) collection_url = "http://www.zhihu.com/collection/36750683" collection_test(collection_url) test() if __name__ == '__main__': user_url = "https://www.zhihu.com/people/BravoMaooo" user = User(user_url) followers = user.get_followers() for follower in followers: print(follower.get_user_id())
# Straight-line demo of the User getters for one hard-coded profile (counts,
# followee/follower/ask/answer generators); the Chinese comments name each
# call. NOTE(review): this chunk is TRUNCATED — it ends at a dangling comment
# ("get the user's collections") with no statement after it — and the original
# line breaks were lost, so code is kept byte-identical below.
# -*- coding: utf-8 -*- from zhihu import User user_url = "http://www.zhihu.com/people/jixin" user = User(user_url) # 获取用户ID user_id = user.get_user_id() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 followees_num =user.get_followees_num() # 获取该用户提问的个数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 answers = user.get_answers() # 获取该用户的收藏夹
# API walkthrough (variant without gender / head-image of the earlier ones):
# fetch one user's counts and the followee/follower/ask/answer/collection
# generators, print each, consuming only the first 41 followees/followers.
# Chinese inline comments name each getter; trailing comments are example
# values. NOTE(review): original line breaks were lost; code kept
# byte-identical below.
def user_test(user_url): user = User(user_url) # 获取用户ID user_id = user.get_user_id() # 获取该用户的关注者人数 followers_num = user.get_followers_num() # 获取该用户关注的人数 followees_num =user.get_followees_num() # 获取该用户提问的个数 asks_num = user.get_asks_num() # 获取该用户回答的个数 answers_num = user.get_answers_num() # 获取该用户收藏夹个数 collections_num = user.get_collections_num() # 获取该用户获得的赞同数 agree_num = user.get_agree_num() # 获取该用户获得的感谢数 thanks_num = user.get_thanks_num() # 获取该用户关注的人 followees = user.get_followees() # 获取关注该用户的人 followers = user.get_followers() # 获取该用户提的问题 asks = user.get_asks() # 获取该用户回答的问题的答案 answers = user.get_answers() # 获取该用户的收藏夹 collections = user.get_collections() print user_id # 黄继新 print followers_num # 614840 print followees_num # 8408 print asks_num # 1323 print answers_num # 786 print collections_num # 44 print agree_num # 46387 print thanks_num # 11477 print followees # <generator object get_followee at 0x7ffcac3af050> # 代表所有该用户关注的人的生成器对象 i = 0 for followee in followees: print followee.get_user_id() i = i + 1 if i == 41: break print followers # <generator object get_follower at 0x7ffcac3af0f0> # 代表所有关注该用户的人的生成器对象 i = 0 for follower in followers: print follower.get_user_id() i = i + 1 if i == 41: break print asks # <generator object get_ask at 0x7ffcab9db780> # 代表该用户提的所有问题的生成器对象 print answers # <generator object get_answer at 0x7ffcab9db7d0> # 代表该用户回答的所有问题的答案的生成器对象 print collections
# Reformatted duplicate of the BFS avatar crawler above: walk followees with
# >= 1000 followers from a seed profile, saving each avatar into a per-gender
# directory (gender 3 falls back to get_user_gender()), enqueueing followee
# urls, with bare excepts swallowing download and page errors.
# NOTE(review): original line breaks were lost (one expression is split across
# these two source lines); code kept byte-identical below.
def main(): initial_user_url = "http://www.zhihu.com/people/BigMing" url_queue = Queue.Queue() url_queue.put(initial_user_url) save_pic_dir0 = sys.path[0] + "/pic_female/" save_pic_dir1 = sys.path[0] + "/pic_male/" save_pic_dir2 = sys.path[0] + "/pic_emale/" saved_count_female = 0 saved_count_male = 0 saved_count_emale = 0 visited_url_count = 0 tried_url_count = 0 IO_error_count = 0 limit_count = 1000000000 count = 0 flag = True least_follower = 1000 while (flag): if url_queue.qsize() > 0: current_url = url_queue.get() user = User(current_url) try: print current_url, print " queue_size: ", print url_queue.qsize(), print " Saved_size: ", print saved_count_male + saved_count_female followees = user.get_followees_with_condition(least_follower) for followee in followees: tried_url_count += 1 print "tried_url_count: " + str(tried_url_count) visited_url_count += 1 print "visited_url_count: " + str(visited_url_count) url_queue.put(followee.user_url) try: req = urllib2.Request(followee.user_pic_url) res = urllib2.urlopen(followee.user_pic_url, timeout=10) pic = res.read() pextention = os.path.splitext(followee.user_pic_url) if platform.system() == 'Windows': pname = followee.user_id.decode( 'utf-8', 'ignore').encode('gbk', 'ignore') else: pname = followee.user_id followee_count = followee.user_followers_num if followee.user_gender == 0: p_full_path = save_pic_dir0 + str( saved_count_female + 1) + "_" + pname + "_" + str( followee_count) + pextention[1] saved_count_female += 1 if followee.user_gender == 1: p_full_path = save_pic_dir1 + str( saved_count_male + 1) + "_" + pname + "_" + str( followee_count) + pextention[1] saved_count_male += 1 if followee.user_gender == 2: p_full_path = save_pic_dir2 + str( saved_count_emale + 1) + "_" + pname + "_" + str( followee_count) + pextention[1] saved_count_emale += 1 if followee.user_gender == 3: if followee.get_user_gender() == 0: p_full_path = save_pic_dir0 + str( saved_count_female + 1) + "_" + pname + "_" + str( followee_count) 
+ pextention[1] saved_count_female += 1 if followee.get_user_gender() == 1: p_full_path = save_pic_dir1 + str( saved_count_male + 1) + "_" + pname + "_" + str( followee_count) + pextention[1] saved_count_male += 1 if followee.get_user_gender() == 2: p_full_path = save_pic_dir2 + str( saved_count_emale + 1) + "_" + pname + "_" + str( followee_count) + pextention[1] saved_count_emale += 1 p = open(p_full_path, "wb") p.write(pic) p.close() count += 1 print "female: " + str( saved_count_female) + " " + "male: " + str( saved_count_male) + " " + "emale: " + str( saved_count_emale) if count > limit_count: flag = False break except: IO_error_count += 1 print "IO error" print " " except: print "why????????????????????" else: break