Beispiel #1
0
def grab(url, threadID):
    """Walk the followers of the user at ``url`` and fill a ``Users`` record for each.

    For every follower, an existing ``Users`` record with the same ``data_id``
    is reused when one is found; otherwise a fresh ``Users()`` is created.
    The record's counters are then copied from the scraped follower object.

    Args:
        url: Profile URL handed to ``User``.
        threadID: Not used in the visible body.

    NOTE(review): nothing in the visible body saves ``user`` -- confirm whether
    a ``save()`` call is missing or lives elsewhere.
    """
    print(url)
    followers = User(url).get_followers()

    for follower in followers:
        # Look the record up with a single query.  The lookup is retried
        # forever, sleeping five minutes after each failure.
        # NOTE(review): assumes ``.first()`` returns None when nothing
        # matches (mongoengine QuerySet behaviour) -- confirm for ``Users``.
        while True:
            try:
                existing = Users.objects(data_id=follower.get_data_id()).first()
                break
            except Exception as e:
                logging.error("========error1")
                logging.error(e)
                time.sleep(300)
        # Explicit None check: do not rely on the truthiness of a document.
        user = existing if existing is not None else Users()

        # These four getters are outside any try block, so a failure here
        # propagates to the caller.
        user.user_id = follower.get_user_id()
        user.data_id = follower.get_data_id()
        user.followees_num = follower.get_followees_num()
        user.followers_num = follower.get_followers_num()
        try:
            user.asks_num = follower.get_asks_num()
            user.answers_num = follower.get_answers_num()
            user.collections_num = follower.get_collections_num()
        except Exception as e:
            # Best effort: log the failing profile and move on to the next one.
            logging.error("========error2")
            logging.error(e)
            logging.debug(follower.get_user_url())
Beispiel #2
0
def grab(url, threadID):
    """Walk the followees of the user at ``url`` and fill a ``Users`` record for each.

    For every followee, an existing ``Users`` record with the same ``data_id``
    is reused when one is found; otherwise (or when the lookup itself fails)
    a fresh ``Users()`` is populated from the scraped followee object.

    Args:
        url: Profile URL handed to ``User``; also written to the info log.
        threadID: Not used in the visible body.

    NOTE(review): nothing in the visible body saves ``user`` -- confirm whether
    a ``save()`` call is missing or lives elsewhere.
    """
    logging.info(url)
    followees = User(url).get_followees()

    for followee in followees:
        # Single lookup attempt.  On failure: log, back off for five minutes,
        # then carry on with a fresh record (there is no retry).
        # NOTE(review): assumes ``.first()`` returns None when nothing
        # matches (mongoengine QuerySet behaviour) -- confirm for ``Users``.
        try:
            existing = Users.objects(data_id=followee.get_data_id()).first()
        except Exception as e:
            existing = None
            logging.error("========error1")
            logging.error(e)
            time.sleep(300)
        # Explicit None check: do not rely on the truthiness of a document.
        user = existing if existing is not None else Users()

        try:
            user.user_id = followee.get_user_id()
            user.data_id = followee.get_data_id()
            user.followees_num = followee.get_followees_num()
            user.followers_num = followee.get_followers_num()
            user.asks_num = followee.get_asks_num()
            user.answers_num = followee.get_answers_num()
            user.collections_num = followee.get_collections_num()
            user.agree_num = followee.get_agree_num()
            user.thanks_num = followee.get_thanks_num()
            user.url = followee.get_user_url()
            user.modify_time = datetime.utcnow()
        except Exception as e:
            # Best effort: log the failing profile and move on to the next one.
            logging.error("========error2")
            logging.error(e)
            logging.debug(followee.get_user_url())
Beispiel #3
0
def _enqueue_if_new(urllist, user_url):
    """Insert ``user_url`` into the queue with state "1" unless it is already there.

    Uses a targeted ``find_one`` instead of loading every queued URL.

    Returns:
        True when a new queue entry was inserted, False when one already existed.
    """
    if urllist.find_one({"user_url": user_url}) is not None:
        return False
    urllist.insert({"user_url": user_url, "jlzt": "1"})
    return True


def _scrape_and_store(user_url, urllist, userlist):
    """Scrape one profile, enqueue its followers and store its document.

    Args:
        user_url: Profile URL handed to ``User``.
        urllist: Queue collection; each follower URL is enqueued here.
        userlist: Collection receiving the scraped user document.

    Returns:
        The value of ``user.get_user_id()``, used by the caller for progress output.
    """
    user = User(user_url)

    # Scalar profile attributes.
    zhihu_id = user.get_data_id()                  # unique user id
    zhihu_name = user.get_user_id()                # user name
    followees_num = user.get_followees_num()       # number of people the user follows
    followers_num = user.get_followers_num()       # number of followers
    gender = user.get_gender()
    asks_num = user.get_asks_num()                 # number of questions asked
    answers_num = user.get_answers_num()           # number of answers
    collections_num = user.get_collections_num()   # number of collections
    agree_num = user.get_agree_num()               # number of upvotes received
    thanks_num = user.get_thanks_num()             # number of thanks received
    head_img_url = user.get_head_img_url()         # avatar URL
    topics_num = user.get_topics_num()             # number of followed topics

    # Iterables consumed further down.
    followees = user.get_followees()
    followers = user.get_followers()
    questions = user.get_asks()
    answers = user.get_answers()
    topics = user.get_topics()

    print("start process " + zhihu_name + ";\n")

    followee_l = []
    follower_l = []
    questions_l = []
    answers_l = []
    topics_l = []

    print(u'开始处理关注的人')
    for followee in followees:
        # Keep the fifth '/'-separated piece of the profile URL; for URLs
        # shaped like the seed URLs in main() that is the profile slug.
        followee_l.append(followee.user_url.split('/')[4])
        if len(followee_l) % 100 == 0:
            print(zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait...")
        # Short pause per item -- presumably request throttling; confirm.
        time.sleep(0.05)
    print(u'添加完成')

    print(u'开始添加关注者至处理队列')
    for follower in followers:
        follower_l.append(follower.user_url.split('/')[4])
        _enqueue_if_new(urllist, follower.user_url)
        if len(follower_l) % 100 == 0:
            print(zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait...")
        time.sleep(0.05)
    print(u'添加完成')

    for q in questions:
        questions_l.append("url=" + q.url + "|title=" + q.get_title())
        time.sleep(0.01)
    for a in answers:
        answers_l.append(a.answer_url)
        time.sleep(0.01)
    for t in topics:
        topics_l.append(t)
        time.sleep(0.01)

    user_data = {
        "zhihu_id": zhihu_id,
        "zhihu_name": zhihu_name,
        "followees_num": followees_num,
        "followers_num": followers_num,
        "followees": followee_l,
        "followers": follower_l,
        "questions": questions_l,
        "gender": gender,
        "asks_num": asks_num,
        "answers_num": answers_num,
        # NOTE(review): "ansers" looks like a typo for "answers"; the key is
        # kept unchanged so already-stored documents and readers stay compatible.
        "ansers": answers_l,
        "collections_num": collections_num,
        "agree_num": agree_num,
        "thanks_num": thanks_num,
        "topics_num": topics_num,
        "topics": topics_l,
        "head_img_url": head_img_url,
    }

    print("user_data prepared:")

    # De-duplicate on "zhihu_id": stored documents carry no "user_url" field,
    # so a check against that field could never match.
    if userlist.find_one({"zhihu_id": zhihu_id}) is None:
        userlist.insert(user_data)
        print("user_data inserted: \n")

    return zhihu_name


def main():
    """Crawl profiles breadth-first using a MongoDB-backed URL queue.

    Queue entries live in ``zhihu_user.urllist`` on localhost:27017 and carry a
    ``jlzt`` state field: "1" entries are picked up for processing, "3" is set
    while an entry is being processed, and "2" is set once it has succeeded.
    A failed entry is put back to "1" after a ten-second pause.  The loop ends
    when no "1" entry is left.
    """
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu_user
    urllist = db.urllist
    userlist = db.userlist

    origin_users = [
        "https://www.zhihu.com/people/jixin",
        "https://www.zhihu.com/people/zhang-jia-wei",
        "https://www.zhihu.com/people/zhu-xuan-86",
        "https://www.zhihu.com/people/kaifulee",
        "https://www.zhihu.com/people/e-miao-de-nai-ba",
    ]

    # Seed the queue with the starting profiles.
    for u in origin_users:
        _enqueue_if_new(urllist, u)

    while True:
        item = urllist.find_one({'jlzt': '1'})
        if item is None:
            print(u'已全部处理完成')
            break
        user_url = item["user_url"]

        starttime = datetime.datetime.now()
        urllist.update({"user_url": user_url}, {"$set": {"jlzt": "3"}})

        try:
            zhihu_name = _scrape_and_store(user_url, urllist, userlist)
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "2"}})
            endtime = datetime.datetime.now()
            # total_seconds() does not wrap around for runs of a day or longer.
            interval = int((endtime - starttime).total_seconds())
            print(zhihu_name + "finnished. spent " + str(interval) + "seconds.")
        except Exception:
            # Report, back off, and put the entry back so it is retried.
            traceback.print_exc()
            time.sleep(10)
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})
        except BaseException:
            # Ctrl+C / interpreter exit: restore the entry so it is not left
            # in the in-progress state, then let the exception stop the program.
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})
            raise

    print("处理完毕")