Beispiel #1
0
def grab(url, threadID):
    """Fill ``Users`` records with the profile counters of every follower of ``url``.

    Each follower yielded by ``User(url).get_followers()`` is matched against
    an existing ``Users`` record by ``data_id``; when none exists a new
    ``Users()`` is created.  The id and counter attributes are then copied
    from the follower onto that record.

    NOTE(review): no visible line persists ``user`` after it is populated --
    this excerpt may stop before a save step; confirm against the full file.

    :param url: profile URL whose followers are walked.
    :param threadID: not used by the visible code.
    """
    print(url)
    followers = User(url).get_followers()

    for follower in followers:
        # Keep retrying the lookup until it succeeds, pausing five minutes
        # after every failure.
        while True:
            try:
                data_id = follower.get_data_id()
                # first() gives None when nothing matches, so a single query
                # replaces the former count() + first() pair.
                user = Users.objects(data_id=data_id).first()
                break
            except Exception as e:
                logging.error("========error1")
                logging.error(e)
                time.sleep(300)
        if user is None:
            user = Users()

        user.user_id = follower.get_user_id()
        user.data_id = data_id
        user.followees_num = follower.get_followees_num()
        user.followers_num = follower.get_followers_num()
        try:
            user.asks_num = follower.get_asks_num()
            user.answers_num = follower.get_answers_num()
            user.collections_num = follower.get_collections_num()
        except Exception as e:
            # Best effort: log the failure and the profile it happened on,
            # then move on to the next follower.
            logging.error("========error2")
            logging.error(e)
            logging.debug(follower.get_user_url())
Beispiel #2
0
def user_test(user_url):
    """Demonstrate the ``User`` accessors by printing everything they return.

    All scalar getters and then all collection getters are called first, in
    a fixed order; only afterwards are the results printed, in that same
    order.

    :param user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Tuples are evaluated left to right, so the getters run in listed order.
    scalar_fields = (
        profile.get_user_id(),          # user ID, e.g. Huang Jixin
        profile.get_followers_num(),    # number of followers, e.g. 614840
        profile.get_followees_num(),    # number of people followed, e.g. 8408
        profile.get_asks_num(),         # number of questions asked, e.g. 1323
        profile.get_answers_num(),      # number of answers written, e.g. 786
        profile.get_collections_num(),  # number of collections, e.g. 44
        profile.get_agree_num(),        # upvotes received, e.g. 46387
        profile.get_thanks_num(),       # thanks received, e.g. 11477
    )
    collection_fields = (
        profile.get_followees(),    # people this user follows
        profile.get_followers(),    # people following this user
        profile.get_asks(),         # questions this user asked
        profile.get_answers(),      # answers this user wrote
        profile.get_collections(),  # this user's collections
    )

    for value in scalar_fields:
        print(value)

    # Per the original sample output these print as generator objects,
    # e.g. "<generator object get_followee at 0x7ffcac3af050>".
    for value in collection_fields:
        print(value)
Beispiel #3
0
def user_test(user_url):
    """Fetch every profile field of one user and print each value on its own line.

    The getters are all invoked before anything is printed; the print order
    matches the fetch order.

    :param user_url: profile URL handed to ``User``.
    """
    account = User(user_url)

    user_id = account.get_user_id()                  # user ID
    followers_num = account.get_followers_num()      # number of followers
    followees_num = account.get_followees_num()      # number of people followed
    asks_num = account.get_asks_num()                # number of questions asked
    answers_num = account.get_answers_num()          # number of answers written
    collections_num = account.get_collections_num()  # number of collections
    agree_num = account.get_agree_num()              # upvotes received
    thanks_num = account.get_thanks_num()            # thanks received

    followees = account.get_followees()      # people this user follows
    followers = account.get_followers()      # people following this user
    asks = account.get_asks()                # questions this user asked
    answers = account.get_answers()          # answers this user wrote
    collections = account.get_collections()  # this user's collections

    # Sample values from the original run: Huang Jixin, 614840, 8408, 1323,
    # 786, 44, 46387, 11477.
    for number in (user_id, followers_num, followees_num, asks_num,
                   answers_num, collections_num, agree_num, thanks_num):
        print(number)

    # Per the original sample output each of these prints as
    # "<generator object ... at 0x...>".
    for lazy in (followees, followers, asks, answers, collections):
        print(lazy)
Beispiel #4
0
    user_collections = user.get_collections()
    for collection in user_collections:
        # 输出每一个收藏夹的名字
        print collection.get_name()
        # 得到该收藏夹下的前十个回答
        top_answers = collection.get_top_i_answers(10)
        # 把答案内容转成txt,markdown
        for answer in top_answers:
            answer.to_txt()
            answer.to_md()


def main():
    """Run each demo routine against a fixed sample Zhihu URL, then ``test()``."""
    question_test("http://www.zhihu.com/question/24269892")
    answer_test("http://www.zhihu.com/question/24269892/answer/29960616")
    user_test("http://www.zhihu.com/people/jixin")
    collection_test("http://www.zhihu.com/collection/36750683")
    test()


if __name__ == '__main__':
    # Script entry point: build a User for one fixed profile URL and print
    # the user id of every follower the follower iterator yields.
    user_url = "https://www.zhihu.com/people/BravoMaooo"
    user = User(user_url)
    followers = user.get_followers()
    for follower in followers:
        print(follower.get_user_id())
Beispiel #5
0
def user_test(user_url):
    """Print one user's profile fields plus the first 41 followees and followers.

    All getters are called up front; printing happens afterwards in the same
    order.  Each of the followee and follower iterables is consumed for at
    most 41 entries.

    :param user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Tuples are evaluated left to right, so the getters run in listed order.
    scalar_fields = (
        profile.get_user_id(),          # user ID, e.g. Huang Jixin
        profile.get_gender(),           # gender, e.g. male
        profile.get_followers_num(),    # number of followers, e.g. 614840
        profile.get_followees_num(),    # number of people followed, e.g. 8408
        profile.get_asks_num(),         # number of questions asked, e.g. 1323
        profile.get_answers_num(),      # number of answers written, e.g. 786
        profile.get_collections_num(),  # number of collections, e.g. 44
        profile.get_agree_num(),        # upvotes received, e.g. 46387
        profile.get_thanks_num(),       # thanks received, e.g. 11477
        profile.get_head_img_url(),     # avatar URL
    )

    followees = profile.get_followees()      # people this user follows
    followers = profile.get_followers()      # people following this user
    asks = profile.get_asks()                # questions this user asked
    answers = profile.get_answers()          # answers this user wrote
    collections = profile.get_collections()  # this user's collections

    for value in scalar_fields:
        print(value)

    # Per the original sample output this prints as a generator object.
    print(followees)
    for seen, followee in enumerate(followees, 1):
        print(followee.get_user_id())
        if seen == 41:
            break

    print(followers)
    for seen, follower in enumerate(followers, 1):
        print(follower.get_user_id())
        if seen == 41:
            break

    # The remaining collections are printed as objects, not iterated.
    print(asks)
    print(answers)
    print(collections)
Beispiel #6
0
def user_test(user_url):
    """Print one user's profile fields, first 41 followees/followers, and all topics.

    All getters are called up front; printing happens afterwards.  The
    followee and follower iterables are each consumed for at most 41
    entries, while the topics iterable is consumed completely.

    :param user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Tuples are evaluated left to right, so the getters run in listed order.
    scalar_fields = (
        profile.get_user_id(),          # user ID, e.g. Huang Jixin
        profile.get_gender(),           # gender, e.g. male
        profile.get_followers_num(),    # number of followers, e.g. 614840
        profile.get_followees_num(),    # number of people followed, e.g. 8408
        profile.get_asks_num(),         # number of questions asked, e.g. 1323
        profile.get_answers_num(),      # number of answers written, e.g. 786
        profile.get_collections_num(),  # number of collections, e.g. 44
        profile.get_agree_num(),        # upvotes received, e.g. 46387
        profile.get_thanks_num(),       # thanks received, e.g. 11477
        profile.get_head_img_url(),     # avatar URL
    )

    followees = profile.get_followees()      # people this user follows
    followers = profile.get_followers()      # people following this user
    topics = profile.get_topics()
    asks = profile.get_asks()                # questions this user asked
    answers = profile.get_answers()          # answers this user wrote
    collections = profile.get_collections()  # this user's collections

    for value in scalar_fields:
        print(value)

    # Per the original sample output this prints as a generator object.
    print(followees)
    for position, followee in enumerate(followees, 1):
        print(followee.get_user_id())
        if position == 41:
            break

    print(followers)
    for position, follower in enumerate(followers, 1):
        print(follower.get_user_id())
        if position == 41:
            break

    for topic in topics:
        print(topic)

    # The remaining collections are printed as objects, not iterated.
    print(asks)
    print(answers)
    print(collections)
Beispiel #7
0
def _store_user_info(conn, cur, redis_conn, table_name, user_unique, user_info):
    """Insert one user's info row, commit it, and mark the user as crawled in Redis."""
    insert_sql = prepare_insert_sql(table_name, user_info)
    res = cur.execute(insert_sql)
    conn.commit()  # the row only reaches the database once committed
    redis_conn.set(get_user_redis_key(user_unique), 1)  # marker that prevents re-crawling
    print(user_info['user_unique'] + '  ------  ' + str(res))


def user_spider(user_url):
    """Store the info of one user and of each of their followers in MySQL.

    A Redis key per user (built by ``get_user_redis_key``) records who has
    already been stored; users whose key exists are skipped.  Insert
    failures are printed and do not stop the crawl.  The MySQL connection is
    closed when the function exits, whether normally or through an
    exception.

    :param user_url: profile URL of the user to start from.
    """
    database_name = 'wjw_zhihu'
    table_name = 'user_info'

    conn = pymysql.connect(host='localhost', user='******', passwd='root', port=3306)
    try:
        cur = conn.cursor()
        conn.select_db(database_name)
        # Set the connection encoding; otherwise inserted text is garbled.
        cur.execute('set names utf8')

        # Redis keeps track of the user_unique values already crawled.
        redis_conn = redis.Redis(host='127.0.0.1', port=6379, db=0)

        user = User(user_url)
        user_unique = user.get_user_unique()
        if redis_conn.get(get_user_redis_key(user_unique)) is None:
            # Fetched outside the try on purpose: a failure to read the
            # starting user aborts the run, as before.
            user_info = user.get_user_info()
            try:
                _store_user_info(conn, cur, redis_conn, table_name,
                                 user_unique, user_info)
            except Exception as e:
                print(str(e))

        followers = user.get_followers()
        for i, follower in enumerate(followers, 1):
            if i % 10 == 0:
                redis_conn.save()   # flush Redis to disk; blocks while saving
                time.sleep(0.3)

            follower_user_unique = follower.get_user_unique()
            if redis_conn.get(get_user_redis_key(follower_user_unique)) is None:
                try:
                    follower_info = follower.get_user_info()
                    _store_user_info(conn, cur, redis_conn, table_name,
                                     follower_user_unique, follower_info)
                except Exception as e:
                    # Best effort: report the failure and continue with the
                    # next follower.
                    print(str(e))
    finally:
        conn.close()
Beispiel #8
0
        str=follower.get_work()
        print(str)
        if allWork.has_key(str):
            allWork[str]+=1
        else:
            allWork[str]=1
        print json.dumps(allWork, encoding="UTF-8", ensure_ascii=False)
    #to delete bias
    if 'unknown' in allWork:
        del allWork['unknown']

    top5Cities = dict(sorted(allWork.iteritems(), key=operator.itemgetter(1), reverse=True)[:5])
    return top5Cities

def getTop5Relations(followers):
    """Return the five followers with the highest vote/thank relation value.

    The growing mapping is printed after every follower, as before.

    :param followers: iterable of objects exposing ``get_user_id()`` and
        ``get_vote_thank_relation()``.
    :returns: dict mapping user id to relation value for at most five
        followers, selected by descending relation value.
    """
    allFollowers = {}
    for follower in followers:
        allFollowers[follower.get_user_id()] = follower.get_vote_thank_relation()
        print(allFollowers)

    # items() exists on both Python 2 and Python 3 dicts, whereas
    # iteritems() is Python 2 only.
    ranked = sorted(allFollowers.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ranked[:5])

if __name__ == '__main__':
    # Example run: take the followers of one fixed profile, pass them to
    # getTop5Works, print the resulting dict and hand it to v.plotPie4Top5.
    user_url = "https://www.zhihu.com/people/BravoMaooo"
    user = User(user_url)
    followers = user.get_followers()
    dics=getTop5Works(followers)
    print(dics)
    v.plotPie4Top5(dics)
Beispiel #9
0
def user_test(user_url):
    """Print one user's profile fields, first 41 followees/followers, and answer sets.

    All getters are called up front, including a second ``get_answers`` call
    restricted to 2014-01-01 through 2014-12-31; printing happens
    afterwards.  The followee and follower iterables are each consumed for
    at most 41 entries.

    :param user_url: profile URL handed to ``User``.
    """
    profile = User(user_url)

    # Tuples are evaluated left to right, so the getters run in listed order.
    scalar_fields = (
        profile.get_user_id(),          # user ID, e.g. Huang Jixin
        profile.get_followers_num(),    # number of followers, e.g. 614840
        profile.get_followees_num(),    # number of people followed, e.g. 8408
        profile.get_asks_num(),         # number of questions asked, e.g. 1323
        profile.get_answers_num(),      # number of answers written, e.g. 786
        profile.get_collections_num(),  # number of collections, e.g. 44
        profile.get_agree_num(),        # upvotes received, e.g. 46387
        profile.get_thanks_num(),       # thanks received, e.g. 11477
    )

    followees = profile.get_followees()      # people this user follows
    followers = profile.get_followers()      # people following this user
    asks = profile.get_asks()                # questions this user asked
    answers = profile.get_answers()          # answers this user wrote
    # Answers this user wrote during 2014.
    answers_in_2014 = profile.get_answers(begin_date='2014-01-01', end_date='2014-12-31')
    collections = profile.get_collections()  # this user's collections

    for value in scalar_fields:
        print(value)

    # Per the original sample output this prints as a generator object.
    print(followees)
    for shown, followee in enumerate(followees, 1):
        print(followee.get_user_id())
        if shown == 41:
            break

    print(followers)
    for shown, follower in enumerate(followers, 1):
        print(follower.get_user_id())
        if shown == 41:
            break

    # The remaining collections are printed as objects, not iterated.
    print(asks)
    print(answers)
    print(answers_in_2014)
    print(collections)
Beispiel #10
0
def main():
    """Crawl Zhihu profiles through their followers and store them in MongoDB.

    Two collections of the ``zhihu_user`` database are used:

    * ``urllist`` is the work queue.  Each entry has a ``user_url`` and a
      ``jlzt`` status string: ``"1"`` waiting, ``"3"`` being processed,
      ``"2"`` done.
    * ``userlist`` receives one document per processed profile.

    The loop repeatedly takes a waiting URL, collects the profile data,
    queues every follower URL not yet present in ``urllist``, stores the
    profile and marks the URL done.  If processing raises, the traceback is
    printed, the URL is put back to waiting after a 10 second pause and the
    loop goes on.  The function returns once no waiting URL is left.

    NOTE(review): a URL that fails every time is retried forever because it
    is always reset to waiting.
    """
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu_user
    urllist = db.urllist
    userlist = db.userlist

    origin_users = ["https://www.zhihu.com/people/jixin",
                    "https://www.zhihu.com/people/zhang-jia-wei",
                    "https://www.zhihu.com/people/zhu-xuan-86",
                    "https://www.zhihu.com/people/kaifulee",
                    "https://www.zhihu.com/people/e-miao-de-nai-ba"]

    # Seed the queue with every origin profile that is not in it yet.
    known_urls = set(urllist.distinct("user_url"))
    for u in origin_users:
        if u not in known_urls:
            urllist.insert({"user_url": u, "jlzt": "1"})

    while True:
        item = urllist.find_one({'jlzt': '1'})
        if item is None:
            print(u'已全部处理完成')
            break
        user_url = item["user_url"]

        starttime = datetime.datetime.now()
        urllist.update({"user_url": user_url}, {"$set": {"jlzt": "3"}})

        try:
            user = User(user_url)

            zhihu_id = user.get_data_id()              # unique user id
            zhihu_name = user.get_user_id()            # user name
            followees_num = user.get_followees_num()   # number of people followed
            followers_num = user.get_followers_num()   # number of followers
            gender = user.get_gender()

            asks_num = user.get_asks_num()                # questions asked
            answers_num = user.get_answers_num()          # answers written
            collections_num = user.get_collections_num()  # collections
            agree_num = user.get_agree_num()              # upvotes received
            thanks_num = user.get_thanks_num()            # thanks received
            head_img_url = user.get_head_img_url()        # avatar URL
            topics_num = user.get_topics_num()            # topics followed

            followees = user.get_followees()   # people this user follows
            followers = user.get_followers()   # people following this user
            questions = user.get_asks()        # questions asked
            answers = user.get_answers()       # answers written
            topics = user.get_topics()         # topics followed

            print("start process " + zhihu_name + ";\n")

            followee_l = []
            follower_l = []
            questions_l = []
            answers_l = []
            topics_l = []

            print(u'开始处理关注的人')
            for followee in followees:
                # Keep only the fifth "/"-separated part of the profile URL.
                followee_l.append(followee.user_url.split('/')[4])
                if len(followee_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(followee_l)) + " followee is being processed. please wait...")
                time.sleep(0.05)
            print(u'添加完成')

            print(u'开始添加关注者至处理队列')
            for follower in followers:
                follower_l.append(follower.user_url.split('/')[4])
                # Look up just this URL instead of re-reading every queued
                # URL for each follower.
                if urllist.find_one({"user_url": follower.user_url}) is None:
                    urllist.insert({"user_url": follower.user_url, "jlzt": "1"})
                if len(follower_l) % 100 == 0:
                    print(zhihu_name + "'s NO." + str(len(follower_l)) + " follower is being processed. please wait...")
                time.sleep(0.05)
            print(u'添加完成')

            for q in questions:
                questions_l.append("url=" + q.url + "|title=" + q.get_title())
                time.sleep(0.01)
            for a in answers:
                answers_l.append(a.answer_url)
                time.sleep(0.01)
            for t in topics:
                topics_l.append(t)
                time.sleep(0.01)

            # NOTE(review): the "ansers" key looks like a typo for "answers";
            # it is kept because already stored documents use it.
            user_data = {"zhihu_id": zhihu_id,
                         "zhihu_name": zhihu_name,
                         "followees_num": followees_num,
                         "followers_num": followers_num,
                         "followees": followee_l,
                         "followers": follower_l,
                         "questions": questions_l,
                         "gender": gender,
                         "asks_num": asks_num,
                         "answers_num": answers_num,
                         "ansers": answers_l,
                         "collections_num": collections_num,
                         "agree_num": agree_num,
                         "thanks_num": thanks_num,
                         "topics_num": topics_num,
                         "topics": topics_l,
                         "head_img_url": head_img_url,
                         }

            print("user_data prepared:")

            # NOTE(review): user_data carries no "user_url" field, so for
            # documents written here this lookup never matches and the
            # profile is always inserted -- confirm the intended dedup key.
            if userlist.find_one({"user_url": user_url}) is None:
                userlist.insert(user_data)
                print("user_data inserted: \n")

            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "2"}})
            endtime = datetime.datetime.now()
            # total_seconds() also counts whole days, unlike .seconds.
            interval = int((endtime - starttime).total_seconds())
            print(zhihu_name + "finnished. spent " + str(interval) + "seconds.")
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the crawler.
            traceback.print_exc()
            time.sleep(10)
            urllist.update({"user_url": user_url}, {"$set": {"jlzt": "1"}})

    print("处理完毕")