Ejemplo n.º 1
0
def _store_user(conn, cur, redis_conn, table_name, user_unique, user_info):
    """Insert one user's row into MySQL and mark the user as crawled in Redis.

    Args:
        conn: Open database connection; used to commit the insert.
        cur: Cursor obtained from ``conn``; used to execute the insert.
        redis_conn: Redis client holding the "already crawled" markers.
        table_name: Name of the table the row is inserted into.
        user_unique: Identifier used to build the user's Redis key.
        user_info: Mapping of user data; must contain ``'user_unique'``.

    Raises:
        Exception: Anything raised while building or executing the SQL,
            committing, or writing to Redis propagates; callers decide
            how to handle it.
    """
    # NOTE(review): prepare_insert_sql returns a finished SQL string; if it
    # interpolates scraped values directly, it is open to SQL injection --
    # confirm it escapes its input.
    insert_sql = prepare_insert_sql(table_name, user_info)
    res = cur.execute(insert_sql)
    conn.commit()  # the row is only persisted once the transaction commits
    # Set the marker only after a successful commit, so a failed insert is
    # retried the next time this user is encountered.
    redis_conn.set(get_user_redis_key(user_unique), 1)
    print(user_info['user_unique'] + '  ------  ' + str(res))


def user_spider(user_url):
    """Crawl one user and that user's followers, storing each in MySQL.

    The user at ``user_url`` is inserted into the ``user_info`` table of the
    ``wjw_zhihu`` database, followed by every follower yielded by
    ``User.get_followers()``. A Redis key per user records who has already
    been stored, so users seen before are skipped.

    Args:
        user_url: URL of the user profile to start from; passed to ``User``.

    Raises:
        Exception: Errors from connecting to MySQL, constructing ``User``,
            reading the root user's info, or querying Redis propagate.
            Errors while inserting a user (and, for followers, while reading
            their info) are printed and swallowed so the crawl continues.
    """
    # Imported locally so this function does not depend on a file-level import.
    from contextlib import closing

    database_name = 'wjw_zhihu'
    table_name = 'user_info'

    # closing() guarantees both the cursor and the connection are released
    # when the function exits, including when an exception propagates.
    with closing(pymysql.connect(host='localhost', user='******',
                                 passwd='root', port=3306)) as conn, \
            closing(conn.cursor()) as cur:
        conn.select_db(database_name)
        # Set the connection encoding; otherwise inserted text is garbled.
        cur.execute('set names utf8')

        # Redis records the user_unique values that were already crawled.
        redis_conn = redis.Redis(host='127.0.0.1', port=6379, db=0)

        # Store the root user unless it was crawled before.
        user = User(user_url)
        user_unique = user.get_user_unique()
        if redis_conn.get(get_user_redis_key(user_unique)) is None:
            user_info = user.get_user_info()
            try:
                _store_user(conn, cur, redis_conn, table_name,
                            user_unique, user_info)
            except Exception as e:
                # Best-effort: report the failure and keep crawling.
                print(str(e))

        # Users followed by this user. The result is not consumed below; the
        # call is kept in case it has side effects -- TODO confirm and remove.
        followees = user.get_followees()
        # Users following this user (per the original author's note, a
        # generator object).
        followers = user.get_followers()

        for count, follower in enumerate(followers, start=1):
            if count % 10 == 0:
                # Every 10 followers: flush Redis to disk (blocking SAVE)
                # and pause briefly before continuing.
                redis_conn.save()
                time.sleep(0.3)

            follower_user_unique = follower.get_user_unique()
            if redis_conn.get(get_user_redis_key(follower_user_unique)) is not None:
                continue  # already stored

            try:
                follower_info = follower.get_user_info()
                _store_user(conn, cur, redis_conn, table_name,
                            follower_user_unique, follower_info)
            except Exception as e:
                # Best-effort: report the failure and move to the next one.
                print(str(e))