Example No. 1
def search_all_user(keyword):
    cookie = Cookie_Process.read_cookie()  # Read the cookie stored on disk

    db = get_db()  # Read the DB config once instead of once per argument
    mhf = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
    sqlf = "select wb_userid from keyword_weibo where keyword = %s "
    all_id_temp = mhf.findAll(sqlf, keyword)  # 查询涉及到关键字的所有的用户ID
    all_id = []  # 对要爬取的用户id列表进行去重
    for i in all_id_temp:
        if i not in all_id:
            all_id.append(i)
    id_num = len(all_id)
    print('查询到涉及关键字为 {} 的微博用户总数量为 {}\n'.format(keyword, str(id_num)))

    mh = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
    sql = "insert into keyword_userinfo(user_id,user_name,user_sex,user_address,user_weizhi,user_renzheng,user_oneword,user_wbnum,user_follow,user_fan,user_url,keyword) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    userinfos = []
    for i in range(len(all_id)):
        temp_user_data = fetch_user_data(int(all_id[i][0]), keyword, cookie)

        if temp_user_data is not None:  # Only keep users whose data came back (fetch may return None)
            userinfos.extend(temp_user_data)

        if (i + 1) % 50 == 0:  # Flush the buffer to the database every 50 users
            mh.open()
            for info in userinfos:  # 'info' avoids shadowing the outer loop variable 'i'
                mh.cud(
                    sql,
                    (info['user_id'], info['user_name'],
                     info['user_sex'], info['user_address'],
                     info['user_weizhi'], info['user_renzheng'],
                     info['user_oneword'], info['user_wbnum'],
                     info['user_follow'], info['user_fan'],
                     info['user_url'], keyword))
            mh.tijiao()
            mh.close()
            userinfos = []  # Clear the buffer after committing

    if len(userinfos) > 0:  # Commit whatever is left in the buffer
        mh.open()
        for info in userinfos:
            mh.cud(sql,
                   (info['user_id'], info['user_name'],
                    info['user_sex'], info['user_address'],
                    info['user_weizhi'], info['user_renzheng'],
                    info['user_oneword'], info['user_wbnum'],
                    info['user_follow'], info['user_fan'],
                    info['user_url'], keyword))
        mh.tijiao()
        mh.close()
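
All of these examples lean on a mysqlHelper class that is not shown. Below is a minimal sketch of the interface they exercise (findAll, open, cud, tijiao, close), assuming a pymysql backend; the constructor argument order (host, user, password, database, charset, port) is inferred from how get_db() is unpacked above, not confirmed by the source.

import pymysql

class mysqlHelper:
    # Hypothetical reconstruction of the helper these examples assume
    def __init__(self, host, user, password, database, charset, port):
        self.conn_args = dict(host=host, user=user, password=password,
                              database=database, charset=charset, port=port)
        self.conn = None

    def open(self):
        self.conn = pymysql.connect(**self.conn_args)

    def findAll(self, sql, *params):
        # One-shot query: the examples call this without open()/close()
        self.open()
        with self.conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
        self.close()
        return rows

    def cud(self, sql, params):
        # Create/update/delete: execute without committing
        with self.conn.cursor() as cur:
            cur.execute(sql, params)

    def tijiao(self):
        # "tijiao" means "commit"
        self.conn.commit()

    def close(self):
        self.conn.close()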
Example No. 2
def search_all_comment(keyword):
    cookie = Cookie_Process.read_cookie()   # Read the cookie stored on disk

    db = get_db()  # Read the DB config once instead of once per argument
    mhf = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
    sqlf = "select wb_id from keyword_weibo where keyword = %s "
    all_wbid_temp = mhf.findAll(sqlf, keyword)   #查询涉及到关键字的所有的微博id
    all_wbid = []                   # 对要爬取的微博id列表进行去重
    for i in all_wbid_temp:
        wbid = i[0]
        # print(wbid)
        if wbid not in all_wbid:
            all_wbid.append(wbid)
    wb_num = len(all_wbid)
    print('查询到涉及关键字为 {} 的微博总数量为 {}\n'.format(keyword,str(wb_num)))

    for i in range(len(all_wbid)):
        print("============ Crawling comments of Weibo post no. {} ===========".format(i + 1))
        fetch_comment_data(str(all_wbid[i]), keyword, cookie)
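
The hand-rolled dedup loops in these examples are quadratic in the number of IDs. An equivalent order-preserving one-liner, using only the standard library, would be:

    # Order-preserving deduplication of the first column of the result rows
    all_wbid = list(dict.fromkeys(row[0] for row in all_wbid_temp))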
Example No. 3
def search_all_comment_id(userid):
    db = get_db()  # Read the DB config once instead of once per argument
    mhf = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
    sqlf = "select wb_id from user_weibo where wb_userid =%s"
    all_wbid_temp = mhf.findAll(sqlf, userid)  # Every Weibo ID posted by this user
    all_wbid = []  # Deduplicate the list of Weibo IDs to crawl
    for i in all_wbid_temp:
        wbid = i[0]
        # print(wbid)
        if wbid not in all_wbid:
            all_wbid.append(wbid)
    wb_num = len(all_wbid)
    print("Already crawled {} Weibo posts from this user\n".format(wb_num))
    for i in range(len(all_wbid)):
        print("============ Crawling comments of Weibo post no. {} ===========".format(i + 1))
        fetch_comment_data(str(all_wbid[i]), userid)
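
Examples 1 through 3 are per-keyword and per-user entry points over the same tables. A hypothetical driver chaining them might look like the following; the keyword and user ID are placeholders, not values from the source:

if __name__ == '__main__':
    keyword = 'some-keyword'           # placeholder keyword
    search_all_user(keyword)           # Example 1: profile every user who posted under the keyword
    search_all_comment(keyword)        # Example 2: crawl comments on the keyword's posts
    search_all_comment_id('1234567')   # Example 3: crawl comments on one user's posts (placeholder ID)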
Example No. 4
def fetch_comment_data(wbid, userid):
    cookie = Cookie_Process.read_cookie()  # Read the cookie stored on disk
    cookies = {"Cookie": cookie}
    proxies = get_random_ip(ip_list)
    r_comment = requests.get('https://weibo.cn/comment/{}'.format(wbid),
                             cookies=cookies,
                             proxies=proxies)
    soup_comment = BeautifulSoup(r_comment.text, 'lxml')
    flag = False
    try:
        # The last '.c' node starts with "还没有人针对" when the post has no comments yet
        flag = soup_comment.select('.c')[-1].text.startswith('还没有人针对')
    except Exception:
        pass  # Malformed page; fall through and try to read the page count anyway

    if flag:
        print("--------- No one has commented on this Weibo post! ---------\n")
        return
    else:
        try:
            # Total page count from the pager text, e.g. "... 1/25页"
            page_num = int(
                soup_comment.select_one(".pa").text.split()[-1].split("/")
                [-1].split("页")[0])
        except Exception:
            page_num = 1

    db = get_db()  # Read the DB config once instead of once per argument
    mh = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
    sql = "insert into comment(wb_id,comment_content,comment_userid,comment_username,comment_like,comment_createtime,userid) values(%s,%s,%s,%s,%s,%s,%s)"

    page_id = 1
    commentinfos = []
    print("--------- 此微博 {} 的评论页数共有 {} 页 ---------\n".format(wbid, page_num))
    while page_id <= page_num:

        time.sleep(random.uniform(4.5, 6.5))  # Randomized delay to avoid rate limiting

        print("++++++ Weibo post {}: crawling comments page {} ...... ++++++\n".format(
            wbid, page_id))
        # url_comment is a module-level URL template (not shown in this example)
        r_comment = requests.get(url_comment.format(wbid, page_id),
                                 cookies=cookies)
        soup_comment = BeautifulSoup(r_comment.text, 'lxml')
        comment_list = soup_comment.select(".c")

        for node in comment_list:
            if str(node.get("id")).startswith("C_"):  # Comment nodes have ids like "C_123456"
                comment_content = filter_emoji(node.select_one(".ctt").text)
                comment_userid = node.select_one("a").get("href")[3:]  # likely "/u/12345" -> "12345"
                comment_username = node.select_one("a").text
                comment_like = node.select_one(".cc").text.strip()[2:-1]  # likely "赞[12]" -> "12"
                comment_createtime = time_process(
                    node.select_one(".ct").text.strip()[:-5])
                print("Comment text:    " + comment_content)
                print("Commenter ID:    " + comment_userid)
                print("Commenter name:  " + comment_username)
                print("Comment likes:   " + comment_like)
                print("Comment time:    " + comment_createtime)
                print('----------------------------\n')
                commentinfo = {
                    'wb_id': wbid,  # Build one comment record
                    'comment_content': comment_content,
                    'comment_userid': comment_userid,
                    'comment_username': comment_username,
                    'comment_like': comment_like,
                    'comment_createtime': comment_createtime,
                    'userid': userid
                }
                commentinfos.append(commentinfo)

        page_id = page_id + 1

        if len(commentinfos) >= 100:  # Flush the buffer to the database every 100 comments
            mh.open()
            for info in commentinfos:
                mh.cud(sql, (info['wb_id'],
                             info['comment_content'],
                             info['comment_userid'],
                             info['comment_username'],
                             info['comment_like'],
                             info['comment_createtime'], userid))
            mh.tijiao()
            mh.close()
            commentinfos = []  # Clear the buffer after committing

    if len(commentinfos) > 0:  # Commit whatever is left in the buffer
        mh.open()
        for info in commentinfos:
            mh.cud(
                sql,
                (info['wb_id'], info['comment_content'],
                 info['comment_userid'],
                 info['comment_username'],
                 info['comment_like'],
                 info['comment_createtime'], userid))
        mh.tijiao()
        mh.close()

    print("--------- 此微博的全部评论爬取完毕!---------\n\n")
Example No. 5
def fetch_pages(user_id):
    cookie = Cookie_Process.read_cookie()  # Read the cookie stored on disk
    cookies = {"Cookie": cookie}

    # Fetch the first page of the user's Weibo ('https://weibo.cn/%d') to get the username and total page count
    proxies = get_random_ip(ip_list)
    url_user = "******" % (user_id, 1)
    r_user = requests.get(url_user, cookies=cookies, proxies=proxies)
    soup_user = BeautifulSoup(r_user.text, 'lxml')

    # If the user has posted no Weibo, bail out
    panduan_weibo = soup_user.select_one('.tc').text[3:-1]
    if panduan_weibo == '0':
        print('This user has 0 Weibo posts!')
        return

    user_contents = soup_user.select_one('.ut').select('.ctt')
    temp_user = user_contents[0].text.split()
    wb_username = temp_user[0]  # Weibo username
    # print(wb_username)
    try:
        page_num = int(
            soup_user.select_one('.pa').text.split()[1].split('/')[1]
            [:-1]) - 1  # Total number of Weibo pages
        print('--------- Total Weibo pages: ' + str(page_num) + ' ---------\n')
    except Exception:
        page_num = 1

    mblogs = []  # Buffer this batch of posts in a list before writing them to the database
    page_id = 1
    while page_id <= page_num:
        try:
            mblogs.extend(fetch_weibo_data(
                user_id, wb_username,
                page_id))  # Scrape each page's posts via fetch_weibo_data
            # NOTE: 'flag' is not defined in this example; presumably a
            # module-level stop signal set by fetch_weibo_data
            if flag == 1:
                continue

        except Exception as e:
            print(e)
        if page_id % 20 == 0:  # Flush the buffer to the database every 20 pages
            # Save to the MySQL database
            db = get_db()  # Read the DB config once instead of once per argument
            mh = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
            sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
            mh.open()
            for blog in mblogs:
                mh.cud(
                    sql,
                    (blog['wb_userid'], blog['wb_username'],
                     blog['wb_id'], filter_emoji(blog['wb_content']),
                     blog['wb_createtime'], blog['wb_forwardnum'],
                     blog['wb_commentnum'], blog['wb_likenum']))
            mh.tijiao()
            mh.close()
            mblogs = []  # Clear the buffer after committing
        page_id = page_id + 1
    if len(mblogs) > 0:  # Commit whatever is left in the buffer
        # Save to the MySQL database
        db = get_db()
        mh = mysqlHelper(db[0], db[1], db[2], db[3], db[4], int(db[5]))
        sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        mh.open()
        for blog in mblogs:
            mh.cud(sql,
                   (blog['wb_userid'], blog['wb_username'],
                    blog['wb_id'], filter_emoji(blog['wb_content']),
                    blog['wb_createtime'], blog['wb_forwardnum'],
                    blog['wb_commentnum'], blog['wb_likenum']))
        mh.tijiao()
        mh.close()
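
Examples 4 and 5 pass proxies=get_random_ip(ip_list) to requests. A minimal sketch of what that helper likely does, picking one proxy from a pool and wrapping it in the mapping requests expects; the pool entry below is a placeholder:

import random

ip_list = ['127.0.0.1:8888']  # placeholder pool; real entries are live proxy host:port strings

def get_random_ip(ip_list):
    # Hypothetical: requests expects {'http': ..., 'https': ...}
    proxy = random.choice(ip_list)
    return {'http': 'http://' + proxy, 'https': 'https://' + proxy}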