def fetch_pages(keyword, start_time, end_time):
    """
    使用beatifulsoul获取网页sorce代码,来获取页码信息,获取页码信息后,使用fetch_weibo_data来循环爬取每个面信息
    返回0(爬取失败) or blogs(爬取成功的list数据)
    """
    mycookie = ""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36', "cookie": mycookie}

    resp = requests.get(url_template.format(keyword, start_time, end_time, '1'), headers=headers, allow_redirects=False)

    if resp.status_code == 200:
        print("fetch_pages: response status OK")
    else:
        print("!!! Unexpected response status:", resp.status_code)

    try:
        resp.raise_for_status()
        print("Request completed without an HTTP error")
    except requests.HTTPError as err:
        print("!!! Request failed:", err)
        print("!!! Redirect history:", resp.history)

    soup = BeautifulSoup(resp.text, 'lxml')


    if str(soup.select_one('.card-wrap').select_one('p').text).startswith('抱歉'):  # no results for this search condition, bail out
        print("No results for this search condition...")
        return 0
    try:
        page_num = len(soup.select_one('.m-page').select('li'))  # total number of result pages in this time window
        print(start_time + ' to ' + end_time + " - total result pages in this window: %d" % page_num)
    except Exception:
        page_num = 1

    mblogs = []
    for page_id in range(1, page_num + 1):
        try:
            mblogs.extend(fetch_weibo_data(keyword, start_time, end_time, page_id, page_num))  # crawl the posts on each page
        except Exception as e:
            print(e)
        time.sleep(1)
    cursor = conn.cursor()
    sql = "insert into jia(wb_id,wb_username,wb_userid,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum,wb_url,wb_mood) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    for i in range(len(mblogs)):
        cursor.execute(sql, (mblogs[i]['wb_id'], mblogs[i]['wb_username'], mblogs[i]['wb_userid'],
                             filter_emoji(mblogs[i]['wb_content']), mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                             mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum'], mblogs[i]['wb_url'], mblogs[i]['wb_mood']))
        print('************** %s saved successfully **************' % mblogs[i]['wb_id'])
    conn.commit()
    cursor.close()
    return mblogs
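# The function above relies on several module-level names that this listing
# does not show: url_template, conn, fetch_weibo_data and filter_emoji. Below
# is a minimal sketch of that setup, assuming pymysql as the MySQL driver; the
# search URL parameters, connection values and date format are illustrative
# assumptions, not the original values.
import time

import pymysql
import requests
from bs4 import BeautifulSoup

# Assumed search URL pattern: keyword, start time, end time, page number.
url_template = ("https://s.weibo.com/weibo?q={}"
                "&timescope=custom:{}:{}&page={}")

# Module-level MySQL connection used when saving the crawled posts.
conn = pymysql.connect(host="localhost", user="root", password="your_password",
                       database="weibo", charset="utf8mb4")

if __name__ == "__main__":
    blogs = fetch_pages("some keyword", "2020-01-01", "2020-01-02")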
Example 2
def fetch_pages(keyword, start_time, end_time, cookie):
    cookies = {"Cookie": cookie}
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    #print(url_template.format(keyword, start_time, end_time, '1'))
    resp = requests.get(url_template.format(keyword, start_time, end_time,
                                            '1'),
                        cookies=cookies,
                        headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    if str(soup.select_one('.card-wrap').select_one('p').text).startswith(
            '抱歉'):  # no results for this search condition, bail out
        print("No results for this search condition!\nPlease choose different filters...")
        return
    else:
        mblogs = []  # results for this time window are buffered in a list, then written to csv
        try:
            # crawl at most the first 20 pages for each time window
            t = soup.select_one(".s-scroll").select("li")
            page_num = len(t)
            if page_num > 20:
                page_num = 20
        except Exception:
            page_num = 1
        for page_id in range(1, page_num + 1):
            try:
                mblogs.extend(
                    fetch_weibo_data(keyword, start_time, end_time, page_id,
                                     cookie))  # crawl the posts on each page
                time.sleep(2)
            except Exception as e:
                print(e)
    # write the results to csv
    with open('test.csv', 'a+', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        for i in range(len(mblogs)):
            row = [
                mblogs[i]['wb_id'], mblogs[i]['wb_username'],
                mblogs[i]['wb_userid'],
                filter_emoji(mblogs[i]['wb_content']),
                mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum'],
                mblogs[i]['wb_url']
            ]
            writer.writerow(row)
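# The variant above appends rows to test.csv without ever writing a header
# row. A one-off snippet such as the sketch below (illustrative, not part of
# the original script) can create the file with column names that mirror the
# dict keys used above.
import csv

CSV_HEADER = ["wb_id", "wb_username", "wb_userid", "wb_content",
              "wb_createtime", "wb_forwardnum", "wb_commentnum",
              "wb_likenum", "wb_url"]

with open("test.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    csv.writer(csvfile).writerow(CSV_HEADER)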
Example 3
def fetch_comment_data(wbid, userid):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored on disk
    cookies = {"Cookie": cookie}
    proxies = get_random_ip(ip_list)
    r_comment = requests.get('https://weibo.cn/comment/{}'.format(wbid),
                             cookies=cookies,
                             proxies=proxies)
    soup_comment = BeautifulSoup(r_comment.text, 'lxml')
    flag = False
    try:
        flag = soup_comment.select('.c')[-1].text.startswith('还没有人针对')
    except Exception:
        pass  # leave flag as False if the page structure is unexpected

    if flag:
        print("--------- This post has no comments! ---------\n")
        return
    else:
        try:
            page_num = int(
                soup_comment.select_one(".pa").text.split()[-1].split("/")
                [-1].split("页")[0])
        except Exception as e:
            page_num = 1

    mh = mysqlHelper(get_db()[0],
                     get_db()[1],
                     get_db()[2],
                     get_db()[3],
                     get_db()[4], int(get_db()[5]))
    sql = "insert into comment(wb_id,comment_content,comment_userid,comment_username,comment_like,comment_createtime,userid) values(%s,%s,%s,%s,%s,%s,%s)"

    page_id = 1
    commentinfos = []
    print("--------- 此微博 {} 的评论页数共有 {} 页 ---------\n".format(wbid, page_num))
    while page_id < page_num + 1:

        time.sleep(random.uniform(4.5, 6.5))  # random pause between requests

        print("++++++ 正在爬取此微博 {} 的第 {} 页评论...... ++++++\n".format(
            wbid, page_id))
        r_comment = requests.get(url_comment.format(wbid, page_id),
                                 cookies=cookies)
        soup_comment = BeautifulSoup(r_comment.text, 'lxml')
        comment_list = soup_comment.select(".c")

        for l in comment_list:
            if str(l.get("id")).startswith("C_"):
                comment_content = filter_emoji(l.select_one(".ctt").text)
                comment_userid = l.select_one("a").get("href")[3:]
                comment_username = l.select_one("a").text
                comment_like = l.select_one(".cc").text.strip()[2:-1]
                comment_createtime = time_process(
                    l.select_one(".ct").text.strip()[:-5])
                print("评论内容  :" + comment_content)
                print("评论用户ID:" + comment_userid)
                print("评论用户名:" + comment_username)
                print("评论赞数  :" + comment_like)
                print("评论时间  :" + comment_createtime)
                print('----------------------------\n')
                commentinfo = {  # build a record for one comment
                    'wb_id': wbid,
                    'comment_content': comment_content,
                    'comment_userid': comment_userid,
                    'comment_username': comment_username,
                    'comment_like': comment_like,
                    'comment_createtime': comment_createtime,
                    'userid': userid
                }
                commentinfos.append(commentinfo)

        page_id = page_id + 1

        if (len(commentinfos) >= 100):
            mh.open()
            for i in range(len(commentinfos)):
                mh.cud(sql, (commentinfos[i]['wb_id'],
                             commentinfos[i]['comment_content'],
                             commentinfos[i]['comment_userid'],
                             commentinfos[i]['comment_username'],
                             commentinfos[i]['comment_like'],
                             commentinfos[i]['comment_createtime'], userid))
            mh.tijiao()
            mh.close()
            commentinfos = []

    if (len(commentinfos) > 0):
        mh.open()
        for i in range(len(commentinfos)):
            mh.cud(
                sql,
                (commentinfos[i]['wb_id'], commentinfos[i]['comment_content'],
                 commentinfos[i]['comment_userid'],
                 commentinfos[i]['comment_username'],
                 commentinfos[i]['comment_like'],
                 commentinfos[i]['comment_createtime'], userid))
        mh.tijiao()
        mh.close()

    print("--------- 此微博的全部评论爬取完毕!---------\n\n")
Example 4
def fetch_user_data(user_id, keyword, cookie):
    cookies = {"Cookie": cookie}

    try:
        # fetch the user's post count, following count and follower count via 'https://weibo.cn/<user_id>'
        url_user = "https://weibo.cn/%s" % (user_id)  # URL pattern per the comment above; the listing masks the literal string
        r_user = requests.get(url_user, cookies=cookies)
        soup_user = BeautifulSoup(r_user.text, 'lxml')
        user_wbnum = soup_user.select_one('.tc').text
        user_wbnum = re.sub("\D", "", user_wbnum)  # 微博数量
        user_follow = soup_user.select_one('.tip2').select('a')[0].text
        user_follow = re.sub("\D", "", user_follow)  # 关注数量
        user_fan = soup_user.select_one('.tip2').select('a')[1].text
        user_fan = re.sub("\D", "", user_fan)  # 粉丝数量

        all_contents = soup_user.select_one('.ut').select('.ctt')
        if len(all_contents) == 2:
            temp_info = all_contents[0].text.split()
            user_name = temp_info[0]
            user_sex = str(temp_info[1]).split('/')[0]
            user_address = str(temp_info[1]).split('/')[1]
            user_renzheng = '无'
            user_oneword = filter_emoji(all_contents[1].text)

        else:
            temp_info = all_contents[0].text.split()
            user_name = temp_info[0]
            user_sex = str(temp_info[1]).split('/')[0]
            user_address = str(temp_info[1]).split('/')[1]
            user_renzheng = filter_emoji(all_contents[1].text)
            user_oneword = filter_emoji(all_contents[2].text)
        if user_oneword == '':
            user_oneword = '无'
    except Exception:
        return  # skip users whose profile page cannot be parsed

    print('User ID: ' + str(user_id))
    print('Username: ' + user_name)
    print('Gender: ' + user_sex)
    print('Location: ' + user_address)
    print('Bio: ' + user_oneword)
    print('Verification: ' + user_renzheng)
    print('Profile URL: ' + url_user)
    print('Post count: ' + user_wbnum)
    print('Following: ' + user_follow)
    print('Followers: ' + user_fan)
    print('Keyword: ' + keyword)
    print('--------------------------------\n')

    userinfo = []
    user = {  # build a record for this user
        'user_id': user_id,
        'user_name': user_name,
        'user_sex': user_sex,
        'user_address': user_address,
        'user_weizhi': '0',
        'user_renzheng': user_renzheng,
        'user_oneword': user_oneword,
        'user_wbnum': user_wbnum,
        'user_follow': user_follow,
        'user_fan': user_fan,
        'user_url': url_user,
        'keyword': keyword
    }
    userinfo.append(user)
    return userinfo
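# filter_emoji is applied to every piece of scraped text before it is printed
# or stored, but its implementation is not included in this listing. A
# plausible minimal version (an assumption, not the original code) strips
# characters outside the Basic Multilingual Plane, a common workaround for
# MySQL columns that are not utf8mb4.
import re

def filter_emoji(text):
    """Remove astral-plane characters (emoji etc.) from a string."""
    return re.sub("[\U00010000-\U0010FFFF]", "", text or "")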
Example 5
def fetch_weibo_data(wb_userid, wb_username, page_id):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored on disk
    proxies2 = get_random_ip(ip_list)
    cookies = {"Cookie": cookie}
    url_weibo = "https://weibo.cn/%s?page=%d" % (wb_userid, page_id)
    # fetch the user's posts from 'https://weibo.cn/<user_id>?page=<n>'
    r_weibo = requests.get(url_weibo, cookies=cookies, proxies=proxies2)
    soup_weibo = BeautifulSoup(r_weibo.text, 'lxml')

    all_contents = soup_weibo.select('.c')[1:-2]

    wb_count = 0
    mblog = []  # 保存处理过的微博

    for card in all_contents:
        wb_id = str(card.get('id')).split("_")[1]
        wb_content = filter_emoji(card.select_one('.ctt').text)  # post text
        wb_forwardnum = wb_commentnum = wb_likenum = ''  # defaults in case a counter link is missing

        temp_href = card.select('a')
        for href in temp_href:
            link = href.get('href') or ''
            if 'comment' in link and '原文评论' not in href.text:
                wb_commentnum = href.text[3:-1]

            if 'attitude' in link:
                wb_likenum = href.text[2:-1]

            if 'repost' in link:
                wb_forwardnum = href.text[3:-1]

        wb_createtime = time_process(
            card.select_one('.ct').text.split('\xa0')[0])  # post creation time

        print('Username: ' + wb_username)
        print('User ID: ' + wb_userid)
        print('Post ID: ' + wb_id)
        print('Post content: ' + wb_content)
        print('Comments: ' + wb_commentnum)
        print('Likes: ' + wb_likenum)
        print('Reposts: ' + wb_forwardnum)
        print('Created at: ' + wb_createtime)
        print('------------------------------\n')

        blog = {  # build a record for one post
            'wb_userid': wb_userid,
            'wb_username': wb_username,
            'wb_id': wb_id,
            'wb_content': wb_content,
            'wb_createtime': wb_createtime,
            'wb_forwardnum': wb_forwardnum,
            'wb_commentnum': wb_commentnum,
            'wb_likenum': wb_likenum
        }
        mblog.append(blog)
        wb_count = wb_count + 1  # number of posts collected on this page
    global flag
    if wb_count > 0:
        print("---------- Page %s crawled, taking a short break ---------- " % page_id +
              "posts on this page: " + str(wb_count))
        flag = 0
        rest = random.uniform(4.5, 6.5)
        time.sleep(rest)
    else:
        flag = 1
        print("********** Page %s appears to be blocked by anti-crawling; sleeping 10 seconds before continuing...... " % page_id)
        time.sleep(10)
    print()
    print()
    return mblog
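# fetch_weibo_data above reads and writes a module-level anti-crawling flag
# and draws proxies from a module-level ip_list; neither is defined in this
# listing. A minimal sketch of that shared state is below, with illustrative
# placeholder values.
flag = 0  # set to 1 by fetch_weibo_data when a page comes back empty (likely blocked)
ip_list = [
    "123.45.67.89:8080",  # placeholder proxy addresses for get_random_ip
    "98.76.54.32:3128",
]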
Example 6
def fetch_pages(user_id):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored on disk
    cookies = {"Cookie": cookie}

    # fetch the username and the total page count from the first page of 'https://weibo.cn/<user_id>'
    proxies = get_random_ip(ip_list)
    url_user = "https://weibo.cn/%s?page=%d" % (user_id, 1)  # URL template per the usage in fetch_weibo_data; the listing masks the literal string
    r_user = requests.get(url_user, cookies=cookies, proxies=proxies)
    soup_user = BeautifulSoup(r_user.text, 'lxml')

    # if the user has not posted any weibo, return
    panduan_weibo = soup_user.select_one('.tc').text[3:-1]
    if panduan_weibo == '0':
        print('This user has 0 posts!')
        return

    user_contents = soup_user.select_one('.ut').select('.ctt')
    temp_user = user_contents[0].text.split()
    wb_username = temp_user[0]  # username
    # print(wb_username)
    try:
        page_num = int(
            soup_user.select_one('.pa').text.split()[1].split('/')[1]
            [:-1]) - 1  # total number of pages
        print('--------- Total pages of posts: ' + str(page_num) + ' ---------\n')
    except Exception as e:
        page_num = 1

    mblogs = []  # results are buffered in a list, then written to the database
    page_id = 1
    while page_id <= page_num:
        try:
            mblogs.extend(fetch_weibo_data(
                user_id, wb_username,
                page_id))  # crawl the posts on each page
            if flag == 1:
                continue  # page looked blocked; retry the same page without advancing page_id

        except Exception as e:
            print(e)
        if page_id % 20 == 0:  # flush the buffered rows to the database every 20 pages
            # save to the mysql database
            mh = mysqlHelper(get_db()[0],
                             get_db()[1],
                             get_db()[2],
                             get_db()[3],
                             get_db()[4], int(get_db()[5]))
            sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
            mh.open()
            for i in range(len(mblogs)):
                mh.cud(
                    sql,
                    (mblogs[i]['wb_userid'], mblogs[i]['wb_username'],
                     mblogs[i]['wb_id'], filter_emoji(mblogs[i]['wb_content']),
                     mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                     mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum']))
            mh.tijiao()
            mh.close()
            mblogs = []  # clear the buffer after committing
        page_id = page_id + 1
    if len(mblogs) > 0:  # commit whatever is left in the buffer
        # save to the mysql database
        mh = mysqlHelper(get_db()[0],
                         get_db()[1],
                         get_db()[2],
                         get_db()[3],
                         get_db()[4], int(get_db()[5]))
        sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        mh.open()
        for i in range(len(mblogs)):
            mh.cud(sql,
                   (mblogs[i]['wb_userid'], mblogs[i]['wb_username'],
                    mblogs[i]['wb_id'], filter_emoji(mblogs[i]['wb_content']),
                    mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                    mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum']))
        mh.tijiao()
        mh.close()
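# Examples 3 and 6 persist their results through a mysqlHelper class that is
# not shown in this listing. The sketch below is a plausible minimal wrapper
# (assuming pymysql) that matches the open()/cud()/tijiao()/close() calls
# used above; the constructor parameter order is a guess based on the int()
# cast applied to the last get_db() value (presumably the port).
import pymysql

class mysqlHelper:
    def __init__(self, host, user, password, database, charset, port):
        self._params = dict(host=host, user=user, password=password,
                            database=database, charset=charset, port=port)
        self._conn = None
        self._cursor = None

    def open(self):
        self._conn = pymysql.connect(**self._params)
        self._cursor = self._conn.cursor()

    def cud(self, sql, args=None):
        # execute an INSERT/UPDATE/DELETE with parameter substitution
        self._cursor.execute(sql, args)

    def tijiao(self):
        # "tijiao" (提交) = commit the open transaction
        self._conn.commit()

    def close(self):
        self._cursor.close()
        self._conn.close()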