# Standard-library and third-party imports used by the crawler functions below
# (assumed from usage; the original modules import these at file level).
import csv
import random
import re
import time

import requests
from bs4 import BeautifulSoup


def fetch_pages(keyword, start_time, end_time):
    """
    Use BeautifulSoup to parse the page source and read the page count, then
    call fetch_weibo_data in a loop to crawl every result page.
    Returns 0 (crawl failed) or mblogs (list of crawled records).
    """
    mycookie = ""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
        "cookie": mycookie
    }
    resp = requests.get(url_template.format(keyword, start_time, end_time, '1'),
                        headers=headers, allow_redirects=False)
    if resp.status_code == 200:
        print("fetch_pages: request returned a normal status")
    else:
        print("!!! abnormal crawl status:", resp.status_code)
    try:
        resp.raise_for_status()            # raises HTTPError for 4xx/5xx responses
        print("request completed without errors")
    except requests.HTTPError as err:      # the request failed
        print("!!! request error:", err)
        print("!!! request history:", resp.history)
    soup = BeautifulSoup(resp.text, 'lxml')
    # If the first result card starts with '抱歉', this search has no results: bail out.
    if str(soup.select_one('.card-wrap').select_one('p').text).startswith('抱歉'):
        print("no results for this search condition...")
        return 0
    try:
        # Total number of result pages inside this time window.
        page_num = len(soup.select_one('.m-page').select('li'))
        page_num = int(page_num)
        print(start_time + ' to ' + end_time + ": total result pages in this window: %d" % page_num)
    except Exception:
        page_num = 1
    mblogs = []
    for page_id in range(page_num):
        page_id += 1
        try:
            # Crawl one result page per call to fetch_weibo_data.
            mblogs.extend(fetch_weibo_data(keyword, start_time, end_time, page_id, page_num))
        except Exception as e:
            print(e)
        time.sleep(1)
    for i in range(len(mblogs)):
        cursor = conn.cursor()
        sql = ("insert into jia(wb_id,wb_username,wb_userid,wb_content,wb_createtime,"
               "wb_forwardnum,wb_commentnum,wb_likenum,wb_url,wb_mood) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (mblogs[i]['wb_id'], mblogs[i]['wb_username'], mblogs[i]['wb_userid'],
                             filter_emoji(mblogs[i]['wb_content']), mblogs[i]['wb_createtime'],
                             mblogs[i]['wb_forwardnum'], mblogs[i]['wb_commentnum'],
                             mblogs[i]['wb_likenum'], mblogs[i]['wb_url'], mblogs[i]['wb_mood']))
        print('************** record saved **************')
        conn.commit()
        cursor.close()
    return mblogs
def fetch_pages(keyword, start_time, end_time, cookie):
    cookies = {"Cookie": cookie}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    # print(url_template.format(keyword, start_time, end_time, '1'))
    resp = requests.get(url_template.format(keyword, start_time, end_time, '1'),
                        cookies=cookies, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    first_card_text = soup.select_one('.card-wrap').select_one('p').text
    # If the first result card starts with '抱歉', this search has no results: bail out.
    if str(first_card_text).startswith('抱歉'):
        print("No results for this search condition!\nPlease choose different filters...")
        return
    else:
        mblogs = []  # results for this time window are buffered in a list, then written to csv
        try:
            # Crawl at most the first 20 result pages per time window.
            t = soup.select_one(".s-scroll").select("li")
            page_num = len(t)
            if page_num > 20:
                page_num = 20
        except Exception:
            page_num = 1
        for page_id in range(page_num):
            page_id = page_id + 1
            try:
                # Crawl one result page per call to fetch_weibo_data.
                mblogs.extend(fetch_weibo_data(keyword, start_time, end_time, page_id, cookie))
                time.sleep(2)
            except Exception as e:
                print(e)
        # Write the buffered records to csv.
        with open('test.csv', 'a+', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.writer(csvfile)
            for i in range(len(mblogs)):
                row = [
                    mblogs[i]['wb_id'], mblogs[i]['wb_username'], mblogs[i]['wb_userid'],
                    filter_emoji(mblogs[i]['wb_content']), mblogs[i]['wb_createtime'],
                    mblogs[i]['wb_forwardnum'], mblogs[i]['wb_commentnum'],
                    mblogs[i]['wb_likenum'], mblogs[i]['wb_url']
                ]
                writer.writerow(row)
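# Both fetch_pages variants above format a module-level url_template with the
# keyword, the time window and the page number, but its definition is not part
# of this section. A minimal sketch, assuming the s.weibo.com search endpoint
# with a custom timescope; the exact query parameters are an assumption, not
# taken from the original source.
url_template = ("https://s.weibo.com/weibo?q={}"
                "&timescope=custom:{}:{}&page={}")

# Example call (hypothetical keyword and hour-granular time window):
# url_template.format('疫苗', '2020-01-01-0', '2020-01-02-0', '1')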
def fetch_comment_data(wbid, userid):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the cookie file
    cookies = {"Cookie": cookie}
    proxies = get_random_ip(ip_list)
    r_comment = requests.get('https://weibo.cn/comment/{}'.format(wbid),
                             cookies=cookies, proxies=proxies)
    soup_comment = BeautifulSoup(r_comment.text, 'lxml')
    flag = False
    try:
        # '还没有人针对...' in the last .c node means the post has no comments yet.
        flag = soup_comment.select('.c')[-1].text.startswith('还没有人针对')
    except Exception:
        pass
    if flag:
        print("--------- This post has no comments! ---------\n")
        return
    else:
        try:
            # Total number of comment pages, parsed from the pager text ('x/y页').
            page_num = int(soup_comment.select_one(".pa").text.split()[-1]
                           .split("/")[-1].split("页")[0])
        except Exception:
            page_num = 1
    mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                     get_db()[4], int(get_db()[5]))
    sql = ("insert into comment(wb_id,comment_content,comment_userid,comment_username,"
           "comment_like,comment_createtime,userid) values(%s,%s,%s,%s,%s,%s,%s)")
    page_id = 1
    commentinfos = []
    print("--------- Post {} has {} pages of comments ---------\n".format(wbid, page_num))
    while page_id < page_num + 1:
        time.sleep(random.uniform(4.5, 6.5))  # sleep between comment pages
        print("++++++ Crawling comment page {} of post {} ...... ++++++\n".format(page_id, wbid))
        r_comment = requests.get(url_comment.format(wbid, page_id), cookies=cookies)
        soup_comment = BeautifulSoup(r_comment.text, 'lxml')
        comment_list = soup_comment.select(".c")
        for l in comment_list:
            # Comment nodes have ids that start with 'C_'.
            if str(l.get("id")).startswith("C_"):
                comment_content = filter_emoji(l.select_one(".ctt").text)
                comment_userid = l.select_one("a").get("href")[3:]
                comment_username = l.select_one("a").text
                comment_like = l.select_one(".cc").text.strip()[2:-1]
                comment_createtime = time_process(l.select_one(".ct").text.strip()[:-5])
                print("comment content : " + comment_content)
                print("comment user id : " + comment_userid)
                print("comment username: " + comment_username)
                print("comment likes   : " + comment_like)
                print("comment time    : " + comment_createtime)
                print('----------------------------\n')
                commentinfo = {
                    'wb_id': wbid,  # one comment record
                    'comment_content': comment_content,
                    'comment_userid': comment_userid,
                    'comment_username': comment_username,
                    'comment_like': comment_like,
                    'comment_createtime': comment_createtime,
                    'userid': userid
                }
                commentinfos.append(commentinfo)
        page_id = page_id + 1
        # Flush to MySQL every 100 buffered comments.
        if len(commentinfos) >= 100:
            mh.open()
            for i in range(len(commentinfos)):
                mh.cud(sql, (commentinfos[i]['wb_id'], commentinfos[i]['comment_content'],
                             commentinfos[i]['comment_userid'], commentinfos[i]['comment_username'],
                             commentinfos[i]['comment_like'], commentinfos[i]['comment_createtime'],
                             userid))
            mh.tijiao()
            mh.close()
            commentinfos = []
    # Flush whatever is left after the last page.
    if len(commentinfos) > 0:
        mh.open()
        for i in range(len(commentinfos)):
            mh.cud(sql, (commentinfos[i]['wb_id'], commentinfos[i]['comment_content'],
                         commentinfos[i]['comment_userid'], commentinfos[i]['comment_username'],
                         commentinfos[i]['comment_like'], commentinfos[i]['comment_createtime'],
                         userid))
        mh.tijiao()
        mh.close()
    print("--------- All comments of this post have been crawled! ---------\n\n")
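# fetch_comment_data (and the timeline functions below) rely on two names that
# are defined elsewhere in the project: url_comment, the paged comment URL, and
# get_random_ip(ip_list), which picks a proxy for requests. A minimal sketch of
# both, assuming ip_list holds 'host:port' strings and that the comment pager
# uses a ?page= query parameter; neither detail is confirmed by this section.
import random

url_comment = 'https://weibo.cn/comment/{}?page={}'   # assumed format: post id, page number

def get_random_ip(ip_list):
    # Pick one proxy at random and wrap it in the scheme-to-proxy mapping that
    # requests' `proxies=` argument expects.
    proxy = random.choice(ip_list)                     # e.g. '123.45.67.89:8080'
    return {'http': 'http://' + proxy, 'https': 'http://' + proxy}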
def fetch_user_data(user_id, keyword, cookie):
    cookies = {"Cookie": cookie}
    try:
        # The ' https://weibo.cn/%d ' page gives the user's post count,
        # follow count and fan count.
        url_user = "******" % (user_id)
        r_user = requests.get(url_user, cookies=cookies)
        soup_user = BeautifulSoup(r_user.text, 'lxml')
        user_wbnum = soup_user.select_one('.tc').text
        user_wbnum = re.sub(r"\D", "", user_wbnum)  # number of posts
        user_follow = soup_user.select_one('.tip2').select('a')[0].text
        user_follow = re.sub(r"\D", "", user_follow)  # number of follows
        user_fan = soup_user.select_one('.tip2').select('a')[1].text
        user_fan = re.sub(r"\D", "", user_fan)  # number of fans
        all_contents = soup_user.select_one('.ut').select('.ctt')
        if len(all_contents) == 2:
            # No verification line: name/sex/address, then the one-line bio.
            temp_info = all_contents[0].text.split()
            user_name = temp_info[0]
            user_sex = str(temp_info[1]).split('/')[0]
            user_address = str(temp_info[1]).split('/')[1]
            user_renzheng = '无'
            user_oneword = filter_emoji(all_contents[1].text)
        else:
            # With a verification line: name/sex/address, verification, then the bio.
            temp_info = all_contents[0].text.split()
            user_name = temp_info[0]
            user_sex = str(temp_info[1]).split('/')[0]
            user_address = str(temp_info[1]).split('/')[1]
            user_renzheng = filter_emoji(all_contents[1].text)
            user_oneword = filter_emoji(all_contents[2].text)
        if user_oneword == '':
            user_oneword = '无'
    except Exception:
        return
    print('user id: ' + str(user_id))
    print('username: ' + user_name)
    print('sex: ' + user_sex)
    print('address: ' + user_address)
    print('bio: ' + user_oneword)
    print('verification: ' + user_renzheng)
    print('profile url: ' + url_user)
    print('post count: ' + user_wbnum)
    print('follow count: ' + user_follow)
    print('fan count: ' + user_fan)
    print('keyword: ' + keyword)
    print('--------------------------------\n')
    userinfo = []
    user = {
        'user_id': user_id,  # one user record
        'user_name': user_name,
        'user_sex': user_sex,
        'user_address': user_address,
        'user_weizhi': '0',
        'user_renzheng': user_renzheng,
        'user_oneword': user_oneword,
        'user_wbnum': user_wbnum,
        'user_follow': user_follow,
        'user_fan': user_fan,
        'user_url': url_user,
        'keyword': keyword
    }
    userinfo.append(user)
    return userinfo
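# Every function in this section passes scraped text through filter_emoji before
# printing or storing it; the helper itself lives elsewhere in the project. A
# minimal sketch, assuming its job is to strip characters outside the Basic
# Multilingual Plane (emoji), which a MySQL `utf8` column cannot store; the
# exact regex is an assumption, not the project's implementation.
import re

def filter_emoji(text):
    # Drop supplementary-plane characters (emoji and similar symbols).
    emoji_pattern = re.compile('[\U00010000-\U0010FFFF]', flags=re.UNICODE)
    return emoji_pattern.sub('', text)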
def fetch_weibo_data(wb_userid, wb_username, page_id):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the cookie file
    proxies2 = get_random_ip(ip_list)
    cookies = {"Cookie": cookie}
    # The ' https://weibo.cn/%d ' page lists the user's posts.
    url_weibo = "https://weibo.cn/%s?page=%d" % (wb_userid, page_id)
    r_weibo = requests.get(url_weibo, cookies=cookies, proxies=proxies2)
    soup_weibo = BeautifulSoup(r_weibo.text, 'lxml')
    all_contents = soup_weibo.select('.c')[1:-2]
    wb_count = 0
    mblog = []  # processed posts from this page
    for card in all_contents:
        wb_id = str(card.get('id')).split("_")[1]
        wb_content = filter_emoji(card.select_one('.ctt').text)  # post text
        # Default counts in case a card is missing one of the links.
        wb_commentnum = wb_likenum = wb_forwardnum = '0'
        temp_href = card.select('a')
        for href in temp_href:
            if 'comment' in href.get('href') and '原文评论' not in href.text:
                wb_commentnum = href.text[3:-1]
            if 'attitude' in href.get('href'):
                wb_likenum = href.text[2:-1]
            if 'repost' in href.get('href'):
                wb_forwardnum = href.text[3:-1]
        wb_createtime = time_process(card.select_one('.ct').text.split('\xa0')[0])  # creation time
        print('username: ' + wb_username)
        print('user id: ' + wb_userid)
        print('post id: ' + wb_id)
        print('post text: ' + wb_content)
        print('comment count: ' + wb_commentnum)
        print('like count: ' + wb_likenum)
        print('forward count: ' + wb_forwardnum)
        print('created at: ' + wb_createtime)
        print('------------------------------\n')
        blog = {
            'wb_userid': wb_userid,  # one post record
            'wb_username': wb_username,
            'wb_id': wb_id,
            'wb_content': wb_content,
            'wb_createtime': wb_createtime,
            'wb_forwardnum': wb_forwardnum,
            'wb_commentnum': wb_commentnum,
            'wb_likenum': wb_likenum
        }
        mblog.append(blog)
        wb_count = wb_count + 1  # number of posts on this page
    global flag
    if wb_count > 0:
        print("---------- Page %s crawled, resting ---------- " % page_id
              + "posts on this page: " + str(wb_count))
        flag = 0
        rest = random.uniform(4.5, 6.5)
        time.sleep(rest)
    else:
        # An empty page usually means the anti-crawler kicked in: sleep 10s, then retry.
        flag = 1
        print("********** Page %s hit the anti-crawler; sleeping 10 seconds before retrying ...... " % page_id)
        time.sleep(10)
    print()
    print()
    return mblog
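# fetch_weibo_data and fetch_comment_data normalise weibo.cn's timestamp strings
# with time_process, which is not shown here. A minimal sketch, assuming the
# helper maps relative stamps such as 'X分钟前' and '今天 HH:MM' to an absolute
# 'YYYY-MM-DD HH:MM' string; the set of formats handled by the real helper is
# an assumption.
from datetime import datetime, timedelta
import re

def time_process(raw):
    now = datetime.now()
    raw = raw.strip()
    if '分钟前' in raw:                                  # e.g. '5分钟前'
        minutes = int(re.sub(r'\D', '', raw))
        return (now - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if raw.startswith('今天'):                           # e.g. '今天 12:30'
        return now.strftime('%Y-%m-%d ') + raw.replace('今天', '').strip()
    if '月' in raw and '日' in raw:                      # e.g. '03月01日 08:00'
        stamp = datetime.strptime(raw, '%m月%d日 %H:%M').replace(year=now.year)
        return stamp.strftime('%Y-%m-%d %H:%M')
    return raw                                           # already an absolute timestamp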
def fetch_pages(user_id):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the cookie file
    cookies = {"Cookie": cookie}
    # The first page of ' https://weibo.cn/%d ' gives the username and total page count.
    proxies = get_random_ip(ip_list)
    url_user = "******" % (user_id, 1)
    r_user = requests.get(url_user, cookies=cookies, proxies=proxies)
    soup_user = BeautifulSoup(r_user.text, 'lxml')
    # If the user has published no posts, return immediately.
    panduan_weibo = soup_user.select_one('.tc').text[3:-1]
    if panduan_weibo == '0':
        print('This user has 0 posts!')
        return
    user_contents = soup_user.select_one('.ut').select('.ctt')
    temp_user = user_contents[0].text.split()
    wb_username = temp_user[0]  # username
    # print(wb_username)
    try:
        # Total number of timeline pages, parsed from the pager text ('x/y页').
        page_num = int(soup_user.select_one('.pa').text.split()[1].split('/')[1][:-1]) - 1
        print('--------- total timeline pages: ' + str(page_num) + ' ---------\n')
    except Exception:
        page_num = 1
    mblogs = []  # results are buffered in a list, then written to the database
    page_id = 1
    while page_id <= page_num:
        try:
            # Crawl one timeline page per call to fetch_weibo_data.
            mblogs.extend(fetch_weibo_data(user_id, wb_username, page_id))
            if flag == 1:
                # Anti-crawler detected: retry the same page on the next iteration.
                continue
        except Exception as e:
            print(e)
        if page_id % 20 == 0:
            # Commit the buffered records to MySQL every 20 pages.
            mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                             get_db()[4], int(get_db()[5]))
            sql = ("insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,"
                   "wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) "
                   "values(%s,%s,%s,%s,%s,%s,%s,%s)")
            mh.open()
            for i in range(len(mblogs)):
                mh.cud(sql, (mblogs[i]['wb_userid'], mblogs[i]['wb_username'], mblogs[i]['wb_id'],
                             filter_emoji(mblogs[i]['wb_content']), mblogs[i]['wb_createtime'],
                             mblogs[i]['wb_forwardnum'], mblogs[i]['wb_commentnum'],
                             mblogs[i]['wb_likenum']))
            mh.tijiao()
            mh.close()
            mblogs = []  # clear the buffer after committing
        page_id = page_id + 1
    if len(mblogs) > 0:
        # Commit whatever is left to MySQL.
        mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                         get_db()[4], int(get_db()[5]))
        sql = ("insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,"
               "wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")
        mh.open()
        for i in range(len(mblogs)):
            mh.cud(sql, (mblogs[i]['wb_userid'], mblogs[i]['wb_username'], mblogs[i]['wb_id'],
                         filter_emoji(mblogs[i]['wb_content']), mblogs[i]['wb_createtime'],
                         mblogs[i]['wb_forwardnum'], mblogs[i]['wb_commentnum'],
                         mblogs[i]['wb_likenum']))
        mh.tijiao()
        mh.close()
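# A hedged usage sketch for the user-timeline crawler above. The module-level
# `flag` and `ip_list` globals it assumes, and the prompt text, are illustrative
# only and not taken from the original entry point.
flag = 0                         # shared anti-crawl flag set by fetch_weibo_data
ip_list = ['123.45.67.89:8080']  # proxy pool consumed by get_random_ip (placeholder address)

if __name__ == '__main__':
    target_user_id = input('Weibo user id to crawl: ').strip()
    fetch_pages(target_user_id)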