def _huya_first_int(value, default=0):
    """Return the first run of decimal digits in *value* as an int.

    *value* may be a string or a list of strings; BeautifulSoup exposes the
    multi-valued ``class`` attribute as a list, which ``int()`` cannot
    convert directly. *default* is returned when no digit is present.
    """
    import re  # local import: the file's import block is not part of this block

    if isinstance(value, (list, tuple)):
        value = " ".join(str(item) for item in value)
    match = re.search(r"[0-9]+", str(value))
    return int(match.group()) if match else default


def _huya_parse_barrage(label, data_id):
    """Convert one chat-room ``<li>`` element into a row dict for the huya table."""
    barrage_info = {
        "bid": data_id,  # barrage ID
        "type": "",  # barrage category code
        "user_name": "",  # sender name
        "user_noble": 0,  # sender noble level
        "content": "",  # barrage text
        "gift_name": "",  # gift name
        "gift_num": 0,  # gift count
        "other": "",  # any other information
    }

    # The class list is stringified so the category names below can be
    # matched as substrings; the same string is stored for unknown types.
    category = str(label.select_one("li > div")["class"])

    if "msg-smog" in category:  # "smog" barrage (ordinary message)
        barrage_info["type"] = "SG"
        barrage_info["user_name"] = label.select_one(
            "li > div > span:nth-child(1)").text
        barrage_info["content"] = label.select_one(
            "li > div > span:nth-child(3)").text
    elif "msg-normal" in category:  # normal barrage (ordinary message)
        barrage_info["type"] = "NM"
        barrage_info["user_name"] = label.select_one(
            "li > div > span:nth-child(2)").text
        barrage_info["content"] = label.select_one(
            "li > div > span:nth-child(5)").text
    elif "msg-nobleEnter" in category:  # a noble user entered the room
        barrage_info["type"] = "NE"
        barrage_info["user_name"] = label.select_one(
            "li > div > div > p > span:nth-child(1)").text
        # The original stored the raw class list in this integer field.
        # NOTE(review): assumes the noble level is the number embedded in
        # one of the class names - confirm against the live page markup.
        barrage_info["user_noble"] = _huya_first_int(
            label.select_one("li > div > div")["class"])
        barrage_info["content"] = "驾临直播间"
    elif "msg-nobleSpeak" in category:  # a noble user spoke
        barrage_info["type"] = "NS"
        barrage_info["user_name"] = label.select_one(
            "li > div > p > span:nth-child(2)").text
        # int() on the class list always raised TypeError; extract the
        # embedded number instead (same assumption as the branch above).
        barrage_info["user_noble"] = _huya_first_int(
            label.select_one("li > div")["class"])
        barrage_info["content"] = label.select_one(
            "li > div > p > span:nth-child(5)").text
    elif "tit-h-send" in category:  # gift-sent notice
        barrage_info["type"] = "SD"
        barrage_info["user_name"] = label.select_one(
            "li > div > span:nth-child(1)").text
        barrage_info["gift_name"] = label.select_one(
            "li > div > span:nth-child(3) > img")["alt"]
        # The original called int() on the text of an <img>, which is always
        # empty and therefore raised ValueError. Read the enclosing span's
        # text (which includes any descendant text) and fall back to 0.
        # NOTE(review): presumably the count is rendered as text inside the
        # fourth span - confirm against the live page markup.
        count_span = label.select_one("li > div > span:nth-child(4)")
        barrage_info["gift_num"] = _huya_first_int(
            count_span.text if count_span is not None else "")
    elif "msg-onTVLottery" in category:
        barrage_info["type"] = "TV"
        barrage_info["user_name"] = label.select_one(
            "li > div > span:nth-child(2)").text
        barrage_info["content"] = label.select_one(
            "li > div > div > span").text
    elif "msg-auditorSys" in category:  # system notice (auditor)
        barrage_info["type"] = "AS"
        barrage_info["other"] = label.text
    elif "msg-sys" in category:  # system notice
        barrage_info["type"] = "SY"
        barrage_info["other"] = label.text
    else:  # any other category: keep its class string for later inspection
        barrage_info.update(type="OT", other="弹幕名称" + category)

    return barrage_info


def crawler(live_name, live_url, mysql, *, cache_path=r"E:\temp",
            duration=36000, interval=0.5):
    """Poll a Huya live room's chat list and store new barrages in MySQL.

    A table named ``huya_<YYYYmmdd_HHMM>`` is created in ``live_barrage``,
    then the element with id ``chat-room__list`` is re-read every
    *interval* seconds; each ``<li>`` not seen before is parsed and the
    batch is passed to ``mysql.insert``.

    Args:
        live_name: Streamer name, written into the table comment.
        live_url: URL of the live room to open.
        mysql: Object providing ``create(sql)`` and ``insert(table, rows)``.
        cache_path: Cache directory handed to ``Chrome``.
        duration: Total polling time in seconds.
        interval: Target seconds per polling round; must be greater than 0.

    NOTE(review): the browser opened here is never closed by this function.
    """
    browser = Chrome(cache_path=cache_path)
    browser.get(live_url)  # open the target streamer's live room

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "huya_{}".format(time_string)

    sql_create = (
        "CREATE TABLE live_barrage.`huya_{}` ("
        "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT '弹幕ID(barrage id)',"
        "`type` char(10) DEFAULT NULL COMMENT '弹幕类型',"
        "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '弹幕抓取时间(约等于弹幕发布时间)',"
        " `user_name` varchar(40) DEFAULT NULL COMMENT '弹幕发布者名称',"
        " `user_noble` int(11) DEFAULT NULL COMMENT '弹幕发布者贵族等级',"
        " `content` varchar(100) DEFAULT NULL COMMENT '弹幕内容',"
        " `gift_name` varchar(40) DEFAULT NULL COMMENT '赠送礼物名称',"
        " `gift_num` int(11) DEFAULT '0' COMMENT '赠送礼物数量',"
        " `other` varchar(60) DEFAULT NULL COMMENT '弹幕其他信息',"
        " PRIMARY KEY (`bid`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='虎牙弹幕({})';"
    )
    mysql.create(sql_create.format(time_string, live_name))

    print("开始抓取虎牙直播弹幕.....")

    total_time = 0  # accumulated processing time in milliseconds
    total_num = 0  # number of polling rounds completed
    data_id_max = 0  # data-id of the most recently accepted barrage

    for _ in range(int(duration / interval)):
        start_time = time.time()

        label_html = browser.find_element_by_id(
            "chat-room__list").get_attribute("innerHTML")
        bs = BeautifulSoup(label_html, 'lxml')  # parse the chat list markup

        barrage_list = []
        for label in bs.select("li"):
            data_id = int(label["data-id"])
            # IDs within the 100 below the current maximum were already
            # fetched. An ID further below than that is accepted as new
            # (presumably to cope with the counter restarting - confirm).
            if data_id_max - 101 < data_id <= data_id_max:
                continue
            data_id_max = data_id
            barrage_list.append(_huya_parse_barrage(label, data_id))

        mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        # Read the clock once: checking and sleeping on two separate reads
        # could pass a negative value to time.sleep(), which raises ValueError.
        remaining = interval - (time.time() - start_time)
        if remaining > 0:
            time.sleep(remaining)

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,",
              "(共计:", data_id_max, ")", "|",
              "运行时间:", round(total_time / total_num), "毫秒",
              "(", round(total_time), "/", total_num, ")")
def _douyu_parse_barrage(label):
    """Convert one barrage-list ``<li>`` element into a row dict for the douyu table."""
    barrage_info = {
        "type": "",  # barrage category
        "user_name": "",  # sender name
        "user_level": 0,  # sender level
        "content": "",  # barrage text
        "text": "",  # any other information
    }

    type_class = label.select_one("li > div")["class"]
    if "Barrage-notice" in type_class and "normalBarrage" not in type_class:
        barrage_info["type"] = "NOTICE"
    elif "normalBarrage" in type_class:
        barrage_info["type"] = "NORMAL"
    elif "Barrage-userEnter" in type_class:
        barrage_info["type"] = "ENTER"
    elif "Barrage-message" in type_class:
        barrage_info["type"] = "MESSAGE"

    for info_label in label.select("li > div > span"):
        # A span without a class attribute is skipped instead of raising
        # KeyError and aborting the whole crawl.
        info_label_class = info_label.get("class") or []
        if "UserLevel" in info_label_class:
            # Keep the default level when the title carries no digits
            # (the original raised AttributeError on a failed match), and
            # store an int to match the integer default of this field.
            level_match = re.search(r"[0-9]+", info_label.get("title", ""))
            if level_match:
                barrage_info["user_level"] = int(level_match.group())
        elif "Barrage-nickName" in info_label_class:
            barrage_info["user_name"] = info_label.text.replace(" ", "")
        elif "Barrage-content" in info_label_class:
            barrage_info["content"] = info_label.text.replace(" ", "")
        elif "Barrage-text" in info_label_class:
            barrage_info["text"] = info_label.text.replace(" ", "")

    return barrage_info


def crawler(live_name, live_url, mysql, *, cache_path=r"E:\Temp",
            duration=36000, interval=0.5):
    """Poll a Douyu live room's barrage list and store new barrages in MySQL.

    A table named ``douyu_<YYYYmmdd_HHMM>`` is created in ``live_barrage``,
    then the element with id ``js-barrage-list`` is re-read every
    *interval* seconds; each ``<li>`` whose id is not among the 200 most
    recently seen ids is parsed and the batch is passed to ``mysql.insert``.

    Args:
        live_name: Streamer name, written into the table comment.
        live_url: URL of the live room to open.
        mysql: Object providing ``create(sql)`` and ``insert(table, rows)``.
        cache_path: Cache directory handed to ``Chrome``.
        duration: Total polling time in seconds.
        interval: Target seconds per polling round; must be greater than 0.

    NOTE(review): the browser opened here is never closed by this function.
    """
    browser = Chrome(cache_path=cache_path)  # open the Chrome browser
    browser.get(live_url)  # open the target streamer's live room
    time.sleep(10)  # fixed pause before the first read of the barrage list

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "douyu_{}".format(time_string)

    sql_create = (
        "CREATE TABLE live_barrage.`douyu_{}` ("
        "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT '弹幕ID(barrage id)',"
        "`type` varchar(60) DEFAULT NULL COMMENT '弹幕类型',"
        "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '弹幕抓取时间(约等于弹幕发布时间)',"
        " `user_name` varchar(40) DEFAULT NULL COMMENT '弹幕发布者名称',"
        " `user_level` int(11) DEFAULT NULL COMMENT '弹幕发布者等级',"
        " `content` varchar(100) DEFAULT NULL COMMENT '弹幕内容',"
        " `text` varchar(100) DEFAULT NULL COMMENT '弹幕其他信息',"
        " PRIMARY KEY (`bid`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='斗鱼弹幕({})';"
    )
    mysql.create(sql_create.format(time_string, live_name))

    print("开始抓取斗鱼直播弹幕.....")

    total_time = 0  # accumulated processing time in milliseconds
    total_num = 0  # number of polling rounds completed
    barrage_id_list = []  # the 200 most recently seen <li> ids, oldest first
    data_id_max = 0  # running count of new barrages seen across all rounds

    for _ in range(int(duration / interval)):
        start_time = time.time()

        label_html = browser.find_element_by_id(
            "js-barrage-list").get_attribute("innerHTML")
        soup = BeautifulSoup(label_html, 'lxml')  # parse the barrage list markup

        barrage_list = []
        for label in soup.select("li"):
            bid = str(label["id"])  # barrage ID
            if bid in barrage_id_list:
                continue
            barrage_id_list.append(bid)
            if len(barrage_id_list) > 200:
                del barrage_id_list[0]  # drop the oldest remembered id
            barrage_list.append(_douyu_parse_barrage(label))

        # Update the running total before reporting it; the original added
        # the batch only after printing, so the printed total always lagged
        # one round behind.
        data_id_max += len(barrage_list)

        # A round with 200 or more unseen items is reported as the list not
        # having scrolled, and is not written to the database.
        batch_accepted = len(barrage_list) < 200
        if batch_accepted:
            mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        if batch_accepted:
            print("本次时间范围内新增弹幕:", len(barrage_list), "条,",
                  "(共计:", data_id_max, ")", "|",
                  "运行时间:", round(total_time / total_num), "毫秒",
                  "(", round(total_time), "/", total_num, ")")
        else:
            print("本次时间范围内弹幕列表未自动向下滚动...")

        # Read the clock once: checking and sleeping on two separate reads
        # could pass a negative value to time.sleep(), which raises ValueError.
        remaining = interval - (time.time() - start_time)
        if remaining > 0:
            time.sleep(remaining)