def _huya_first_int(value, default=0):
    """Return the first run of decimal digits in ``str(value)`` as an int.

    Returns *default* when ``str(value)`` contains no digits.
    """
    # Function-scope import so this helper does not depend on a file-level one.
    import re

    match = re.search(r"[0-9]+", str(value))
    return int(match.group()) if match else default


def _huya_text(label, selector):
    """Return the text of the first element matching *selector*, or "" if none."""
    node = label.select_one(selector)
    return node.text if node is not None else ""


def _huya_attr(label, selector, attr, default=""):
    """Return attribute *attr* of the first element matching *selector*.

    Returns *default* when nothing matches or the attribute is absent.
    """
    node = label.select_one(selector)
    return default if node is None else node.get(attr, default)


def _huya_parse_barrage(label, data_id):
    """Build the row dict for one ``<li>`` chat item of the Huya chat list.

    The item type is decided by substring tests on the class list of the
    item's first-level ``<div>``; missing child nodes yield empty fields
    instead of raising.
    """
    barrage_info = {
        "bid": data_id,  # barrage id
        "type": "",  # barrage type code
        "user_name": "",  # name of the sender
        "user_noble": 0,  # noble level of the sender
        "content": "",  # barrage text
        "gift_name": "",  # gift name
        "gift_num": 0,  # gift count
        "other": ""  # any other information
    }

    # String form of the class list, e.g. "['msg-normal']"; "[]" if absent.
    category = str(_huya_attr(label, "li > div", "class", []))

    if "msg-smog" in category:  # smog barrage (ordinary barrage)
        barrage_info["type"] = "SG"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > span:nth-child(1)")
        barrage_info["content"] = _huya_text(
            label, "li > div > span:nth-child(3)")
    elif "msg-normal" in category:  # normal barrage (ordinary barrage)
        barrage_info["type"] = "NM"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > span:nth-child(2)")
        barrage_info["content"] = _huya_text(
            label, "li > div > span:nth-child(5)")
    elif "msg-nobleEnter" in category:  # a noble user entered the room
        barrage_info["type"] = "NE"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > div > p > span:nth-child(1)")
        # The original stored the raw class list here. The level is taken as
        # the first number in the class names -- assumes the level is encoded
        # there (TODO confirm against the live page).
        barrage_info["user_noble"] = _huya_first_int(
            _huya_attr(label, "li > div > div", "class", []))
        barrage_info["content"] = "驾临直播间"
    elif "msg-nobleSpeak" in category:  # a noble user spoke
        barrage_info["type"] = "NS"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > p > span:nth-child(2)")
        # int() on the class list always raised TypeError; extract the first
        # number from the class names instead (same assumption as above).
        barrage_info["user_noble"] = _huya_first_int(category)
        barrage_info["content"] = _huya_text(
            label, "li > div > p > span:nth-child(5)")
    elif "tit-h-send" in category:  # gift-sent notice
        barrage_info["type"] = "SD"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > span:nth-child(1)")
        barrage_info["gift_name"] = _huya_attr(
            label, "li > div > span:nth-child(3) > img", "alt")
        # An <img> carries no text, so int(img.text) raised ValueError. Read
        # the count from the enclosing span's text instead -- presumably that
        # is where the number is rendered (TODO confirm); 0 if no digits.
        barrage_info["gift_num"] = _huya_first_int(
            _huya_text(label, "li > div > span:nth-child(4)"))
    elif "msg-onTVLottery" in category:
        barrage_info["type"] = "TV"
        barrage_info["user_name"] = _huya_text(
            label, "li > div > span:nth-child(2)")
        barrage_info["content"] = _huya_text(label, "li > div > div > span")
    elif "msg-auditorSys" in category:  # msg-auditorSys notice (system)
        barrage_info["type"] = "AS"
        barrage_info["other"] = label.text
    elif "msg-sys" in category:  # msg-sys notice (system)
        barrage_info["type"] = "SY"
        barrage_info["other"] = label.text
    else:  # any other type: keep the class list for later inspection
        barrage_info.update(type="OT", other="弹幕名称" + category)
    return barrage_info


def crawler(live_name, live_url, mysql):
    """Poll a Huya live room's chat list and store new barrages in MySQL.

    Opens *live_url* in a ``Chrome`` browser object, creates a table named
    ``huya_<YYYYmmdd_HHMM>`` in the ``live_barrage`` schema, then polls the
    element with id ``chat-room__list`` every 0.5 s for 72000 iterations,
    inserting the items not seen in earlier polls.

    Args:
        live_name: Name placed in the table comment.
        live_url: URL of the live room to open.
        mysql: Object exposing ``create(sql)`` and ``insert(table, rows)``.
    """
    poll_interval = 0.5  # seconds between two polls
    poll_count = int(36000 / poll_interval)

    browser = Chrome(cache_path=r"E:\temp")
    browser.get(live_url)  # open the target Huya live room

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "huya_{}".format(time_string)

    sql_create = (
        "CREATE TABLE live_barrage.`huya_{}` ("
        "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT '弹幕ID(barrage id)',"
        "`type` char(10) DEFAULT NULL COMMENT '弹幕类型',"
        "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '弹幕抓取时间(约等于弹幕发布时间)',"
        " `user_name` varchar(40) DEFAULT NULL COMMENT '弹幕发布者名称',"
        " `user_noble` int(11) DEFAULT NULL COMMENT '弹幕发布者贵族等级',"
        " `content` varchar(100) DEFAULT NULL COMMENT '弹幕内容',"
        " `gift_name` varchar(40) DEFAULT NULL COMMENT '赠送礼物名称',"
        " `gift_num` int(11) DEFAULT '0' COMMENT '赠送礼物数量',"
        " `other` varchar(60) DEFAULT NULL COMMENT '弹幕其他信息',"
        " PRIMARY KEY (`bid`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='虎牙弹幕({})';"
    )
    mysql.create(sql_create.format(time_string, live_name))

    print("开始抓取虎牙直播弹幕.....")

    total_time = 0  # accumulated processing time in milliseconds
    total_num = 0  # number of polls performed

    data_id_max = 0  # data-id of the most recently accepted item
    for _ in range(poll_count):

        start_time = time.time()

        label_html = browser.find_element_by_id(
            "chat-room__list").get_attribute("innerHTML")
        bs = BeautifulSoup(label_html, 'lxml')  # parse the chat list markup

        barrage_list = []
        for label in bs.select("li"):
            # Items without a numeric data-id cannot be de-duplicated; skip
            # them instead of aborting the whole crawl.
            try:
                data_id = int(label.get("data-id"))
            except (TypeError, ValueError):
                continue

            # Already fetched: id lies within the last 100 ids below the
            # current maximum. An id further below than that is accepted and
            # resets the maximum (presumably an id-counter restart).
            if data_id_max - 101 < data_id <= data_id_max:
                continue
            data_id_max = data_id

            barrage_list.append(_huya_parse_barrage(label, data_id))

        mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        # Compute the remaining time once: reading the clock twice could make
        # the sleep argument negative, and time.sleep() then raises ValueError.
        remaining = poll_interval - (time.time() - start_time)
        if remaining > 0:
            time.sleep(remaining)

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
              ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
              round(total_time), "/", total_num, ")")
# Example #2
# 0
def _douyu_parse_barrage(label):
    """Build the row dict for one ``<li>`` item of the Douyu barrage list.

    The type comes from the class names of the item's first-level ``<div>``;
    the remaining fields come from its direct ``<span>`` children. Missing
    nodes or attributes leave the corresponding field at its default.
    """
    barrage_info = {
        "type": "",  # barrage type
        "user_name": "",  # name of the sender
        "user_level": 0,  # level of the sender
        "content": "",  # barrage text
        "text": ""  # any other information
    }

    wrapper = label.select_one("li > div")
    # Match on the string form of the class list. The original tested the
    # list itself, so only exact class tokens matched; the first branch
    # ("Barrage-notice" without "normalBarrage") only makes sense as a
    # substring test on compound class names (TODO confirm on the live page).
    type_class = str(wrapper.get("class", "")) if wrapper is not None else ""
    if "Barrage-notice" in type_class and "normalBarrage" not in type_class:
        barrage_info["type"] = "NOTICE"
    elif "normalBarrage" in type_class:
        barrage_info["type"] = "NORMAL"
    elif "Barrage-userEnter" in type_class:
        barrage_info["type"] = "ENTER"
    elif "Barrage-message" in type_class:
        barrage_info["type"] = "MESSAGE"

    for info_label in label.select("li > div > span"):
        # Spans without a class attribute are ignored rather than raising.
        info_label_class = info_label.get("class", [])
        if "UserLevel" in info_label_class:
            # Keep the default level when the title is absent or has no
            # digits; store an int to match the integer column.
            match = re.search(r"[0-9]+", info_label.get("title", ""))
            if match:
                barrage_info["user_level"] = int(match.group())
        elif "Barrage-nickName" in info_label_class:
            barrage_info["user_name"] = info_label.text.replace(" ", "")
        elif "Barrage-content" in info_label_class:
            barrage_info["content"] = info_label.text.replace(" ", "")
        elif "Barrage-text" in info_label_class:
            barrage_info["text"] = info_label.text.replace(" ", "")

    return barrage_info


def crawler(live_name, live_url, mysql):
    """Poll a Douyu live room's barrage list and store new items in MySQL.

    Opens *live_url* in a ``Chrome`` browser object, waits 10 s, creates a
    table named ``douyu_<YYYYmmdd_HHMM>`` in the ``live_barrage`` schema, then
    polls the element with id ``js-barrage-list`` every 0.5 s for 72000
    iterations. A batch is inserted only when it holds fewer than 200 new
    items; larger batches are counted but not stored.

    Args:
        live_name: Name placed in the table comment.
        live_url: URL of the live room to open.
        mysql: Object exposing ``create(sql)`` and ``insert(table, rows)``.
    """
    from collections import deque  # function-scope: not a file-level import

    poll_interval = 0.5  # seconds between two polls
    poll_count = int(36000 / poll_interval)
    batch_limit = 200  # batches of this size or more are not inserted

    browser = Chrome(cache_path=r"E:\Temp")  # open the Chrome browser
    browser.get(live_url)  # open the target Douyu live room
    time.sleep(10)

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "douyu_{}".format(time_string)

    sql_create = (
        "CREATE TABLE live_barrage.`douyu_{}` ("
        "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT '弹幕ID(barrage id)',"
        "`type` varchar(60) DEFAULT NULL COMMENT '弹幕类型',"
        "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '弹幕抓取时间(约等于弹幕发布时间)',"
        " `user_name` varchar(40) DEFAULT NULL COMMENT '弹幕发布者名称',"
        " `user_level` int(11) DEFAULT NULL COMMENT '弹幕发布者等级',"
        " `content` varchar(100) DEFAULT NULL COMMENT '弹幕内容',"
        " `text` varchar(100) DEFAULT NULL COMMENT '弹幕其他信息',"
        " PRIMARY KEY (`bid`)"
        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='斗鱼弹幕({})';"
    )
    mysql.create(sql_create.format(time_string, live_name))

    print("开始抓取斗鱼直播弹幕.....")

    total_time = 0  # accumulated processing time in milliseconds
    total_num = 0  # number of polls performed

    # Ids of the 200 most recently seen items; the oldest id is dropped
    # automatically once the window is full.
    barrage_id_list = deque(maxlen=200)

    data_id_max = 0  # running count of new items seen in completed polls
    for _ in range(poll_count):

        start_time = time.time()

        label_html = browser.find_element_by_id(
            "js-barrage-list").get_attribute("innerHTML")
        soup = BeautifulSoup(label_html, 'lxml')  # parse the list markup

        barrage_list = []
        for label in soup.select("li"):
            # Items without an id cannot be de-duplicated; skip them instead
            # of aborting the whole crawl with a KeyError.
            raw_id = label.get("id")
            if raw_id is None:
                continue
            bid = str(raw_id)  # barrage id

            if bid in barrage_id_list:
                continue
            barrage_id_list.append(bid)

            barrage_list.append(_douyu_parse_barrage(label))

        if len(barrage_list) < batch_limit:

            mysql.insert(table_name, barrage_list)

            total_num += 1
            total_time += 1000 * (time.time() - start_time)

            print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
                  ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
                  round(total_time), "/", total_num, ")")

        else:

            total_num += 1
            total_time += 1000 * (time.time() - start_time)

            print("本次时间范围内弹幕列表未自动向下滚动...")

        # Compute the remaining time once: reading the clock twice could make
        # the sleep argument negative, and time.sleep() then raises ValueError.
        remaining = poll_interval - (time.time() - start_time)
        if remaining > 0:
            time.sleep(remaining)

        # Updated after printing, so the printed total excludes this batch.
        data_id_max += len(barrage_list)