Example #1
    def __init__(self, interval, live_name, live_url, mysql):
        super().__init__(interval)
        self.browser = Chrome(cache_path=r"E:\Temp")  # launch the Chrome browser
        self.browser.get(live_url)  # open the target Douyu streamer's live room
        time.sleep(10)

        self.mysql = mysql

        time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
        self.table_name = "douyu_{}".format(time_string)

        sql_create = "CREATE TABLE live_barrage.`douyu_{}` (" \
                     "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT 'barrage ID'," \
                     "`type` varchar(60) DEFAULT NULL COMMENT 'barrage type'," \
                     "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'fetch time (approximately the post time)'," \
                     "`user_name` varchar(40) DEFAULT NULL COMMENT 'sender name'," \
                     "`user_level` int(11) DEFAULT NULL COMMENT 'sender level'," \
                     "`content` varchar(100) DEFAULT NULL COMMENT 'barrage content'," \
                     "`text` varchar(100) DEFAULT NULL COMMENT 'other barrage info'," \
                     "PRIMARY KEY (`bid`)" \
                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Douyu barrage ({})';"
        mysql.create(sql_create.format(time_string, live_name))

        print("Starting to crawl Douyu live barrage...")

        self.total_time = 0
        self.total_num = 0

        self.barrage_id_list = list()

        self.data_id_max = 0
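
Examples #1 and #2 both call super().__init__(interval), which implies a timed-task base class that calls a running() hook once every `interval` seconds. That base class is not included in these snippets; the sketch below is only a guess at its shape (the name TimedTask and every detail of it are assumptions, not the original code):

import time


class TimedTask:
    """Hypothetical base class implied by super().__init__(interval)."""

    def __init__(self, interval):
        self.interval = interval  # seconds between two invocations of running()

    def running(self):
        raise NotImplementedError  # subclasses fetch one batch of barrages here

    def start(self):
        while True:
            start_time = time.time()
            self.running()
            elapsed = time.time() - start_time
            if elapsed < self.interval:  # sleep off the remainder of the interval
                time.sleep(self.interval - elapsed)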
Example #2
    def __init__(self, interval, live_name, live_url, mysql):
        super().__init__(interval)
        self.browser = Chrome(cache_path=r"E:\temp")
        self.browser.get(live_url)  # open the target Huya streamer's live room

        self.mysql = mysql

        time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
        self.table_name = "huya_{}".format(time_string)

        sql_create = "CREATE TABLE live_barrage.`huya_{}` (" \
                     "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT 'barrage ID'," \
                     "`type` char(10) DEFAULT NULL COMMENT 'barrage type'," \
                     "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'fetch time (approximately the post time)'," \
                     "`user_name` varchar(40) DEFAULT NULL COMMENT 'sender name'," \
                     "`user_noble` int(11) DEFAULT NULL COMMENT 'sender noble level'," \
                     "`content` varchar(100) DEFAULT NULL COMMENT 'barrage content'," \
                     "`gift_name` varchar(40) DEFAULT NULL COMMENT 'gift name'," \
                     "`gift_num` int(11) DEFAULT '0' COMMENT 'gift count'," \
                     "`other` varchar(60) DEFAULT NULL COMMENT 'other barrage info'," \
                     "PRIMARY KEY (`bid`)" \
                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Huya barrage ({})';"
        self.mysql.create(sql_create.format(time_string, live_name))

        print("Starting to crawl Huya live barrage...")

        self.total_time = 0
        self.total_num = 0

        self.data_id_max = 0
Example #3
    def running(self):
        driver = Chrome(cache_path=r"E:\temp")
        driver.get("https://cc.julive.com/project/s")

        bs = BeautifulSoup(driver.page_source, 'lxml')  # parse the page into a BeautifulSoup tree

        city_dict = dict()
        for element_city in bs.select(
                "body > div.container-5-2.container > div.header-v5.header-v5-2.header-normal > div > div.inn-p > div.city-position.city-tip > div.city-change-list-new > ul > li > ul > li> a"
        ):
            city_name = element_city.text
            city_url = element_city["href"]
            city_dict[city_name] = city_url

        return city_dict
Example #4
def crawler():
    browser = Chrome(cache_path=r"E:\temp")
    browser.get("https://cc.julive.com/project/s")

    bs = BeautifulSoup(browser.page_source, 'lxml')  # parse the page into a BeautifulSoup tree

    city_dict = dict()
    for element_city in bs.select(
            "body > div.container-5-2.container > div.header-v5.header-v5-2.header-normal > div > div.inn-p > div.city-position.city-tip > div.city-change-list-new > ul > li > ul > li> a"
    ):
        city_name = element_city.text
        city_url = element_city["href"]
        city_dict[city_name] = city_url
        print(city_name, city_url)

    Utils.io.write_json("julive_city_url_20191217.json", city_dict)
Example #5
def crawler(live_list_path):
    driver = Chrome(cache_path=r"E:\Temp")

    account_list = tool.io.load_string(live_list_path)

    spider = SpiderDouyuSubscribe(driver)

    for account_url in account_list.split("\n"):
        text_subscribe = spider.running(account_url)
        print(account_url, text_subscribe)
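
SpiderDouyuSubscribe is not defined in this snippet. Example #11 below does the same job inline (polling a Douyu room page for its subscriber count), so a plausible sketch of the class, reusing Example #11's XPath and retry loop, might look like the following (the class body is an assumption, not the original source):

import time

from selenium.common.exceptions import NoSuchElementException


class SpiderDouyuSubscribe:
    """Hypothetical wrapper around the inline logic shown in Example #11."""

    def __init__(self, driver):
        self.driver = driver

    def running(self, account_url):
        self.driver.get(account_url)
        time.sleep(3)
        for _ in range(10):  # retry while the page is still rendering
            try:
                label = self.driver.find_element_by_xpath(
                    '//*[@id="js-player-title"]/div/div[4]/div/span')
                if label.text:
                    return label.text
                time.sleep(1)
            except NoSuchElementException:
                time.sleep(1)
        return ""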
Example #6
    def running(self):
        driver = Chrome(cache_path=r"E:\temp")  # browser cache directory
        driver.get("http://piaofang.maoyan.com/dashboard/web-heat")
        time.sleep(1)

        res = []

        for movie_label in driver.find_elements_by_css_selector(
                "#app > div > div > div.dashboard-content > div.dashboard-list.dashboard-left.bg > div.movielist-container > div > table > tbody > tr"
        ):
            res.append([
                movie_label.find_element_by_class_name("moviename-index").text,
                movie_label.find_element_by_class_name("moviename-name").text,
                movie_label.find_element_by_class_name("moviename-info").text,
                movie_label.find_element_by_class_name("heat-text").text,
                movie_label.find_element_by_class_name("last-col").text
            ])

        return res
Example #7
def crawler():
    driver = Chrome(cache_path=r"E:\Temp")

    # collect the list of city codes
    spider_city_code = SpiderCityCode(driver)
    result1 = spider_city_code.run()
    Utils.io.write_json("anjuke_city_code.json", result1)

    # collect the number of housing listings for each city
    city_code_list = Utils.io.load_json("anjuke_city_code.json")
    city_info_list = Utils.io.load_json("anjuke_city_info.json", default={})
    spider_city_info = SpiderCityInfo(driver)
    for city_name, city_code in city_code_list.items():
        if city_name not in city_info_list:
            city_info_list[city_name] = spider_city_info.run(city_code=city_code)
            Utils.io.write_json("anjuke_city_info.json", city_info_list)
            time.sleep(2)

    driver.quit()
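
Utils.io.write_json and Utils.io.load_json are project helpers rather than a published library. Judging from the call sites above (load_json takes a default= value returned when no result file exists yet), a minimal sketch could be (assumed implementation):

import json
import os


def load_json(path, default=None):
    """Hypothetical sketch of Utils.io.load_json: return `default` if the file is missing."""
    if not os.path.exists(path):
        return default
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def write_json(path, data):
    """Hypothetical sketch of Utils.io.write_json."""
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)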
Example #8
def crawler():
    browser = Chrome(cache_path=r"E:\temp")

    account_list = Utils.io.load_string("huya_account_list.txt")
    for account_url in account_list.split("\n"):
        browser.get(account_url)

        # read the live room's subscriber count
        text_subscribe = ""
        try:
            label_subscribe = browser.find_element_by_xpath(
                '//*[@id="activityCount"]')
            if label_subscribe is not None:
                text_subscribe = label_subscribe.text
        except NoSuchElementException:
            pass

        # read the live room ID
        text_id = ""
        try:
            label_id = browser.find_element_by_css_selector(
                '#J_roomHeader > div.room-hd-l > div.host-info > div.host-detail.J_roomHdDetail > span.host-rid'
            )
            if label_id is not None:
                text_id = label_id.text
        except NoSuchElementException:
            pass

        print(account_url, text_id, text_subscribe)

        time.sleep(3)
Example #9
def crawler():
    driver = Chrome(cache_path=r"E:\temp")

    account_list = tool.io.load_string("huya_account_list.txt")

    spider = SpiderHuyaSubscribe(driver)

    for account_url in account_list.split("\n"):
        text_id, text_subscribe = spider.running(account_url)

        print(account_url, text_id, text_subscribe)

        time.sleep(3)
Example #10
def crawler(file_name):
    """
    LeetCode题目列表爬虫

    :param file_name: 抓取结果存储文件地址
    """
    selenium = Chrome(cache_path=r"E:\Temp")  # 启动Chrome浏览器驱动
    selenium.get(PROBLEMS_SET_URL)  # 打开题库页面

    # 获取题目列表(Json格式)
    problems_all_json = requests.get(PROBLEMS_ALL_API).json()
    print("解析题目总数:", problems_all_json["num_total"])

    # 解析题目列表(生成problem实例列表):key=题目ID,value=题目的problem实例
    result_problems = {}
    for problem in problems_all_json["stat_status_pairs"]:
        problem_elem = Problem(problem)
        result_problems[problem_elem.id] = problem_elem

    # fetch the problem tags (JSON) and merge them into the problem list
    problems_tags_json = requests.get(PROBLEMS_TAGS_API).json()
    for topic in problems_tags_json["topics"]:
        tag_name = topic["translatedName"] if topic[
            "translatedName"] else topic["name"]
        for qid in topic["questions"]:
            if qid in result_problems:
                result_problems[qid].add_tag(tag_name)
            else:
                print("题目ID未找到:", qid, tag_name)

    # fetch the problem title translations (JSON) and merge them into the problem list
    translations_json = selenium.post(GRAPHQL_API,
                                      json.dumps(GRAPHQL_QUERY_TRANSLATIONS),
                                      payload=True)
    for problem in translations_json["data"]["translations"]:
        if (qid := int(problem["questionId"])) in result_problems:
            result_problems[qid].title = problem["title"]
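
The Problem class is not included in this snippet. Its usage above implies a constructor taking one entry of stat_status_pairs plus id, title, and add_tag members; a minimal sketch consistent with that usage (the field names inside "stat" are assumptions about the LeetCode API) might be:

class Problem:
    """Hypothetical sketch of the Problem class implied by its usage in Example #10."""

    def __init__(self, stat_status_pair):
        stat = stat_status_pair["stat"]
        self.id = stat["question_id"]  # assumed API field name
        self.title = stat["question__title"]  # assumed API field name
        self.tags = []

    def add_tag(self, tag_name):
        self.tags.append(tag_name)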
Example #11
def crawler(live_list_path):
    driver = Chrome(cache_path=r"E:\Temp")

    account_list = Utils.io.load_string(live_list_path)

    for account_url in account_list.split("\n"):
        driver.get(account_url)

        time.sleep(3)

        text_subscribe = ""

        for _ in range(10):
            try:
                label_subscribe = driver.find_element_by_xpath(
                    '//*[@id="js-player-title"]/div/div[4]/div/span')
                if label_subscribe.text is not None and label_subscribe.text != "":
                    text_subscribe = label_subscribe.text
                    break
                time.sleep(1)
            except NoSuchElementException:
                time.sleep(1)

        print(account_url, text_subscribe)
Example #12
def crawler(live_name, live_url, mysql):
    driver = Chrome(cache_path=r"E:\Temp")  # launch the Chrome browser

    spider_bilibili_barrage = SpiderBilibiliBarrage(driver=driver,
                                                    live_url=live_url)

    # create the target data table
    table_name = "bilibili_{}".format(
        time.strftime("%Y%m%d_%H%M", time.localtime(time.time())))
    sql_create = "CREATE TABLE live_barrage.`{}` (" \
                 "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT 'barrage ID'," \
                 "`type` varchar(60) DEFAULT NULL COMMENT 'barrage type'," \
                 "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'fetch time (approximately the post time)'," \
                 "`user_name` varchar(40) DEFAULT NULL COMMENT 'sender name'," \
                 "`user_id` int(11) DEFAULT NULL COMMENT 'sender ID'," \
                 "`content` varchar(100) DEFAULT NULL COMMENT 'barrage content'," \
                 "PRIMARY KEY (`bid`)" \
                 ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Bilibili barrage ({})';"
    mysql.create(sql_create.format(table_name, live_name))

    print("Starting to crawl Bilibili live barrage...")

    total_time = 0
    total_num = 0

    barrage_num = 0

    for num in range(36000):
        start_time = time.time()
        barrage_list = spider_bilibili_barrage.running()
        mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        wait_time = 0.5
        if wait_time > (time.time() - start_time):
            time.sleep(wait_time - (time.time() - start_time))

        barrage_num += len(barrage_list)

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", barrage_num,
              ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
              round(total_time), "/", total_num, ")")
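
SpiderBilibiliBarrage.running() is not shown here. Examples #16 and #19 suggest the pattern: read the chat list element's innerHTML, parse it with BeautifulSoup, and return one dict per barrage. A sketch along those lines (the element id, row selector, and data attributes are unverified assumptions about the Bilibili page):

from bs4 import BeautifulSoup


class SpiderBilibiliBarrage:
    """Hypothetical sketch following the pattern of Examples #16 and #19."""

    def __init__(self, driver, live_url):
        self.driver = driver
        self.driver.get(live_url)

    def running(self):
        label_html = self.driver.find_element_by_id(
            "chat-items").get_attribute("innerHTML")  # assumed element id
        bs = BeautifulSoup(label_html, "lxml")
        barrage_list = []
        for label in bs.select("div.danmaku-item"):  # assumed row selector
            barrage_list.append({
                "type": "NORMAL",
                "user_name": label.get("data-uname", ""),  # assumed attribute
                "user_id": int(label.get("data-uid", 0) or 0),  # assumed attribute
                "content": label.text,
            })
        return barrage_list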
Example #13
from typing import Dict, List

import crawlertool as tool
from Selenium4R import Chrome


class SpiderAnjukeCityCodeList(tool.abc.SingleSpider):
    """
    安居客城市编码列表爬虫
    """
    def __init__(self, driver):
        self.driver = driver

    def running(self) -> List[Dict]:
        self.driver.get("https://www.anjuke.com/sy-city.html")

        result = []
        for city_label in self.driver.find_elements_by_css_selector(
                "body > div.content > div > div.letter_city > ul > li > div > a"
        ):
            city_name = city_label.text
            city_code = city_label.get_attribute("href").replace(
                "https://", "").replace(".anjuke.com/", "")
            result.append({"city_name": city_name, "city_code": city_code})

        return result


if __name__ == "__main__":
    driver = Chrome(cache_path=r"E:\Temp")
    print(SpiderAnjukeCityCodeList(driver).running())
    driver.quit()
                tweet_info["text"] = tweet_content
                tweet_info["replies"] = tweet_replies
                tweet_info["retweets"] = tweet_retweets
                tweet_info["likes"] = tweet_likes
                tweet_list.append(tweet_info)

        # scroll down to the last tweet
        if last_label_tweet is not None:
            driver.execute_script("arguments[0].scrollIntoView();", last_label_tweet)  # scroll the tweet element into view
            time.sleep(1)
        else:
            break

    return tweet_list


if __name__ == "__main__":
    selenium = Chrome(cache_path=r"E:\temp")

    tweet_template = {
        "tweet_id": None,
        "is_retweet": 0,
        "time": None,
        "text": None,
        "replies": None,
        "retweets": None,
        "likes": None
    }
    tweets = crawler(selenium, "realDonaldTrump", tweet_template,
                     since=dt.date(2020, 9, 10), until=dt.date(2020, 9, 11))  # [since, until) half-open interval
Example #15
"""
猫眼网播热度采集

需要第三方模块:
Selenium4R >= 0.0.3

@author: ChangXing
@version: 1.0
@create: 2020.05.26
@revise: -
"""

import time

from Selenium4R import Chrome

if __name__ == "__main__":
    browser = Chrome(cache_path=r"E:\temp")  # browser cache directory
    browser.get("http://piaofang.maoyan.com/dashboard/web-heat")
    time.sleep(1)
    for movie_label in browser.find_elements_by_css_selector(
            "#app > div > div > div.dashboard-content > div.dashboard-list.dashboard-left.bg > div.movielist-container > div > table > tbody > tr"):
        print("排名:", movie_label.find_element_by_class_name("moviename-index").text)
        print("名称:", movie_label.find_element_by_class_name("moviename-name").text)
        print("信息:", movie_label.find_element_by_class_name("moviename-info").text)
        print("信息:", movie_label.find_element_by_class_name("heat-text").text)
        print("信息:", movie_label.find_element_by_class_name("last-col").text)
Example #16
def crawler(live_name, live_url, mysql):
    browser = Chrome(cache_path=r"E:\temp")
    browser.get(live_url)  # open the target Huya streamer's live room

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "huya_{}".format(time_string)

    sql_create = "CREATE TABLE live_barrage.`huya_{}` (" \
                 "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT 'barrage ID'," \
                 "`type` char(10) DEFAULT NULL COMMENT 'barrage type'," \
                 "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'fetch time (approximately the post time)'," \
                 "`user_name` varchar(40) DEFAULT NULL COMMENT 'sender name'," \
                 "`user_noble` int(11) DEFAULT NULL COMMENT 'sender noble level'," \
                 "`content` varchar(100) DEFAULT NULL COMMENT 'barrage content'," \
                 "`gift_name` varchar(40) DEFAULT NULL COMMENT 'gift name'," \
                 "`gift_num` int(11) DEFAULT '0' COMMENT 'gift count'," \
                 "`other` varchar(60) DEFAULT NULL COMMENT 'other barrage info'," \
                 "PRIMARY KEY (`bid`)" \
                 ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Huya barrage ({})';"
    mysql.create(sql_create.format(time_string, live_name))

    print("Starting to crawl Huya live barrage...")

    total_time = 0
    total_num = 0

    data_id_max = 0
    for num in range(int(36000 / 0.5)):

        start_time = time.time()

        label_html = browser.find_element_by_id(
            "chat-room__list").get_attribute("innerHTML")
        bs = BeautifulSoup(label_html, 'lxml')  # parse the HTML into a BeautifulSoup object

        barrage_list = []
        for label in bs.select("li"):
            data_id = int(label["data-id"])  # 提取:弹幕ID

            if data_id <= data_id_max:  # use the barrage ID to decide whether it was already crawled
                if data_id > data_id_max - 101:
                    continue
            data_id_max = data_id

            barrage_info = {
                "bid": data_id,  # barrage ID
                "type": "",  # barrage type
                "user_name": "",  # sender name
                "user_noble": 0,  # sender noble level
                "content": "",  # barrage content
                "gift_name": "",  # gift name
                "gift_num": 0,  # gift count
                "other": ""  # other info
            }

            category = str(label.select_one("li > div")["class"])  # extract: barrage type
            if "msg-smog" in category:  # handle smog barrages (ordinary barrage)
                barrage_info["type"] = "SG"
                barrage_info["user_name"] = label.select_one(
                    "li > div > span:nth-child(1)").text
                barrage_info["content"] = label.select_one(
                    "li > div > span:nth-child(3)").text
            elif "msg-normal" in category:  # 处理普通类型弹幕(普通弹幕)
                barrage_info["type"] = "NM"
                barrage_info["user_name"] = label.select_one(
                    "li > div > span:nth-child(2)").text
                barrage_info["content"] = label.select_one(
                    "li > div > span:nth-child(5)").text
            elif "msg-nobleEnter" in category:  # 处理nobleEnter类型弹幕(贵族进入弹幕)
                barrage_info["type"] = "NE"
                barrage_info["user_name"] = label.select_one(
                    "li > div > div > p > span:nth-child(1)").text
                barrage_info["user_noble"] = label.select_one(
                    "li > div > div")["class"]
                barrage_info["content"] = "驾临直播间"
            elif "msg-nobleSpeak" in category:  # 处理nobleSpeak类型弹幕(贵族发言)
                barrage_info["type"] = "NS"
                barrage_info["user_name"] = label.select_one(
                    "li > div > p > span:nth-child(2)").text
                barrage_info["user_noble"] = int(
                    label.select_one("li > div")["class"])
                barrage_info["content"] = label.select_one(
                    "li > div > p > span:nth-child(5)").text
            elif "tit-h-send" in category:  # 处理send类型提示(礼物赠送提示)
                barrage_info["type"] = "SD"
                barrage_info["user_name"] = label.select_one(
                    "li > div > span:nth-child(1)").text
                barrage_info["gift_name"] = label.select_one(
                    "li > div > span:nth-child(3) > img")["alt"]
                barrage_info["gift_num"] = int(
                    label.select_one(
                        "li > div > span:nth-child(4) > img").text)
            elif "msg-onTVLottery" in category:
                barrage_info["type"] = "TV"
                barrage_info["user_name"] = label.select_one(
                    "li > div > span:nth-child(2)").text
                barrage_info["content"] = label.select_one(
                    "li > div > div > span").text
            elif "msg-auditorSys" in category:  # 处理msg-auditorSys类型提示(系统提示)
                barrage_info["type"] = "AS"
                barrage_info["other"] = label.text
            elif "msg-sys" in category:  # 处理msg-sys类型提示(系统提示)
                barrage_info["type"] = "SY"
                barrage_info["other"] = label.text
            else:  # handle any other type
                barrage_info.update(type="OT", other="barrage class: " + category)
            barrage_list.append(barrage_info)

        mysql.insert(table_name, barrage_list)

        total_num += 1
        total_time += 1000 * (time.time() - start_time)

        wait_time = 0.5
        if wait_time > (time.time() - start_time):
            time.sleep(wait_time - (time.time() - start_time))

        print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
              ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
              round(total_time), "/", total_num, ")")
Example #17
            break

    return tweet_list


if __name__ == "__main__":
    setting = Utils.io.load_json(r"E:\Github\ChangxingJiang\setting.json")

    mysql = Utils.db.MySQL(
        host=setting["Huabang"]["host"],
        user=setting["Huabang"]["user"],
        password=setting["Huabang"]["password"],
        database=setting["Huabang"]["database"]
    )

    selenium = Chrome(cache_path=r"E:\temp")  # launch the Selenium-controlled Chrome browser

    if "Huabang" in env.DATA and "Media List" in env.DATA["Huabang"]:
        for media_item in env.DATA["Huabang"]["Media List"]:
            # if media_item[0] != 211:
            #     continue
            print("开始抓取媒体:", media_item[1], "(", media_item[0], ")", "-", media_item[3], "(", media_item[2], ")")
            tweet_template = {
                "media_id": media_item[0],
                "media_name": media_item[1],
                "tweet_id": None,
                "is_retweet": 0,
                "time": None,
                "text": None,
                "replies": None,
                "retweets": None,
Example #18
                tweet_info = copy.deepcopy(template)
                tweet_info["tweet_id"] = tweet_id
                tweet_info["time"] = tweet_time
                tweet_info["text"] = tweet_content
                tweet_info["replies"] = tweet_replies
                tweet_info["retweets"] = tweet_retweets
                tweet_info["likes"] = tweet_likes
                tweet_info["from_user"] = tweet_from_user
                tweet_info["from_content"] = tweet_from_content
                tweet_list.append(tweet_info)

        # scroll down to the last tweet
        if last_label_tweet is not None:
            driver.execute_script("arguments[0].scrollIntoView();",
                                  last_label_tweet)  # scroll the tweet element into view
            time.sleep(1)
        else:
            break

    return tweet_list


if __name__ == "__main__":
    selenium = Chrome()  # launch the Selenium-controlled Chrome browser
    tweets = crawler(selenium,
                     "appledaily_hk", {},
                     since=dt.date(2020, 7, 20),
                     until=dt.date(2020, 7, 24))
    print(tweets)
Example #19
def crawler(live_name, live_url, mysql):
    browser = Chrome(cache_path=r"E:\Temp")  # launch the Chrome browser
    browser.get(live_url)  # open the target Douyu streamer's live room
    time.sleep(10)

    time_string = time.strftime("%Y%m%d_%H%M", time.localtime(time.time()))
    table_name = "douyu_{}".format(time_string)

    sql_create = "CREATE TABLE live_barrage.`douyu_{}` (" \
                 "`bid` int(11) NOT NULL AUTO_INCREMENT COMMENT 'barrage ID'," \
                 "`type` varchar(60) DEFAULT NULL COMMENT 'barrage type'," \
                 "`fetch_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'fetch time (approximately the post time)'," \
                 "`user_name` varchar(40) DEFAULT NULL COMMENT 'sender name'," \
                 "`user_level` int(11) DEFAULT NULL COMMENT 'sender level'," \
                 "`content` varchar(100) DEFAULT NULL COMMENT 'barrage content'," \
                 "`text` varchar(100) DEFAULT NULL COMMENT 'other barrage info'," \
                 "PRIMARY KEY (`bid`)" \
                 ") ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Douyu barrage ({})';"
    mysql.create(sql_create.format(time_string, live_name))

    print("Starting to crawl Douyu live barrage...")

    total_time = 0
    total_num = 0
    # screenshot = 0

    barrage_id_list = list()

    data_id_max = 0
    for num in range(int(36000 / 0.5)):

        start_time = time.time()

        label_html = browser.find_element_by_id(
            "js-barrage-list").get_attribute("innerHTML")
        soup = BeautifulSoup(label_html, 'lxml')  # parse the HTML into a BeautifulSoup object

        barrage_list = []
        for label in soup.select("li"):

            bid = str(label["id"])  # 提取:弹幕ID

            if bid in barrage_id_list:
                continue
            barrage_id_list.append(bid)

            if len(barrage_id_list) > 200:
                barrage_id_list.remove(barrage_id_list[0])

            barrage_info = {
                "type": "",  # barrage type
                "user_name": "",  # sender name
                "user_level": 0,  # sender level
                "content": "",  # barrage content
                "text": ""  # other info
            }

            type_class = label.select_one("li > div")["class"]
            if "Barrage-notice" in type_class and "normalBarrage" not in type_class:
                barrage_info["type"] = "NOTICE"
            elif "normalBarrage" in type_class:
                barrage_info["type"] = "NORMAL"
            elif "Barrage-userEnter" in type_class:
                barrage_info["type"] = "ENTER"
            elif "Barrage-message" in type_class:
                barrage_info["type"] = "MESSAGE"

            for info_label in label.select("li > div > span"):
                info_label_class = info_label["class"]
                if "UserLevel" in info_label_class:
                    barrage_info["user_level"] = re.search(
                        "[0-9]+", info_label["title"]).group()
                elif "Barrage-nickName" in info_label_class:
                    barrage_info["user_name"] = info_label.text.replace(
                        " ", "")
                elif "Barrage-content" in info_label_class:
                    barrage_info["content"] = info_label.text.replace(" ", "")
                elif "Barrage-text" in info_label_class:
                    barrage_info["text"] = info_label.text.replace(" ", "")

            barrage_list.append(barrage_info)

        if len(barrage_list) < 200:

            mysql.insert(table_name, barrage_list)

            total_num += 1
            total_time += 1000 * (time.time() - start_time)

            print("本次时间范围内新增弹幕:", len(barrage_list), "条,", "(共计:", data_id_max,
                  ")", "|", "运行时间:", round(total_time / total_num), "毫秒", "(",
                  round(total_time), "/", total_num, ")")

        else:

            total_num += 1
            total_time += 1000 * (time.time() - start_time)

            print("本次时间范围内弹幕列表未自动向下滚动...")

        wait_time = 0.5
        if wait_time > (time.time() - start_time):
            time.sleep(wait_time - (time.time() - start_time))

        data_id_max += len(barrage_list)
Example #20
                                        "[0-9]+", feedback_item):
                                    item["retweets"] = int(pattern.group())
                            if "喜欢" in feedback_item:
                                if pattern := re.search(
                                        "[0-9]+", feedback_item):
                                    item["likes"] = int(pattern.group())

                item_list.append(item)

            # scroll down to the last tweet
            if last_label_tweet is not None:
                self.driver.execute_script("arguments[0].scrollIntoView();",
                                           last_label_tweet)  # scroll the tweet element into view
                self.console("Performed one downward scroll...")
                time.sleep(3)
            else:
                break

        return item_list


# ------------------- 单元测试 -------------------
if __name__ == "__main__":
    driver = Chrome(cache_path=r"E:\Temp")
    print(
        SpiderTwitterAccountPost(driver).running(
            user_name=SpiderTwitterAccountPost.get_twitter_user_name(
                "https://twitter.com/zaobaosg"),
            since_date=dt.date(2020, 10, 1),
            until_date=dt.date(2020, 10, 7)))
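
SpiderTwitterAccountPost.get_twitter_user_name is not part of this fragment. Given the call site, it presumably just strips the screen name from a profile URL; a minimal sketch (assumed implementation):

def get_twitter_user_name(page_url):
    """Hypothetical sketch: extract the screen name from a Twitter profile URL."""
    return page_url.rstrip("/").split("/")[-1]  # "https://twitter.com/zaobaosg" -> "zaobaosg"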