Example #1
# Depends on requests, BeautifulSoup (bs4) and time, plus the project's tool/env helpers;
# race_list_url and race_list_headers are module-level constants defined elsewhere.
def crawler():
    data_date = tool.file.load_as_json(env.PATH["WanPlus"]["Date File"])  # load the date -> race table
    data_race = tool.file.load_as_json(env.PATH["WanPlus"]["Race File"])  # load the race -> match table

    # Collect the IDs of the races that still need to be crawled
    need_race_id_list = list()
    for date_name, date_race_list in data_date.items():
        for race_item in date_race_list:
            if race_item["race_id"] not in data_race:
                need_race_id_list.append(race_item["race_id"])
    print("Number of races to crawl:", len(need_race_id_list))

    # Crawl the data of the required races
    for i, need_race_id in enumerate(need_race_id_list):
        need_race_id = str(need_race_id)
        print("Crawling race:", i + 1, "/", len(need_race_id_list), "(", need_race_id, ")")
        match_id_list = list()  # list of match IDs in this race
        response = requests.get(race_list_url % need_race_id, headers=race_list_headers)
        bs = BeautifulSoup(response.content.decode(), 'lxml')
        game_labels = bs.select("body > div > div.content > div.left > div:nth-child(1) > div > a")
        for game_label in game_labels:
            if game_label.has_attr("data-matchid"):
                match_id_list.append(game_label["data-matchid"])
        data_race[need_race_id] = match_id_list
        tool.file.write_json(env.PATH["WanPlus"]["Race File"], data_race)  # persist the race -> match table
        time.sleep(tool.get_scope_random(5))
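
For orientation, a minimal sketch of the two JSON files this example reads and writes, as their structure can be inferred from the code above; the concrete IDs are hypothetical:

# Assumed shape of env.PATH["WanPlus"]["Date File"]: date label -> list of race entries.
data_date_example = {
    "2020-08-06": [
        {"race_id": 12345},  # hypothetical race ID
        {"race_id": 12346},
    ],
}
# Assumed shape of env.PATH["WanPlus"]["Race File"]: race ID -> list of match IDs,
# which is what the crawler above fills in and persists.
data_race_example = {
    "12345": ["67890", "67891"],  # hypothetical match IDs taken from the data-matchid attribute
}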
Example #2
# Depends on requests, os, json and time, plus the project's tool/env helpers;
# match_list_url, match_list_headers and match_list_referer are module-level constants defined elsewhere.
def crawler():
    data_race = tool.file.load_as_json(
        env.PATH["WanPlus"]["Race File"])  # load the race -> match table
    data_list_match = os.listdir(
        env.PATH["WanPlus"]["Match Path"])  # list the match info files already on disk

    # Collect the IDs of the matches that still need to be crawled
    need_match_id_list = dict()
    for race_id, match_id_list in data_race.items():
        for match_id in match_id_list:
            match_file_name = str(match_id) + ".json"
            if match_file_name not in data_list_match:
                need_match_id_list[match_id] = race_id
    print("Number of matches to crawl:", len(need_match_id_list))

    num = 1
    for match_id, race_id in need_match_id_list.items():
        print("Crawling match:", num, "/", len(need_match_id_list), "(", match_id, "-",
              race_id, ")")
        num += 1
        # Request the match data
        actual_url = match_list_url % match_id
        match_list_headers["referer"] = match_list_referer % race_id
        response = requests.get(actual_url, headers=match_list_headers)
        response_json = json.loads(response.content.decode())
        tool.file.write_json(
            os.path.join(env.PATH["WanPlus"]["Match Path"],
                         str(match_id) + ".json"), response_json)
        time.sleep(tool.get_scope_random(5))
Example #3
# Depends on twitter_scraper.Profile, Selenium and time, plus the project's tool helpers;
# XPATH_FOLLOWING_COUNT and XPATH_FOLLOWERS_COUNT are module-level XPath lists defined elsewhere.
def crawler(driver, user_name: str, template):
    """
    Crawl a Twitter user's profile information.
    Fills the name, username, birthday, biography, website, profile_photo, likes_count,
    tweets_count, followers_count and following_count fields of the data template.

    :param driver: <selenium.webdriver.chrome.webdriver.WebDriver> Chrome browser object
    :param user_name: <str> Twitter user name
    :param template: <dict> data template for the return value
    :return: <dict> data template filled with the crawled data
    """
    # Crawl the account info with the twitter-scraper package
    # (its followers/following counts may be wrong and are corrected below)
    try:
        profile = Profile(user_name).to_dict()
    except Exception:
        print("Account does not exist!")
        return

    print(profile)

    for key, value in profile.items():
        template[key] = value

    # Crawl the followers and following counts with Selenium
    driver.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    try:
        following_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[0]).text)
        followers_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[0]).text)
    except Exception:
        try:
            following_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[1]).text)
            followers_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[1]).text)
        except Exception:
            print("Failed to crawl the followers/following counts with Selenium!")
            return template

    # Correct the crawled counts based on the Selenium result
    if abs(template["following_count"] - following_count) > 1000:
        print("Correcting following count:", template["following_count"], "→", following_count)
        template["following_count"] = following_count
    if abs(template["followers_count"] - followers_count) > 1000:
        print("Correcting followers count:", template["followers_count"], "→", followers_count)
        template["followers_count"] = followers_count

    return template
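
A minimal usage sketch for this crawler, assuming a Chrome driver opened through the project's tool.open_chrome() helper (as in Example #4 below) and a template pre-filled with the fields named in the docstring; the account name is hypothetical:

# Minimal usage sketch; "example_user" is a hypothetical account name.
driver = tool.open_chrome()
template = {
    "name": None, "username": None, "birthday": None, "biography": None,
    "website": None, "profile_photo": None, "likes_count": None,
    "tweets_count": None, "followers_count": None, "following_count": None,
}
result = crawler(driver, "example_user", template)
print(result)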
Example #4

# Depends on datetime (imported as dt) and time, plus the project's tool/env helpers
# and the tweet crawler (with since/until parameters) defined elsewhere in the module.
if __name__ == "__main__":
    selenium = tool.open_chrome()  # open a Selenium-controlled Chrome browser
    mySQL = tool.mysql_connect("Huabang")  # create the MySQL database connection object

    if "Huabang" in env.DATA and "Media List" in env.DATA["Huabang"]:
        for media_item in env.DATA["Huabang"]["Media List"]:
            # if media_item[0] < 440:
            #     continue
            print("Crawling media outlet:", media_item[1], "(", media_item[0], ")", "-", media_item[3], "(", media_item[2], ")")
            tweet_template = {
                "media_id": media_item[0],
                "media_name": media_item[1],
                "tweet_id": None,
                "is_retweet": 0,
                "time": None,
                "text": None,
                "replies": None,
                "retweets": None,
                "likes": None
            }
            tweets = crawler(selenium, media_item[2], tweet_template,
                             since=dt.date(2020, 8, 6), until=dt.date(2020, 8, 8))
            print("Tweets crawled:", len(tweets))
            record_num = mySQL.insert("twitter_tweet_2008", tweets)
            print("Records written:", record_num)
            time.sleep(tool.get_scope_random(1))
    else:
        print("The ranking's media list does not exist")