import time

import requests
from bs4 import BeautifulSoup

import env
import tool


def crawler():
    data_date = tool.file.load_as_json(env.PATH["WanPlus"]["Date File"])  # load the date -> race table
    data_race = tool.file.load_as_json(env.PATH["WanPlus"]["Race File"])  # load the race -> match-list table

    # Collect the IDs of races that have not been crawled yet
    need_race_id_list = list()
    for date_name, date_race_list in data_date.items():
        for race_item in date_race_list:
            if race_item["race_id"] not in data_race:
                need_race_id_list.append(race_item["race_id"])
    print("Number of races to crawl:", len(need_race_id_list))

    # Crawl the match list of each pending race
    for i, need_race_id in enumerate(need_race_id_list):
        need_race_id = str(need_race_id)
        print("Crawling race:", i + 1, "/", len(need_race_id_list), "(", need_race_id, ")")
        match_id_list = list()  # list of match IDs belonging to this race
        response = requests.get(race_list_url % need_race_id, headers=race_list_headers)
        bs = BeautifulSoup(response.content.decode(), "lxml")
        game_labels = bs.select("body > div > div.content > div.left > div:nth-child(1) > div > a")
        for game_label in game_labels:
            if game_label.has_attr("data-matchid"):
                match_id_list.append(game_label["data-matchid"])
        data_race[need_race_id] = match_id_list
        tool.file.write_json(env.PATH["WanPlus"]["Race File"], data_race)  # store the race -> match-list table
        time.sleep(tool.get_scope_random(5))
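# The `tool` helpers used above come from the project's shared utility module,
# which is not shown here. A minimal sketch of what they presumably do, inferred
# from the call sites only (the implementations below are assumptions, not the
# project's actual code):

import json
import random


def get_scope_random(base):
    # Assumed behavior: a randomized delay of roughly `base` seconds,
    # used to throttle requests between crawls.
    return base + random.random() * base


def load_as_json(path):
    # Assumed behavior: read a UTF-8 JSON file into a Python object.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def write_json(path, data):
    # Assumed behavior: serialize `data` to a UTF-8 JSON file.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)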
import json
import os
import time

import requests

import env
import tool


def crawler():
    data_race = tool.file.load_as_json(env.PATH["WanPlus"]["Race File"])  # load the race -> match-list table
    data_list_match = os.listdir(env.PATH["WanPlus"]["Match Path"])  # list the match JSON files already on disk

    # Collect the matches that have not been crawled yet (match ID -> race ID)
    need_match_id_dict = dict()
    for race_id, match_id_list in data_race.items():
        for match_id in match_id_list:
            match_file_name = str(match_id) + ".json"
            if match_file_name not in data_list_match:
                need_match_id_dict[match_id] = race_id
    print("Number of matches to crawl:", len(need_match_id_dict))

    for num, (match_id, race_id) in enumerate(need_match_id_dict.items(), start=1):
        print("Crawling match:", num, "/", len(need_match_id_dict), "(", match_id, "-", race_id, ")")

        # Request the match detail and store the raw JSON response to disk
        actual_url = match_list_url % match_id
        match_list_headers["referer"] = match_list_referer % race_id
        response = requests.get(actual_url, headers=match_list_headers)
        response_json = json.loads(response.content.decode())
        tool.file.write_json(os.path.join(env.PATH["WanPlus"]["Match Path"], str(match_id) + ".json"), response_json)

        time.sleep(tool.get_scope_random(5))
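# The two crawler() stages above are assumed to live in separate modules
# (race_crawler.py and match_crawler.py are hypothetical names). A sketch of
# how the pipeline would chain them, so every race's match IDs are resolved
# before the per-match JSON is fetched:

import match_crawler
import race_crawler

if __name__ == "__main__":
    race_crawler.crawler()   # stage 1: race ID -> list of match IDs
    match_crawler.crawler()  # stage 2: one JSON file per match ID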
import time

from selenium.common.exceptions import NoSuchElementException
from twitter_scraper import Profile

import tool


def crawler(driver, user_name: str, template):
    """Crawl a Twitter user's profile information.

    Fills the name, username, birthday, biography, website, profile_photo,
    likes_count, tweets_count, followers_count and following_count fields
    of the data template.

    :param driver: <selenium.webdriver.chrome.webdriver.WebDriver> Chrome browser instance
    :param user_name: <str> Twitter username
    :param template: <dict> data template for the return value
    :return: <dict> the template filled with the crawled data, or None if the account does not exist
    """
    # Crawl the profile with the twitter-scraper package
    # (its followers/following counts may be wrong, hence the correction below)
    try:
        profile = Profile(user_name).to_dict()
    except Exception:
        print("Account does not exist!")
        return None
    print(profile)
    for key, value in profile.items():
        template[key] = value

    # Crawl the followers and following counts with Selenium
    driver.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    try:
        following_count = tool.fetch.number(driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[0]).text)
        followers_count = tool.fetch.number(driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[0]).text)
    except NoSuchElementException:
        try:
            following_count = tool.fetch.number(driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[1]).text)
            followers_count = tool.fetch.number(driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[1]).text)
        except NoSuchElementException:
            print("Selenium failed to fetch the followers/following counts!")
            return template

    # Correct the twitter-scraper counts with the Selenium results
    if abs(template["following_count"] - following_count) > 1000:
        print("Correcting following count:", template["following_count"], "→", following_count)
        template["following_count"] = following_count
    if abs(template["followers_count"] - followers_count) > 1000:
        print("Correcting followers count:", template["followers_count"], "→", followers_count)
        template["followers_count"] = followers_count
    return template
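# A sketch of how the profile crawler might be invoked. "nasa" is a placeholder
# username, the template keys mirror the attributes the docstring says get
# filled, and tool.open_chrome() is the project's assumed helper for building
# the Selenium driver (as used in the driver script below):

if __name__ == "__main__":
    driver = tool.open_chrome()
    profile_template = {
        "name": None, "username": None, "birthday": None, "biography": None,
        "website": None, "profile_photo": None, "likes_count": None,
        "tweets_count": None, "followers_count": None, "following_count": None
    }
    result = crawler(driver, "nasa", profile_template)
    if result is not None:
        print(result["followers_count"], result["following_count"])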
import datetime as dt
import time

import env
import tool


if __name__ == "__main__":
    selenium = tool.open_chrome()  # open the Chrome browser controlled by Selenium
    mySQL = tool.mysql_connect("Huabang")  # build the MySQL connection object
    if "Huabang" in env.DATA and "Media List" in env.DATA["Huabang"]:
        for media_item in env.DATA["Huabang"]["Media List"]:
            # if media_item[0] < 440:
            #     continue
            print("Crawling media:", media_item[1], "(", media_item[0], ")", "-", media_item[3], "(", media_item[2], ")")
            tweet_template = {
                "media_id": media_item[0],
                "media_name": media_item[1],
                "tweet_id": None,
                "is_retweet": 0,
                "time": None,
                "text": None,
                "replies": None,
                "retweets": None,
                "likes": None
            }
            tweets = crawler(selenium, media_item[2], tweet_template,
                             since=dt.date(2020, 8, 6), until=dt.date(2020, 8, 8))
            print("Total tweets crawled:", len(tweets))
            record_num = mySQL.insert("twitter_tweet_2008", tweets)
            print("Records written:", record_num)
            time.sleep(tool.get_scope_random(1))
    else:
        print("The ranking-board media list does not exist")