def running(self, pcode, pykm):
    """Collect the list of issues published by the given CNKI journal.

    :param pcode: <str> code of the CNKI database the journal belongs to
    :param pykm: <str> CNKI journal ID (primary ID)
    :return: <list:dict> issue list, items like {"year": "2019", "issue": "08"}
    """
    issue_list = []
    ajax_url = "http://navi.cnki.net/knavi/JournalDetail/GetJournalYearList?pcode={}&pykm={}&pIdx=0".format(
        pcode, pykm)
    # Request the Ajax endpoint that returns the journal's issue list
    if html_text := tool.do_request(ajax_url).content.decode(
            errors="ignore"):
        bs = BeautifulSoup(html_text, "lxml")  # parse the Ajax response
        for journal_label in bs.select(
                "#page1 > div > dl > dd > a"):  # one <a> tag per issue
            # Issue ids look like "yq201908": 4-digit year + 2-digit issue number
            if match := re.search(
                    "[0-9]{6}", journal_label["id"]):
                journal_name = match.group()
                issue_list.append({
                    "year": journal_name[0:4],
                    "issue": journal_name[4:]
                })
    # BUG FIX: the original built issue_list but never returned it (the method
    # always returned None), contradicting its own docstring.
    return issue_list
def running(self, page_url) -> List[Dict]:
    """Resolve a video page into its title and a direct download URL.

    :param page_url: URL of the video's web page
    :return: one-element list: [{"Title": ..., "Download Url": ...}]
    """
    # Download the page and pull the identifiers embedded in its markup
    page_html = tool.do_request(page_url, headers=self._HEADERS).text
    title = re.search(r"(?<=<title>)[^<]+(?=</title>)", page_html).group()
    video_id = re.search(r"(?<=\"vid\":\")\d+(?=\",)", page_html).group()
    resource_id = re.search(r"(?<=\"ac\":\")\d+(?=\",)", page_html).group()
    # Ask the download API for the playable stream URL
    download_api = self._DOWNLOAD_URL.format(video_id, resource_id)
    play_info = tool.do_request(download_api, headers=self._HEADERS).json()
    video_url = play_info["playInfo"]["streams"][0]["playUrls"][0]
    return [{"Title": title, "Download Url": video_url}]
def running(self, start_date, end_date) -> List[Dict]:
    """Collect match results for every date between start_date and end_date.

    Each request to the schedule API returns a whole week of data, so only one
    timestamp per week (plus the earliest day) is actually requested.

    :param start_date: <datetime> oldest date to collect (inclusive)
    :param end_date: <datetime> newest date to collect (inclusive)
    :return: <list:dict> one dict per match inside the date range
    """
    result = []
    # Build the full list of dates to collect, newest first, as "YYYYMMDD" strings
    all_date_list = list()
    curr_date = end_date
    while curr_date >= start_date:
        all_date_list.append(curr_date.strftime("%Y%m%d"))
        curr_date += datetime.timedelta(days=-1)
    # print("total dates to collect:", len(all_date_list), all_date_list)
    if len(all_date_list) == 0:  # nothing to collect: stop immediately
        return result
    # Pick the dates that actually need a request (one per week + the earliest day)
    # NOTE(review): weekday() == 0 is MONDAY in Python; the original comment
    # claimed this selects Sundays — confirm which day the API keys a week on.
    need_date_list = list()
    for curr_date_str in all_date_list:
        curr_date = datetime.datetime.strptime(curr_date_str, "%Y%m%d")
        if curr_date.weekday() == 0:
            need_date_list.append(curr_date_str)
    need_date_list.append(all_date_list[-1])  # always include the earliest day
    # print("total timestamps to request:", len(need_date_list))
    # Re-format the bounds as strings so they can be compared numerically below
    end_date = end_date.strftime("%Y%m%d")
    start_date = start_date.strftime("%Y%m%d")
    # Fetch the schedule for each needed timestamp
    for i in range(len(need_date_list)):
        curr_date_str = need_date_list[i]
        # print("requesting timestamp:", i + 1, "/", len(need_date_list), "(", curr_date_str, ")")
        # Seconds since the Unix epoch for that date, sent as a string form field
        curr_date_timestamp = str((datetime.datetime.strptime(curr_date_str, "%Y%m%d") - datetime.datetime(1970, 1, 1)).total_seconds())
        self._DATE_LIST_DATA["time"] = curr_date_timestamp  # form data for the list request
        response = tool.do_request(self._DATE_LIST_URL, method="post", headers=self._DATE_LIST_HEADERS, data=self._DATE_LIST_DATA)
        # print(response.content)
        if response.status_code == 200:
            response_json = json.loads(response.content.decode())
            # print(response_json)
            # The response holds one entry per day of the week; keep days in range
            for curr_date_str, date_info in response_json["data"]["scheduleList"].items():
                if int(end_date) >= int(curr_date_str) >= int(start_date):
                    if date_info["list"]:  # skip days without matches
                        for match in date_info["list"]:
                            result.append({
                                "date": curr_date_str,
                                "race_id": match["scheduleid"],
                                "team_a_name": match["oneseedname"],
                                "team_b_name": match["twoseedname"],
                                "start_time": match["starttime"],
                                "team_a_score": match["onewin"],
                                "team_b_score": match["twowin"],
                                "contest_name": match["ename"],
                                "match_name": match["groupname"],
                                "team_a_score_per": match["oneScore"],
                                "team_b_score_per": match["twoScore"],
                            })
        time.sleep(5)  # throttle between requests
    return result
def running(self, race_id):
    """Collect the match ids that belong to one race page.

    :param race_id: id of the race to query
    :return: one-element list: [{"race_id": ..., "match_id_list": [...]}]
    """
    response = tool.do_request(self._RACE_LIST_URL % race_id,
                               headers=self._RACE_LIST_HEADERS)
    soup = BeautifulSoup(response.content.decode(), 'lxml')
    anchors = soup.select(
        "body > div > div.content > div.left > div:nth-child(1) > div > a")
    # Keep only the anchors that actually carry a match-id attribute
    match_id_list = [anchor["data-matchid"]
                     for anchor in anchors
                     if anchor.has_attr("data-matchid")]
    return [{"race_id": race_id, "match_id_list": match_id_list}]
def spider():
    """Crawl the finished-event listing pages year by year and insert each
    event's name, date range and site-specific id into the MySQL `event` table.
    """
    # Connect to the MySQL database
    mysql = tool.db.MySQL(host=MYSQL_HOST, database=MYSQL_DATABASE, user=MYSQL_USER, password=MYSQL_PASSWORD)
    # Iterate over all configured years
    for year, page_num in YEAR_PAGE_NUM_LIST:
        # Iterate over every listing page of the year
        for page in range(1, page_num + 1):
            print("当前采集:", year, "-", page)
            data_list = []
            # Url query parameters for the listing request
            url_params = {
                "t": 3,  # t=3 : events that have already finished
                "year": year,
                "page": page
            }
            # Execute the request
            response = tool.do_request(EVENT_LIST_PAGE_URL, params=url_params)
            # Parse the returned page
            lxml = BeautifulSoup(response.text, "lxml")
            # Iterate over the outer tag of each event
            for label in lxml.select(
                    "#info > div.left-slide > div.left-box > div.event-list > ul > li"
            ):
                wanplus_event_id = re.search(
                    "[0-9]+",
                    label.select_one("li > a")["href"]).group()  # site-specific event id
                event_name = label.select_one("li > a > span").text  # event name
                time_frame = label.select_one(
                    "li > a > p:nth-child(3)").text  # date-range text
                start_date, end_date = time_frame.split(" — ")  # event start / end dates
                data_list.append({
                    "event_name": event_name,
                    "start_date": start_date,
                    "end_date": end_date,
                    "wanplus_event_id": wanplus_event_id
                })
            # Write this page's events into MySQL
            mysql.insert("event", data_list)
            time.sleep(3)  # throttle between pages
def running(self, race_id, match_id):
    """Fetch the detail JSON for one match of a race.

    :param race_id: id of the race (used for the referer header)
    :param match_id: id of the match to fetch
    :return: one-element list with race_id, match_id and the parsed detail
    """
    # Point the referer at the race page (the endpoint appears to expect it)
    self._MATCH_LIST_HEADERS[
        "referer"] = self._MATCH_LIST_REFERER % race_id
    response = tool.do_request(self._MATCH_LIST_URL % match_id,
                               headers=self._MATCH_LIST_HEADERS)
    match_detail = json.loads(response.content.decode())
    return [{
        "race_id": race_id,
        "match_id": match_id,
        "match_detail": match_detail
    }]
def running(self):
    """Scrape page 1 of the Qidian collection ranking and return book names.

    :return: <list:dict> one dict per book: {"book_name": ...}
    """
    response = tool.do_request(
        "https://www.qidian.com/rank/collect?style=2&chn=-1&page=1",
        headers=self._HEADERS)
    soup = BeautifulSoup(response.content.decode(), "lxml")
    # One table row per ranked book; strip the decorative corner brackets
    return [{
        "book_name": row.select_one("tr > td:nth-child(2)").text.replace("「", "").replace("」", "")
    } for row in soup.select("#rank-view-list > div > table > tbody > tr")]
def running(self, schedule_id, match_id):
    """Fetch the raw match JSON text for one match.

    Returns None when the body is empty, looks like an HTML page rather
    than JSON, or the API rejected the request (ret == -400); otherwise
    returns the undecoded JSON text.
    """
    response = tool.do_request(self._MATCH_URL % str(match_id),
                               headers=self._MATCH_HEADERS)
    response_text = response.content.decode()
    # Empty body, or an HTML error page instead of JSON: give up
    if not response_text or response_text.startswith("<"):
        return
    # The API flags requests of unknown origin with ret == -400
    if json.loads(response_text)["ret"] == -400:
        return
    return response_text
def running(self):
    """Scrape the Weibo trending-search board.

    NOTE(review): as visible in this excerpt the function ends inside the
    loop — `ranking` is computed but never used and `hot_list` is never
    returned; the remainder of the implementation appears to be missing.
    """
    # Request the trending-search board page
    response = tool.do_request("https://s.weibo.com/top/summary", headers=self._HEADERS)
    bs = BeautifulSoup(response.content.decode(errors="ignore"), "lxml")  # parse the page
    hot_list = []
    empty_rank = 0  # count of empty (advertisement) entries seen so far
    for label_item in bs.select("#pl_top_realtimehot > table > tbody > tr"):  # one row per entry
        # Extract the entry's rank from the first cell
        if label_rank := label_item.select_one("tr > td.td-01"):
            if len(label_rank.text) == 0:
                continue
            if match_rank := re.search("[0-9]+", label_rank.text):
                # Offset the rank by the number of ad rows skipped so far
                ranking = int(match_rank.group()) - empty_rank
            else:
                tool.console("报错", "提取的热搜排名不包含数字!")
                continue
def running(self, journal, pcode, pykm, year, issue):
    """Collect the article list of one journal issue on CNKI.

    :param journal: <str> journal name
    :param pcode: <str> code of the CNKI database the journal belongs to
    :param pykm: <str> CNKI journal ID (primary ID)
    :param year: <int> year the issue belongs to
    :param issue: <int> ordinal of the issue within the year
    :return: <list:dict> article list of the given issue

    NOTE(review): as visible in this excerpt the function ends right after
    extracting db_name — nothing is appended to article_list and nothing is
    returned; the rest of the implementation appears to be missing.
    """
    ajax_url = "http://navi.cnki.net/knavi/JournalDetail/GetArticleList?year={}&issue={}&pykm={}&pageIdx=0&pcode={}"
    ajax_url = ajax_url.format(year, issue, pykm, pcode)
    response = tool.do_request(ajax_url)  # request the article-list Ajax endpoint
    html_text = response.content.decode(errors="ignore")
    # NOTE(review): decode() never returns None, so this guard is dead code
    if html_text is None:
        return None
    bs = BeautifulSoup(html_text, "lxml")  # parse the Ajax response
    article_list = []  # article list to return
    now_column = None  # column (section) currently being read
    label_wrapper = bs.select_one("html > body")
    if label_wrapper is None:
        return None
    # The body alternates <dt> (column heading) and <dd> (article) children
    for article_label in bs.select_one(
            "html > body").children:
        if article_label.name == "dt":
            now_column = article_label.get_text()
        elif article_label.name == "dd":
            title = re.sub(
                r"\s", "",
                article_label.select_one("dd > span > a").text)  # article title
            href = article_label.select_one("dd > span > a")[
                "href"]  # article link
            # Pull the identifying query parameters out of the article link;
            # skip the article if any of them is absent
            if match := re.search("(?<=dbCode=)[^&]+(?=&)", href):
                db_code = match.group()
            else:
                continue
            if match := re.search("(?<=filename=)[^&]+(?=&)", href):
                file_name = match.group()
            else:
                continue
            if match := re.search("(?<=tableName=)[^&]+(?=&)", href):
                db_name = match.group()
def running(self, mid):
    """Collect title and play count of every video uploaded by a Bilibili user.

    :param mid: user (UP) id
    :return: <list:dict> one dict per video: {"video_title": ..., "video_play": ...}
    """
    # BUG FIX: the original used str.format() on a template containing a %s
    # placeholder ("...%s/video".format(...)), which performs NO substitution
    # and left a literal "%s" in the referer; use %-formatting instead.
    self.headers["referer"] = "https://space.bilibili.com/%s/video" % str(mid)
    # NOTE(review): the referer is stored on self.headers but the request below
    # sends self._HEADERS — confirm these refer to the same mapping.
    now_page = 1
    max_page = 2  # placeholder; replaced by the real page count after the first response
    video_list = []
    while now_page <= max_page:
        print("正在请求第", now_page, "页......")
        # Query parameters for the space video-search API
        param_dict = {
            "mid": mid,
            "ps": 30,  # page size
            "tid": 0,
            "pn": now_page,  # current page number
            "keyword": "",
            "order": "pubdate",
            "jsonp": "jsonp",
        }
        response = tool.do_request(
            "https://api.bilibili.com/x/space/arc/search?" +
            urlencode(param_dict),
            headers=self._HEADERS)
        response_json = response.json()
        now_page += 1
        # The total video count determines how many pages must be fetched
        max_page = math.ceil(response_json["data"]["page"]["count"] / 30)
        for video_item in response_json["data"]["list"]["vlist"]:
            video_list.append({
                "video_title": video_item["title"],  # video title
                "video_play": video_item["play"]  # play count
            })
        time.sleep(5)  # throttle between pages
    return video_list
def running(self, params_item):
    """Fetch one page of the total tier ranking list.

    :param params_item: sequence whose first three items are area_id, offset, sign
    :return: <list:dict> normalized player records, or None when the response
        lacks the expected structure
    """
    response = tool.do_request(
        url=
        "https://qt.qq.com/lua/mlol_battle_info/get_total_tier_rank_list",
        params={
            "area_id": str(params_item[0]),
            "offset": str(params_item[1]),
            "sign": str(params_item[2])
        },
        verify=False)
    payload = response.json()
    if "data" not in payload or "player_list" not in payload["data"]:
        return None
    # Every field read below must be present, otherwise the record is skipped
    required_keys = ("tier_title", "name", "uuid", "ranking", "league_points")
    summoners = []
    for player in payload["data"]["player_list"]:
        if any(key not in player for key in required_keys):
            continue
        summoners.append({
            "tier": player["tier_title"],
            "name": player["name"],
            "uuid": player["uuid"],
            "area": 1,
            "ranking": player["ranking"],
            "points": player["league_points"]
        })
    return summoners
def running(self, skey, exploit_id, end_time, user_id):
    """Fetch the detail of one match (scene "tft_mlol") and flatten it into a
    single output row.

    :param skey: login-state SKEY cookie value (captured via emulator + Fiddler,
        valid roughly 24 hours)
    :param exploit_id: id of the match to fetch
    :param end_time: match end time, passed through to the API
    :param user_id: id of the player whose record is requested
    :return: <list:dict> a one-element list with the flattened match row, or
        False when any expected field is missing from the response
    """
    # Request the summoner's match detail
    response = tool.do_request(
        url=
        "https://mlol.qt.qq.com/gorpc/exploit/exploit/query_exploit_detail/proxy",
        params={
            "exploit_id": exploit_id,
            "game_area": "1",
            "scene": "tft_mlol",
            "user_id": user_id,
            "endtime": end_time,
            "plat": "android",
            "version": "9914"
        },
        headers=self._HEADERS,
        cookies={
            "l_uin": "o12345678",  # "o" + QQ number
            "p_uin": "o12345678",  # "o" + QQ number
            "uin": "o12345678",  # "o" + QQ number
            "skey": skey
        },  # login-state SKEY (captured via emulator + Fiddler, valid ~24 hours)
        verify=False)
    exploit = response.json()
    # Defensively validate every field read below; bail out with False on any gap
    if "info" not in exploit:
        print("Missing: info")
        return False
    if "exploit_detail" not in exploit["info"]:
        print("Missing: info - exploit_detail")
        return False
    if exploit["info"]["exploit_detail"] is None:
        print("Missing: info - exploit_detail - is None")
        return False
    if "exploit_id" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - exploit_id")
        return False
    if "end_time" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - end_time")
        return False
    if "game_match_type" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - game_match_type")
        return False
    if "duration" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - duration")
        return False
    if "specific_user_exploit" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - specific_user_exploit")
        return False
    if "buff_version" not in exploit["info"]["exploit_detail"][
            "specific_user_exploit"]:
        print(
            "Missing: info - exploit_detail - specific_user_exploit - buff_version"
        )
        return False
    if "member_exploit_list" not in exploit["info"]["exploit_detail"]:
        print("Missing: info - exploit_detail - member_exploit_list")
        return False
    # One slot per possible ranking position (8 players per lobby)
    member_list = [{} for _ in range(8)]
    member_info_list = [[] for _ in range(8)]
    for member_exploit in exploit["info"]["exploit_detail"][
            "member_exploit_list"]:
        if "user_id" not in member_exploit:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - user_id"
            )
            return False
        if "nickname" not in member_exploit:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - nickname"
            )
            return False
        if "ranking" not in member_exploit:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - ranking"
            )
            return False
        if "piece_list" not in member_exploit:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - piece_list"
            )
            return False
        if "game_rank_list" not in member_exploit:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - game_rank_list"
            )
            return False
        if "full_rank_title" not in member_exploit["game_rank_list"][0]:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - game_rank_list - full_rank_title"
            )
            return False
        ranking = int(member_exploit["ranking"])
        # Rankings outside the 1..8 range invalidate the whole match
        if ranking < 1 or ranking > 8:
            return False
        # Store each member indexed by final ranking (1-based -> 0-based)
        member_list[ranking - 1] = {
            "user_id": member_exploit["user_id"],
            "nickname": member_exploit["nickname"],
            "rank_title":
            member_exploit["game_rank_list"][0]["full_rank_title"]
        }
        # A null piece list is tolerated: the member keeps an empty unit list
        if member_exploit["piece_list"] is None:
            print(
                "Missing: info - exploit_detail - member_exploit_list - - piece_list - is None"
            )
            continue
        for piece in member_exploit["piece_list"]:
            if "star_num" not in piece:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - piece_list - - star_num"
                )
                return False
            if "hero_name" not in piece:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - piece_list - - hero_name"
                )
                return False
            member_info_list[ranking - 1].append(
                [piece["hero_name"], piece["star_num"]])
    # Flatten everything into a single row (nested lists stored as JSON text)
    exploit_write = list()
    exploit_write.append({
        "exploit_id": exploit["info"]["exploit_detail"]["exploit_id"],
        "end_time": exploit["info"]["exploit_detail"]["end_time"],
        "game_match_type":
        exploit["info"]["exploit_detail"]["game_match_type"],
        "duration": exploit["info"]["exploit_detail"]["duration"],
        "version": exploit["info"]["exploit_detail"]["specific_user_exploit"]
        ["buff_version"],
        "member_list": json.dumps(member_list, ensure_ascii=False),
        "member_1": json.dumps(member_info_list[0], ensure_ascii=False),
        "member_2": json.dumps(member_info_list[1], ensure_ascii=False),
        "member_3": json.dumps(member_info_list[2], ensure_ascii=False),
        "member_4": json.dumps(member_info_list[3], ensure_ascii=False),
        "member_5": json.dumps(member_info_list[4], ensure_ascii=False),
        "member_6": json.dumps(member_info_list[5], ensure_ascii=False),
        "member_7": json.dumps(member_info_list[6], ensure_ascii=False),
        "member_8": json.dumps(member_info_list[7], ensure_ascii=False)
    })
    return exploit_write
def running(self, skey, summoner, start_time, end_time) -> List[Dict]:
    """Collect a summoner's match list between start_time and end_time.

    Pages through the match-list API (at most 100 requests) using the "baton"
    cursor returned by each response.

    :param skey: login-state SKEY cookie value
    :param summoner: user id of the summoner to query
    :param start_time: lower bound — matches ending before it are not collected
    :param end_time: upper bound — only matches ending before it are collected
    :return: <list:dict> one dict per match
    """
    params = {
        "user_id": summoner,
        "scene": "tft_mlol",
        "plat": "android",
        "version": "9914",
        "game_area": "1",
        "login_account_type": "1"
    }
    next_baton = None  # paging cursor returned by the previous response
    exploit_list = []
    # Execute at most 100 paged requests
    for i in range(100):
        print("执行第", i + 1, "次请求...")
        if next_baton:
            params["baton"] = next_baton
        # Request the summoner's match list (cookies required; p_skey is not
        # needed; the baton parameter controls paging)
        response = tool.do_request(
            url=
            "https://mlol.qt.qq.com/gorpc/exploit/exploit/query_player_exploit_list/proxy",
            params=params,
            headers=self._HEADERS,
            cookies={
                "l_uin": "o13578660",
                "p_uin": "o13578660",
                "uin": "o13578660",
                "skey": skey
            },
            verify=False)
        exploit_json = response.json()
        # Stop paging when the response is missing the expected structure
        if "info" not in exploit_json:
            break
        if "exploit_list" not in exploit_json["info"]:
            break
        if "next_baton" not in exploit_json["info"]:
            break
        next_baton = exploit_json["info"]["next_baton"]
        if exploit_json["info"]["exploit_list"] is None:
            break
        for exploit_item in exploit_json["info"]["exploit_list"]:
            # Skip malformed records
            if "exploit_id" not in exploit_item:
                continue
            if "end_time" not in exploit_item:
                continue
            if "game_match_type" not in exploit_item:
                continue
            if "specific_user_exploit" not in exploit_item:
                continue
            if "user_id" not in exploit_item["specific_user_exploit"]:
                continue
            # Presumably the list is ordered newest-first, so everything after
            # a too-old match is also too old.
            # NOTE(review): this break only exits the inner loop — the outer
            # loop keeps paging (up to 100 requests) even once start_time has
            # been passed; confirm whether that is intended.
            if exploit_item["end_time"] < start_time:
                break
            if exploit_item["end_time"] < end_time:
                exploit_list.append({
                    "exploit_id": exploit_item["exploit_id"],
                    "end_time": exploit_item["end_time"],
                    "user_id":
                    exploit_item["specific_user_exploit"]["user_id"],
                    "game_match_type": exploit_item["game_match_type"]
                })
    return exploit_list
def spider():
    """For each event read from MySQL, collect its stages and the schedules
    inside every stage, writing them to the `stage` and `schedule` tables."""
    # Connect to the MySQL database
    mysql = tool.db.MySQL(host=MYSQL_HOST, database=MYSQL_DATABASE, user=MYSQL_USER, password=MYSQL_PASSWORD)
    # Read the event list from the database
    event_list = mysql.select(
        "event",
        columns=["event_id", "event_name", "wanplus_event_id"],
        where="event_id>=251")
    # Iterate over all events
    for event_id, event_name, wanplus_event_id in event_list:
        print("当前采集:", event_id, "-", event_name, "-", wanplus_event_id)
        # ----- Collect the event's stage list -----
        # Execute the request
        response = tool.do_request(EVENT_INFO_URL.format(wanplus_event_id))
        time.sleep(5)  # throttle between requests
        # Parse the returned page
        lxml = BeautifulSoup(response.text, "lxml")
        # Extract every stage id and stage name
        stage_id_list = []
        stage_name_list = []
        for label in lxml.select("#event_stage > li"):
            stage_id_list.append(int(label["data-stageid"]))
            stage_name_list.append(label.text.replace("\n", "").lstrip())
        # Write the stage info into the database
        data_list_1 = []
        for i in range(len(stage_id_list)):
            data_list_1.append({
                "event_id": event_id,
                "stage_name": stage_name_list[i],
                "wanplus_stage_id": stage_id_list[i]
            })
        mysql.insert("stage", data_list_1)
        # ----- Iterate over the schedules contained in each stage -----
        for stage_id in stage_id_list:
            print("当前采集赛段:", stage_id)
            data_list_2 = []
            # Form data for the stage-info request
            url_params = {
                "_gtk": 868258461,  # treated as a constant
                "eId": wanplus_event_id,
                "stageId": stage_id,
                "gameType": 2  # treated as a constant
            }
            # Execute the request
            response = tool.do_request(STAGE_INFO_URL, method="POST", data=url_params)
            time.sleep(3)  # throttle between requests
            # Parse the returned page
            lxml = BeautifulSoup(response.text, "lxml")
            # Extract the schedule ids inside the stage
            for label in lxml.select("a"):
                wanplus_schedule_id = int(
                    re.search("[0-9]+", label["href"]).group())
                data_list_2.append({
                    "event_id": event_id,
                    "stage_id": stage_id,
                    "wanplus_schedule_id": wanplus_schedule_id
                })
            # Write the schedules into MySQL
            mysql.insert("schedule", data_list_2)
def running(self):
    """Scrape the Douban Top 250 movie list (10 pages x 25 movies).

    :return: <list:dict> one dict per movie with url, titles, director, year,
        country, classify, rating_num, rating_people and quote
    """
    movie_list = []
    for page_num in range(10):
        url = "https://movie.douban.com/top250?start={0}&filter=".format(
            page_num * 25)
        response = tool.do_request(url, headers=self._HEADERS)
        bs = BeautifulSoup(response.content.decode(errors="ignore"), 'lxml')
        for movie_label in bs.select(
                "#content > div > div.article > ol > li"):  # one <li> per movie
            # Movie link (href of the poster <a> tag)
            url = movie_label.select_one("li > div > div.pic > a")["href"]
            # Title line: Chinese title first, alternative titles separated by "/"
            title_text = movie_label.select_one(
                "li > div > div.info > div.hd > a").text.replace(
                    "\n", "")
            title_chinese = title_text.split(
                "/")[0].strip()
            title_other = [
                title.strip() for title in title_text.split("/")[1:]
            ]
            # Info block: line 1 = director/cast, line 2 = year / country / genres
            # (cast names are usually truncated and are not parsed here)
            info_text = movie_label.select_one(
                "li > div > div.info > div.bd > p:nth-child(1)"
            ).text
            info_text = re.sub("\n *", "\n", info_text)  # strip leading spaces per line
            info_text = re.sub("^\n", "", info_text)  # drop the leading blank line
            info_line_1, info_line_2 = info_text.split("\n")[
                0:2]
            director = re.sub(" *(主演|主\\.{3}|\\.{3}).*$", "",
                              info_line_1)  # keep only the director part
            year = int(
                re.search(
                    "[0-9]+", info_line_2.split("/")[0]).group())  # release year
            country = info_line_2.split("/")[1].strip() if len(
                info_line_2.split("/")) >= 2 else None  # country, when present
            classify = info_line_2.split("/")[2].strip() if len(
                info_line_2.split("/")) >= 3 else None  # genre field, when present
            # BUG FIX: the original called re.split(" +", classify) unconditionally,
            # raising TypeError whenever the genre field was absent (classify is None).
            classify = re.split(" +", classify) if classify is not None else None
            # Rating (float)
            rating_num = movie_label.select_one(
                "li > div > div.info > div.bd > div > span.rating_num"
            ).text
            rating_num = float(re.search(
                "[0-9.]+", rating_num).group())
            # Number of raters (int)
            rating_people = movie_label.select_one(
                "li > div > div.info > div.bd > div > span:nth-child(4)"
            ).text
            rating_people = int(
                re.search("[0-9]+", rating_people).group())
            # Quote (the tag may be absent)
            if quote_label := movie_label.select_one(
                    "li > div > div.info > div.bd > p.quote"):
                quote = quote_label.text.replace("\n", "")
            else:
                quote = None
            movie_list.append({
                "url": url,
                "title_chinese": title_chinese,
                "title_others": title_other,
                "director": director,
                "year": year,
                "country": country,
                "classify": classify,
                "rating_num": rating_num,
                "rating_people": rating_people,
                "quote": quote
            })
        time.sleep(5)  # throttle between pages
    # BUG FIX: the original never returned movie_list (the method always
    # returned None despite building the full result).
    return movie_list
def running(self, start_date, end_date) -> List[Dict]:
    """Collect schedule (match) data between start_date and end_date.

    Each request returns the whole week containing the requested day, so only
    one day per week plus the (possibly partial) latest week is requested.

    :param start_date: <datetime> oldest date to collect (inclusive)
    :param end_date: <datetime> newest date to collect (inclusive)
    :return: <list:dict> one dict per match
    """
    # Initialize the result
    result = []
    # ----- Compute the full list of dates to collect -----
    all_date_list = []  # dates to collect, newest first, as "YYYYMMDD" strings
    curr_date = end_date
    while curr_date >= start_date:
        all_date_list.append(curr_date.strftime("%Y%m%d"))
        curr_date += timedelta(days=-1)
    # Nothing to collect: stop here
    if len(all_date_list) == 0:
        return result
    # ----- Compute the dates that actually need a request -----
    # Each request returns the 7 days of the week containing the requested day,
    # so one request per week suffices, plus the not-yet-complete latest week.
    # NOTE(review): weekday() == 0 is MONDAY in Python; the original comments
    # said Sunday — confirm which day the API keys a week on.
    need_date_list = []
    # If the latest week has not reached its anchor day, request its newest day
    if datetime.strptime(all_date_list[0], "%Y%m%d").weekday() != 0:
        need_date_list.append(all_date_list[0])
    # Add the anchor day of every earlier week
    for curr_date in all_date_list:
        if datetime.strptime(curr_date, "%Y%m%d").weekday() == 0:
            need_date_list.append(curr_date)
    # ----- Fetch match data for each needed timestamp -----
    for i in range(len(need_date_list)):
        print("当前抓取:", self.format_date(need_date_list[i]), "(", i + 1, "/", len(need_date_list), ")")
        curr_date = need_date_list[i]
        # Compute the request parameter (Unix timestamp of the day, as a string)
        curr_date_timestamp = str((datetime.strptime(curr_date, "%Y%m%d") - datetime(1970, 1, 1)).total_seconds())
        self._DATE_LIST_DATA["time"] = curr_date_timestamp
        response = tool.do_request(self._DATE_LIST_URL, method="post", headers=self._DATE_LIST_HEADERS, data=self._DATE_LIST_DATA)
        if response.status_code != 200:
            print("请求失败!")
            continue
        # Parse the response
        response_json = json.loads(response.content.decode())
        for curr_date, date_info in response_json["data"][
                "scheduleList"].items():  # iterate over each day of the week
            print("当前抓取日期:", self.format_date(curr_date))
            # Keep only days inside the requested range
            if int(start_date.strftime("%Y%m%d")) <= int(curr_date) <= int(
                    end_date.strftime("%Y%m%d")):
                if date_info["list"]:  # does the day have matches?
                    for match in date_info["list"]:  # each match of the day
                        result.append({
                            "schedule_id": int(match["scheduleid"]
                                               ),  # schedule id (a whole BO1/BO3/BO5 series)
                            "date": self.format_date(curr_date),  # match date
                            "time": match["starttime"],  # match start time
                            "event_id": int(match["eid"]),  # event id
                            "event_name": match["ename"],  # event name
                            "event_group_name":
                            match["groupname"],  # event stage/group name
                            "stage_id": int(match["stageid"]),  # stage id (original note: possibly the event id)
                            "bo_num": int(match["bonum"]),  # series length: BO1=1, BO3=3, BO5=5
                            "team_a_id": int(match["oneseedid"]),  # team A id
                            "team_a_name": match["oneseedname"],  # team A name
                            "team_b_id": int(match["twoseedid"]),  # team B id
                            "team_b_name": match["twoseedname"],  # team B name
                            "team_a_win": int(match["onewin"]),  # games won by team A
                            "team_b_win": int(match["twowin"]),  # games won by team B
                            "team_a_score": str(match["oneScore"]),  # per-game scores of team A
                            "team_b_score": str(match["twoScore"]),  # per-game scores of team B
                        })
        # Throttle between requests
        time.sleep(5)
    return result
def running(self, schedule_id: int):
    """Collect the individual games of one schedule page together with their
    shared metadata (event, teams, date/time, series score) and pass the rows
    to self.output.

    :param schedule_id: <int> id of the schedule (a whole BO1/BO3/BO5 series)
    """
    result = []  # output rows, one per game
    # ----- Build the request parameters and execute the request -----
    response = tool.do_request(self._RACE_LIST_URL % str(schedule_id),
                               headers=self._RACE_LIST_HEADERS)
    if response.status_code != 200:
        print("请求失败!")
        return result
    bs = BeautifulSoup(response.content.decode(), "lxml")
    # ----- Check whether the returned page is usable -----
    label = bs.select_one("body")
    # Schedule not played yet ("matchbf" body class): bail out
    if label.has_attr("class") and "matchbf" in label["class"]:
        return
    # Page-not-found state ("mess_html" body class): bail out
    if label.has_attr("class") and "mess_html" in label["class"]:
        return
    # ----- Parse the schedule's basic info -----
    # Event id and event name
    selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > h1 > a"
    label = bs.select_one(selector)
    event_id = int(re.search("[0-9]+", label["href"]).group())  # event id
    event_name = label.text  # event name
    # Schedule date and time (empty strings when no time is shown)
    selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li:nth-child(2) > span.time"
    label = bs.select_one(selector)
    if " " in label.text:
        schedule_date, schedule_time = label.text.split(" ")[
            0:2]  # schedule date, schedule time
    else:
        schedule_date, schedule_time = "", ""
    # Both teams' id and name
    selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li.team-left > a"
    label = bs.select_one(selector)
    team_a_id = int(re.search("[0-9]+", label["href"]).group())  # team A id
    team_a_name = label.text.replace("\n", "")  # team A name
    selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li.team-right.tr > a"
    label = bs.select_one(selector)
    team_b_id = int(re.search("[0-9]+", label["href"]).group())  # team B id
    team_b_name = label.text.replace("\n", "")  # team B name
    # Series score; the BO count is estimated as the larger of the two scores
    selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li:nth-child(2) > p"
    marks = bs.select_one(selector).text.split(":")
    team_a_win, team_b_win = int(marks[0]), int(marks[1])  # team A / team B game wins
    schedule_bo_num = max(team_a_win, team_b_win)  # series format (BO count)
    # ----- Parse the individual games of the schedule -----
    game_labels = bs.select(
        "body > div > div.content > div.left > div:nth-child(1) > div > a")
    for game_label in game_labels:
        # Only anchors carrying a match id represent actual games
        if game_label.has_attr("data-matchid"):
            result.append({
                "match_id": game_label["data-matchid"],  # game id
                "schedule_id": schedule_id,  # schedule id
                "schedule_date": schedule_date,
                "schedule_time": schedule_time,
                "schedule_bo_num": schedule_bo_num,
                "event_id": event_id,
                "event_name": event_name,
                "team_a_id": team_a_id,
                "team_a_name": team_a_name,
                "team_b_id": team_b_id,
                "team_b_name": team_b_name,
                "team_a_win": team_a_win,
                "team_b_win": team_b_win
            })
    # Emit the collected rows
    self.output(result)