def getting_players_info(self, nodes):
    """Scrape the player roster for every team node and return player dicts.

    For each node in *nodes* the team roster page (``node.url``) is fetched,
    each table row is parsed into a player record, and the player's own page
    is fetched to extract nationality.

    Fixes over the previous version:
    * players are accumulated into ``result`` and all of them are returned
      (previously ``data`` was overwritten each iteration, so only the last
      player survived and ``result`` was dead code);
    * a failed player-page request now yields ``nationality = None``
      (previously the ``Exception`` class object itself was stored);
    * the player-page request no longer shadows the roster-page ``request``.

    :param nodes: iterable of team nodes; each has ``url`` and ``data['id']``.
    :return: list of player dicts (possibly partial if a request fails).
    """
    result = []
    for node in nodes:
        request = parsing_functions.get_request(node.url)
        if request is None:
            # Network failure: return what we have collected so far.
            return result
        try:
            soup = bs(request.text, 'html.parser')
            content = soup.find(
                'div', 'js-tournament-filter-content').tbody.findAll('tr')
            for item in content:
                name = item.find(attrs={'class': 'table-item__name'}).text.strip()
                # Data labels are site-native Russian captions; they are part
                # of the scraped markup and must not be translated.
                role = item.find(attrs={'data-label': 'Амплуа'}).text.strip()
                birth = item.find(attrs={'data-label': 'ДР'}).text.strip()
                growth = item.find(attrs={'data-label': 'Рост'}).text.strip()
                weight = item.find(attrs={'data-label': 'Вес'}).text.strip()

                player_url = self.site + item.find('a', 'table-item')['href']
                player_request = parsing_functions.get_request(player_url)
                nationality = None
                if player_request is not None:
                    nationality = bs(player_request.text, 'html.parser') \
                        .find('div', text='Гражданство:').next_sibling.strip()

                result.append({
                    'id': None,
                    'team_id': node.data['id'],
                    'name': name,
                    'nationality': nationality,
                    'role': role,
                    'birth': birth,
                    'growth': growth,
                    'weight': weight
                })
        except Exception as e:
            print('Ошибка парсинга игроков')
            print(e)
    return result
def parse_teams(self, nodes):
    """Build a team ``Node`` for every team listed under each tournament node.

    Fetches ``node.url + 'teams'`` for each tournament node, creates a team
    node per ``teams-item__link`` anchor, wires parent/child links and bumps
    ``self.parsed_teams``. Stops entirely if a page request fails.
    """
    for node in nodes:
        request = parsing_functions.get_request(node.url + 'teams')
        if request is None:
            return
        try:
            soup = bs(request.text, 'html.parser')
            for item in soup.find_all('a', 'teams-item__link'):
                name = item.find('div', 'teams-item__name').get_text().strip()
                city = item.find('div', 'teams-item__country').get_text().strip()
                team = Node(self.ParsingTypes.team, {
                    'id': None,
                    'tournament_id': node.data['id'],
                    'name': name,
                    'city': city
                })
                # Roster page lives under players/, not result/.
                team.url = self.site + item['href'].replace('result/', 'players/')
                team.parents.append(node)
                node.add_node(team)
                self.parsed_teams += 1
        except Exception as e:
            print('Ошибка парсинга команд')
            print(e)
def GetMatchLinks(self, url):
    """Return absolute match-report links found on a tournament results page.

    :param url: results-page URL to scan for ``stat-results__link`` cells.
    :return: list of absolute URLs (``self.Site`` + anchor href).
    :raises Exception: propagates any parsing/request failure.

    Fix: the previous ``raise Exception(Err)`` wrapped the original error in
    a fresh ``Exception``, discarding its type and traceback. A bare
    ``raise`` re-raises the original exception unchanged, which any caller
    catching ``Exception`` still catches.
    """
    try:
        Request = parsing_functions.get_request(url)
        Soup = Bs(Request.text, 'html.parser')
        Items = Soup.find_all('td', 'stat-results__link')
        return [self.Site + item.a['href'] for item in Items]
    except Exception:
        raise
def parse_matches(self, url, parents):
    """Parse a tournament calendar table into match ``Node`` objects.

    Each data row yields one match node (date, teams, main score, optional
    penalty-shoot-out score) attached to every node in *parents* via
    ``add_player_node``. ``self.parsed_matches`` counts created matches.
    A single bad row aborts the remaining rows (the try wraps the loop).
    """
    request = parsing_functions.get_request(url)
    if request is None:
        return
    try:
        soup = bs4.BeautifulSoup(request.text, 'html.parser')
        rows = soup.find('div', 'sport__table__tstat').find_all('tr')
        rows.pop(0)  # first row is the table header
        match_nodes = []
        for row in rows:
            cells = row.find_all('td', 'sport__table__tstat__td')
            tour = cells[0].get_text().strip()
            match_date = cells[1].get_text().strip()
            team_links = cells[3].find_all('a')
            home = team_links[0].get_text().strip()
            guest = team_links[1].get_text().strip()

            # Score cell looks like "2:1" or "2:2 4:3" (main + penalties).
            score_parts = cells[4].a.get_text().strip().split(' ')
            main_score = score_parts[0].split(':')
            home_result = main_score[0]
            guest_result = main_score[1]
            penalty_home = None
            penalty_guest = None
            if len(score_parts) == 2:
                if re.match(r"\d:\d", score_parts[1]) is not None:
                    # Penalty shoot-out result
                    shootout = score_parts[1].split(':')
                    penalty_home = shootout[0]
                    penalty_guest = shootout[1]

            node = Node(self.ParsingTypes.match, {
                'id': 0,
                'home_team_id': 0,
                'guest_team_id': 0,
                'match_date': match_date,
                'home_team': home,
                'guest_team': guest,
                'home_score': home_result,
                'guest_score': guest_result,
                'home_penalty_score': penalty_home,
                'guest_penalty_score': penalty_guest
            })
            node.parents.extend(parents)
            for parent in parents:
                parent.add_player_node(node)
            self.parsed_matches += 1
    except Exception as e:
        print('Ошибка парсинга матчей')
        print(e)
def GetSeasons(url, proxy):
    """Return season ``value`` attributes from the tournament year selector.

    :param url: tournament page URL.
    :param proxy: proxy passed through to ``parsing_functions.get_request``.
    :return: list of season option values, or ``None`` when the selector or
        its options are absent.
    :raises Exception: wraps any request/parsing failure.
    """
    try:
        Request = parsing_functions.get_request(url, proxy)
        Soup = Bs(Request.text, 'html.parser')
        ContentTag = Soup.find('div', 'js-tournament-header-year')
        if not ContentTag:
            return None
        Items = ContentTag.find_all('option')
        if not Items:
            return None
        return [item['value'] for item in Items]
    except Exception as Err:
        raise Exception(Err)
def parse_tournaments(self, url):
    """Parse the tournament list page into tournament ``Node`` children of ``self.root``.

    For every ``mc-sport-tournament-list__item`` block the country title is
    read, then each anchor tagged ``data-type="tournament"`` yields one node
    with name, country and start/end dates (``%d.%m.%Y``). Each created node
    gets an absolute URL (``self.site`` + href), is attached to ``self.root``
    and bumps ``self.parsed_tournaments``. Errors are printed, not raised.
    """
    request = parsing_functions.get_request(url)
    if request is None:
        return
    try:
        soup = bs(request.text, 'html.parser')
        content = soup.find_all('div', 'mc-sport-tournament-list__item')
        # NOTE(review): find_all() returns [] (never None) when nothing
        # matches, so this guard can never fire — confirm intended check.
        if content is None:
            return
        for item in content:
            country = item.find(
                'div', 'item__title').get_text().lstrip().rstrip()
            html_links = item.find_all(attrs={"data-type": "tournament"})
            for html_link in html_links:
                # html_id = re.findall(r'/(\d+)', html_link['href'])[0]
                # html_type = re.findall(r'_\w+', html_link['href'])[0]
                t_name = html_link['data-title'].lstrip().rstrip()
                # Side effect: removes the separator span from the tree so it
                # does not pollute the dates lookup below.
                html_link.find('span', 'separator').extract()
                t_dates_html = html_link.findNext(
                    'div', 'item__dates _dates').findAll('span')
                # First span is the start date, second the end date.
                t_start_date = datetime.strptime(
                    t_dates_html[0].get_text().lstrip().rstrip(), "%d.%m.%Y")
                t_end_date = datetime.strptime(
                    t_dates_html[1].get_text().lstrip().rstrip(), "%d.%m.%Y")
                # tournament_url = html_type + "/" + html_id + "/tournir/info.html"
                # teams_url = self.common_url + html_type + "/" + html_id + "/teams.html"
                # plays_url = html_type + "/" + html_id + "/calendar.html"
                # plays_group_url = html_type + "/" + html_id + "/calendar/group.html"
                # plays_playoff_url = html_type + "/" + html_id + "/calendar/playoff.html"
                # plays_preliminary_url = html_type + "/" + html_id + "/calendar/preliminary.html"
                data = {
                    'id': None,
                    'name': t_name,
                    'country': country,
                    'start_date': t_start_date,
                    'end_date': t_end_date
                }
                node = Node(self.ParsingTypes.tournament, data)
                node.url = self.site + html_link['href']
                self.root.add_node(node)
                node.parents.append(self.root)
                self.parsed_tournaments += 1
    except Exception as e:
        print('Ошибка парсинга турниров')
        print(e)
def GetTournamentLinks(self, url):
    """Collect calendar links plus their match links for every tournament.

    :param url: season index page URL.
    :return: list of ``[calendar_url, match_links]`` pairs, one per anchor
        found in the tournament list.
    :raises Exception: wraps any request/parsing failure.
    """
    try:
        Request = parsing_functions.get_request(url)
        Soup = Bs(Request.text, 'html.parser')
        Content = Soup.find('div', 'mc-sport-tournament-list')
        Links = []
        for item in Content.find_all('a'):
            TournamentLink = self.Site + item['href'] + 'calendar'
            Links.append([TournamentLink, self.GetMatchLinks(TournamentLink)])
        return Links
    except Exception as Err:
        raise Exception(Err)
def ParseLinks(self, season=None):
    """Resolve tournament links for a given *season* from the start page.

    Looks up the season selector on ``self.StartUrl``, picks the option whose
    ``value`` equals *season*, and delegates to ``GetTournamentLinks``.
    Returns ``None`` when the selector or the season option is missing;
    returns ``[]`` when *season* is falsy.

    :raises Exception: with a descriptive message on any failure.
    """
    try:
        self.log.info("Getting links for parsing.")
        Request = parsing_functions.get_request(self.StartUrl)
        Soup = Bs(Request.text, 'html.parser')
        ContentTag = Soup.find('div', 'js-tournament-header-year')
        if ContentTag:
            Items = [
                option['data-href']
                for option in ContentTag.find_all('option')
                if option['value'] == season
            ]
            if Items:
                Links = []
                if season:
                    Links = self.GetTournamentLinks(self.Site + Items[0])
                return Links
    except Exception as Err:
        raise Exception(f"Error parsing season links: {Err}")
def parse_season(url):
    """Return ``{'url', 'season'}`` dicts for every season option on *url*.

    The season label is the first component of the option text (e.g. "2020"
    from "2020/2021"). Returns ``None`` when the request fails or on any
    parsing error (the error is printed).
    """
    try:
        request = parsing_functions.get_request(url)
        if request is None:
            return
        soup = bs4.BeautifulSoup(request.text, 'html.parser')
        content = soup.find(
            'div', 'js-tournament-header-year').find_all('option')
        if content is None:
            return
        return [
            {
                'url': "https://www.championat.com" + option['data-href'],
                'season': option.get_text().strip().split("/")[0]
            }
            for option in content
        ]
    except Exception as e:
        print(e)
        return
def parse_seasons(self):
    """Attach a ``SeasonNode`` to ``self.root`` for each season option.

    Reads the season selector from ``self.url``; each option contributes a
    season year (first component of its text, parsed as ``%Y``) and an
    absolute URL built from ``self.site``. Errors are printed, not raised.
    """
    request = parsing_functions.get_request(self.url)
    if request is None:
        return
    try:
        soup = bs4.BeautifulSoup(request.text, 'html.parser')
        content = soup.find(
            'div', 'js-tournament-header-year').find_all('option')
        if content is None:
            return
        for option in content:
            season_url = self.site + option['data-href']
            year_text = option.get_text().strip().split("/")[0]
            season = datetime.strptime(year_text, '%Y')
            self.root.add_node(SeasonNode(season, season_url))
    except Exception as e:
        print('Ошибка парсинга сезонов')
        print(e)
def ParseMatches(self, matchLink: str):
    """Parse one match report page into a tournament/match ``Node`` tree.

    Builds: tournament root -> match node -> team nodes -> (stats, lineups),
    then attaches goals/assists, punishments, missed penalties and shoot-out
    penalties to the matching lineup nodes, and finally fills the match
    node's data dict (scores, date, stage, stadium, referee).

    :param matchLink: absolute URL of the match report page.
    :return: the populated tournament root ``Node``, ``None`` when the teams
        block cannot be parsed, or implicitly ``None`` after a logged error.
    """
    Url = ''
    try:
        Root = Node(key=ParsingTypes.tournament)
        Url = matchLink
        Request = parsing_functions.get_request(Url)
        Soup = Bs(Request.text, 'html.parser')
        # Populate tournament data once from the page header.
        if not Root.data:
            Root.data = {k: v for k, v in self.GetTournament(Soup).items()}
        MatchNode = Node(key=ParsingTypes.match)
        ExtraInfoTag = Soup.find('div', 'match-info__extra')
        # The search strings are site-native Russian captions
        # ("Stadium:" / "Head referee:"); they must stay untranslated.
        Stadium: str = GetStadium(ExtraInfoTag.find(text=re.compile('Стадион:')))
        Referee: str = GetReferee(ExtraInfoTag.find(text=re.compile('Главный судья:')))
        StatTag = Soup.find('div', attrs={'data-type': 'stats'})
        Stats = GetMatchStat(StatTag, StatAssociations)
        Lineups = GetLineups(Soup)
        Teams = GetTeams(Soup.find('div', 'match-info__scoreboard'))
        if Teams is None:
            return None
        # Index 0 is the home team, index 1 the guest team.
        Teams[0].set_child(Stats['Home'])
        Teams[1].set_child(Stats['Guest'])
        MatchNode.AddChildren(Teams)
        # Comprehensions used purely for their set_child side effects.
        [Teams[0].set_child(lineup) for lineup in Lineups[0]]
        [Teams[1].set_child(lineup) for lineup in Lineups[1]]
        # Goals: the same goal dict backs both the Goal node (kicker) and the
        # Assist node (assistant), each attached under its lineup node.
        for goal in GetGoalsStat(StatTag):
            Player = MatchNode.search_node(goal['Kicker'], 'Name', ParsingTypes.Lineup)
            Assistant = MatchNode.search_node(goal['Assistant'], 'Name', ParsingTypes.Lineup)
            if Player:
                Goal = Node(key=ParsingTypes.Goal, data={k: v for k, v in goal.items()})
                Player.set_child(Goal)
            if Assistant:
                Assist = Node(key=ParsingTypes.Assist, data={k: v for k, v in goal.items()})
                Assistant.set_child(Assist)
        # Cards / punishments, matched to lineups by player name.
        for punish in GetPunishmentsStat(StatTag):
            PunishNode = Node(key=ParsingTypes.Punishment, data={k: v for k, v in punish.items()})
            Player = MatchNode.search_node(punish['Player'], 'Name', ParsingTypes.Lineup)
            if Player:
                Player.set_child(PunishNode)
        # In-game missed penalties, also matched by player name.
        for miss in GetMissPenaltiesStat(StatTag):
            MissNode = Node(key=ParsingTypes.MissPenalty, data={k: v for k, v in miss.items()})
            Player = MatchNode.search_node(miss['Player'], 'Name', ParsingTypes.Lineup)
            if Player:
                Player.set_child(MissNode)
        # Example of the GetPenalties() payload shape:
        # Penalties = [{'PlayerId': 974, 'Result': 'Гол', 'Score': '1:0'},
        #              {'PlayerId': 74222, 'Result': 'Мимо', 'Score': '1:0'},
        #              {'PlayerId': 823, 'Result': 'Гол', 'Score': '1:1'},
        #              {'PlayerId': 209, 'Result': 'Гол', 'Score': '1:2'}]
        # Shoot-out penalties are matched by player Id, not name.
        for penalty in GetPenalties(StatTag):
            Penalty = Node(key=ParsingTypes.Penalty, data={k: v for k, v in penalty.items()})
            Lineup = MatchNode.search_node(penalty['PlayerId'], 'Id', ParsingTypes.Lineup)
            if Lineup:
                Lineup.set_child(Penalty)
        # Score helpers return [home, guest] pairs — assumed from the
        # indexing below; confirm against the helpers' definitions.
        MainScore: list = GetMainScore(Soup.find('div', 'match-info__scoreboard'))
        PenaltyScore: list = GetPenaltyScore(Soup.find('div', 'match-info__count-extra'))
        TechScore: list = GetTechnicalScore(Soup.find('div', 'match-info__count-extra'))
        MatchNode.data = {
            'Url': Url,
            'Stadium': Stadium,
            'Referee': Referee,
            'Date': GetMatchDate(Soup.find('div', 'match-info__date')),
            'Stage': GetMatchStage(Soup.find('div', 'match-info__stage')),
            'IsExtra': IsExtraTime(Soup.find('div', 'match-info__count-extra')),
            'HomeScore': MainScore[0],
            'GuestScore': MainScore[1],
            'HomePenaltyScore': PenaltyScore[0],
            'GuestPenaltyScore': PenaltyScore[1],
            'HomeTechScore': TechScore[0],
            'GuestTechScore': TechScore[1]}
        Root.set_child(MatchNode)
        return Root
    except Exception as Err:
        self.log.exception(f"Error parsing match {Url}: {Err}")