def get_player_team_info(page):
    """Collect club and national-team details from a player page.

    The club section is looked up with the ``</figure>`` end token first.
    If that lookup fails, it is retried with the
    ``<div class="operation spacing">`` end token and the player is treated
    as having no national-team section.

    Args:
        page: Raw page text handed to ``parser.retrieve_in_tags``.

    Returns:
        dict: The entries produced by ``_principal_team_info`` merged with
        the ``Nat. Team``, ``Jersey Nat.``, ``Nat. Position`` and
        ``Nat. Team Skill`` entries. The latter are all ``None`` when only
        the fallback lookup succeeded.
    """
    start_token = re.compile(r'a href="\/team\/[\d]+\/[\w-]+\/"')

    has_national_team = True
    try:
        club_section = parser.retrieve_in_tags(start_token, "</figure>",
                                               page)[0]
    except Exception:
        # This used to be a bare ``except:``, which also trapped
        # KeyboardInterrupt and SystemExit. Any ordinary lookup failure
        # still triggers the fallback below.
        club_section = parser.retrieve_in_tags(
            start_token, '<div class="operation spacing">', page)[0]
        has_national_team = False

    team_info = _principal_team_info(club_section)

    national_info = {
        "Nat. Team": None,
        "Jersey Nat.": None,
        "Nat. Position": None,
        "Nat. Team Skill": None,
    }
    if has_national_team:
        # The national-team block is the second match for the same start
        # token, hence index 1.
        national_section = parser.retrieve_in_tags(
            start_token, "/li></ul></div></div>", page)[1]
        national_info = _national_team_info(national_section)

    return {**team_info, **national_info}
def get_players(team_name, team_id, season):
    """Map player ids to player names for one team and season.

    Args:
        team_name: Team name used to assemble the detailed team link.
        team_id: Team identifier used to assemble the link.
        season: Season used to assemble the link.

    Returns:
        dict: ``{player_id: player_name}`` for every linked table cell
        whose name lookup did not return ``None``.
    """
    squad_url = parser.team_detailed_link_assemble(team_name, team_id, season)
    raw_page = crawler.get_page(squad_url)
    squad_section = parser.cut_page(
        '<a name="zugaenge" class="anchor">',
        '<div class="werbung werbung-fullsize_contentad">',
        raw_page)

    cells = parser.retrieve_in_tags('<td class="hauptlink">', '/a>',
                                    squad_section, False)
    # Only cells that contain a link are kept.
    linked_cells = [cell for cell in cells if 'href' in cell]

    players_info = {}
    for cell in linked_cells:
        player_id = parser.retrieve_in_tags('id="', '"', cell)
        # The name follows the id attribute's closing quote and bracket.
        player_name = parser.retrieve_in_tags(player_id + '">', '<', cell)
        if player_name is None:
            continue
        players_info[player_id] = player_name
    return players_info
def get_team_result(chunck):
    """Turn one unparsed classification-table row into a result record.

    Args:
        chunck: Unparsed text of a single table row.

    Returns:
        dict: ``Club``, ``Club Id``, ``Position``, ``Matches``, ``Win``,
        ``Draw``, ``Lose``, ``Scored Goals``, ``Taken Goals``, ``Balance``
        and ``Points``, in that order.
    """
    club = parser.retrieve_in_tags('alt="', '"', chunck, False)[0]
    club_id = parser.retrieve_in_tags('id="', '"', chunck, False)[0]

    cells = parser.retrieve_in_tags(">", "<", chunck, False)
    # Keep only the cells that start with digits, '-' or ':' and contain
    # no blank.
    figures = [
        cell for cell in cells
        if re.match(r'[\d\-:]+', cell) and cell != '' and ' ' not in cell
    ]

    info = {"Club": club, 'Club Id': club_id}
    info['Position'] = figures[0]
    info['Matches'] = figures[1]
    info['Win'] = figures[2]
    info['Draw'] = figures[3]
    info['Lose'] = figures[4]
    # The sixth figure is split on ':' into goals scored and goals taken.
    goal_parts = figures[5].split(':')
    info['Scored Goals'] = goal_parts[0]
    info['Taken Goals'] = goal_parts[1]
    info['Balance'] = figures[6]
    info['Points'] = figures[7]
    return info
def get_manager_history(manager_name, manager_id):
    """List the team spells found on a manager's detailed page.

    Args:
        manager_name: Name used to assemble the manager link.
        manager_id: Identifier used for the link and stored in every record.

    Returns:
        list[dict] | None: One record per table row, or ``None`` when the
        row lookup itself returns ``None``.
    """
    page_url = parser.manager_detailed_link(manager_name, manager_id)
    html = crawler.get_page(page_url)

    rows = parser.retrieve_in_tags('<td class="zentriert no-border-rechts">',
                                   '</tr>', html, False)
    if rows is None:
        return None

    history = []
    for row in rows:
        team = parser.retrieve_in_tags('alt="', '"', row, False)[0]
        # Repeated ids in the same row collapse into a set.
        ids = set(parser.retrieve_in_tags('id="', '"', row, False))
        cells = parser.parse_in_tags(row, False)
        history.append({
            'Manager Id': manager_id,
            'Team': team,
            'Id': ids,
            'Appointed': cells[1].replace(" ", ''),
            'Contract': cells[2].replace(" ", ''),
            'Position': cells[3],
            '\\# Matches': cells[4],
            'Points Per Match': cells[5],
        })
    return history
def _add_info_parser(start, end, page, tokens):
    """Return the fields of the additional-info section.

    Args:
        start: Start token of the section.
        end: End token of the section.
        page: Text to search.
        tokens: Values that must be left out of the result.

    Returns:
        list: Fragments from the first matching section that contain no
        ``>``, are not in ``tokens`` and do not start with whitespace or
        commas.
    """
    section = parser.retrieve_in_tags(start, end, page)[0]
    fragments = parser.retrieve_in_tags('>', '<', section)
    return [
        text for text in fragments
        if '>' not in text
        and text not in tokens
        and not re.match(r'[\s,]+', text)
    ]
def _get_birth_date(page):
    """Extract and parse a player's birth date.

    Args:
        page: Raw player page text.

    Returns:
        Whatever ``parser.parse_date`` returns for the text found between
        the parentheses that follow the position badge.
    """
    badge_pattern = re.compile(
        r'class="pos pos[\d]*">[A-Z]*</span>[\',"()\d A-z]*</div>')
    open_paren = re.compile(r'\(')
    close_paren = re.compile(r'\)')

    snippet = parser.get_unparsed_text(page, badge_pattern)[0]
    after_badge = parser.retrieve_in_tags("</span>", '</div>', snippet)[0]
    raw_date = parser.retrieve_in_tags(open_paren, close_paren,
                                       after_badge)[0]
    return parser.parse_date(raw_date)
def get_tags(page):
    """Count the topic tags attached to a player.

    Args:
        page: Raw player page text.

    Returns:
        dict: ``{'Tags': <count as str>}``, or ``{'Tags': None}`` when the
        tag lookup returns ``None``.
    """
    tag_block = parser.retrieve_in_tags('<div class="mt-2">', "</div>",
                                        page)[0]
    tags = parser.retrieve_in_tags("#", "<", tag_block)
    return {'Tags': None if tags is None else str(len(tags))}
def _get_birth_place(page):
    """Extract a player's birth place.

    Args:
        page: Raw player page text.

    Returns:
        The ``title`` attribute value of the first flag image snippet
        matched on the page.
    """
    flag_pattern = re.compile(
        r'title="[A-z]*"><img alt="" src=""'
        r' data-src="https://cdn.sofifa.org/flags/[\d]*.png')
    snippet = parser.get_unparsed_text(page, flag_pattern)[0]
    titles = parser.retrieve_in_tags('title="', '"', snippet)
    return titles[0]
def _get_weight(page):
    """Extract a player's weight.

    Args:
        page: Raw player page text.

    Returns:
        The text between the quote-and-blank that closes the height and
        the next ``<`` in the first matched snippet.
    """
    weight_pattern = re.compile(
        r'class="pos pos[\d]*">[A-Z]*</span>'
        r' .*\) [\d]*\'[\d]*\" [\w]*</div>')
    snippet = parser.get_unparsed_text(page, weight_pattern)[0]
    found = parser.retrieve_in_tags('" ', '<', snippet)
    return found[0]
def _get_height(page):
    """Extract a player's height.

    Args:
        page: Raw player page text.

    Returns:
        The text that follows the closing parenthesis in the first matched
        snippet, with the closing double quote appended again.
    """
    height_pattern = re.compile(
        r'class="pos pos[\d]*">[A-Z]*</span>'
        r' .*\) [\d]*\'[\d]*\"')
    snippet = parser.get_unparsed_text(page, height_pattern)[0]
    feet_and_inches = parser.retrieve_in_tags(r'\) ', '"', snippet)[0]
    # The end token consumed the inch mark, so it is added back.
    return feet_and_inches + '"'
def _get_edition_release(page):
    """Extract the release date from the edition tag.

    Args:
        page: Raw player page text.

    Returns:
        Whatever ``parser.parse_date`` returns for the tag text once its
        first two blank-separated words (the FIFA label and the edition)
        are removed.
    """
    tag_text = parser.retrieve_in_tags(
        'class="bp3-tag bp3-minimal bp3-intent-success">', '<', page)[0]
    words = tag_text.split(' ')
    # Drop the "FIFA" label, then the edition.
    for _ in range(2):
        words.pop(0)
    return parser.parse_date(' '.join(words))
def get_defensive_info(page):
    """Return a player's defending skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Defending" heading.
    """
    heading = '<h5 class="bp3-heading">Defending</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def get_teams(league_link):
    """Return the teams listed in a league's club selector.

    Args:
        league_link: Link of the league page to fetch.

    Returns:
        dict: ``{club_id (int): club_name}``; ids and names are paired by
        position.
    """
    html = crawler.get_page(league_link)
    selector = parser.cut_page('id="verein_select_breadcrumb"', "</select>",
                               html)

    raw_ids = parser.retrieve_in_tags('value="', '">', selector)
    raw_names = parser.retrieve_in_tags('>', '<', selector)

    raw_ids = parser.remove_token(raw_ids, ['', ' '])
    # Keep only the entries that start with a digit.
    club_ids = [value for value in raw_ids if re.match(r'\d', value)]
    club_names = parser.remove_token(raw_names, ['\n', 'Club'])

    teams = {}
    for position, name in enumerate(club_names):
        teams[int(club_ids[position])] = name
    return teams
def get_goalkeeping_info(page):
    """Return a player's goalkeeping skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Goalkeeping" heading.
    """
    heading = '<h5 class="bp3-heading">Goalkeeping</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def get_attacking_info(page):
    """Return a player's attacking skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Attacking" heading.
    """
    heading = '<h5 class="bp3-heading">Attacking</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def get_skill_info(page):
    """Return a player's entries from the "Skill" section.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Skill" heading.
    """
    heading = '<h5 class="bp3-heading">Skill</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def get_movement_info(page):
    """Return a player's movement skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Movement" heading.
    """
    heading = '<h5 class="bp3-heading">Movement</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def get_power_info(page):
    """Return a player's power skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Power" heading.
    """
    heading = '<h5 class="bp3-heading">Power</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def _principal_team_info(page):
    """Return the basic information about a player's team.

    Args:
        page: Text of the team section.

    Returns:
        The result of ``_set_none`` applied to a dict holding ``Team``,
        ``Team Position``, ``Team Skill``, ``Jersey``, ``Joined`` (passed
        through ``parser.parse_date``) and ``Contract``.
    """
    position_tag = re.compile(r'class="pos pos[\d]*">')
    skill_tag = re.compile(r'span class="bp3-tag p p[\d]*">')

    def text_after(start):
        # Every field ends at the next opening bracket.
        return parser.retrieve_in_tags(start, '<', page)

    info = {
        'Team': parser.retrieve_in_tags(">", "<", page),
        'Team Position': text_after(position_tag),
        'Team Skill': text_after(skill_tag),
        'Jersey': parser.retrieve_in_tags("Jersey Number</label>", "<", page),
        'Joined': parser.parse_date(text_after("Joined</label>")),
        'Contract': text_after("Contract Valid Until</label>"),
    }
    return _set_none(info)
def get_player_transfer(player_page, player_id):
    """List the transfers recorded in a player's transfer history.

    Args:
        player_page: Raw player page text.
        player_id: Identifier stored in every transfer record.

    Returns:
        list[dict] | None: One record per transfer row, or ``None`` when
        the row lookup returns ``None``.
    """
    history = parser.cut_page('<div class="box transferhistorie">',
                              "</tfoot>", player_page)
    rows = parser.retrieve_in_tags('<tr class="zeile-transfer">', '</tr>',
                                   history, False)
    if rows is None:
        return rows

    transfers = []
    for row in rows:
        season = parser.retrieve_in_tags('class="zentriert hide-for-small"',
                                         '</td>', row)[0]
        fee = parser.retrieve_in_tags('zelle-abloese', '<', row)
        market_value = parser.retrieve_in_tags('zelle-mw', '<', row)
        club_names = parser.retrieve_in_tags('vereinsname', '</a>', row)
        # De-duplicate the ids while keeping their first-seen order.
        club_ids = list(
            OrderedDict.fromkeys(parser.retrieve_in_tags('id="', '"', row)))
        transfers.append({
            'Player Id': player_id,
            'Season': season,
            'Fee': fee,
            'Market Value': market_value,
            # Entries 1 and 3 of the name list are used for the two teams.
            'Team A': club_names[1],
            'Team B': club_names[3],
            'ID Team A': club_ids[0],
            'ID Team B': club_ids[1],
        })
    return transfers
def _national_team_info(page):
    """Return the information about a player's national team.

    Args:
        page: Text of the national-team section.

    Returns:
        dict: ``Nat. Team``, ``Jersey Nat.``, ``Nat. Position`` and
        ``Nat. Team Skill``, each taken from the first match of its lookup.
    """
    position_tag = re.compile(r'class="pos pos[\d]*">')
    skill_tag = re.compile(r'span class="bp3-tag p p[\d]*">')
    return {
        'Nat. Team': parser.retrieve_in_tags(">", "<", page)[0],
        'Jersey Nat.': parser.retrieve_in_tags("Jersey Number</label>", "<",
                                               page)[0],
        'Nat. Position': parser.retrieve_in_tags(position_tag, '<', page)[0],
        'Nat. Team Skill': parser.retrieve_in_tags(skill_tag, '<', page)[0],
    }
def get_mentality_info(page):
    """Return a player's mentality skills.

    Args:
        page: Raw player page text.

    Returns:
        dict: Result of ``_parse_skills`` for the first section that
        follows the "Mentality" heading.
    """
    heading = '<h5 class="bp3-heading">Mentality</h5>'
    sections = parser.retrieve_in_tags(heading, '</div>', page)
    return _parse_skills(sections[0])
def _get_position(page):
    """Return every position listed for a player as one string.

    Args:
        page: Raw player page text.

    Returns:
        str: The position labels joined with single spaces; an empty
        string when no position snippet is found.
    """
    token_re = re.compile(r'class="pos pos[\d]*">[A-Z]*</span></li>')
    snippets = parser.get_unparsed_text(page, token_re)

    labels = []
    for snippet in snippets:
        found = parser.retrieve_in_tags('>', '<', snippet)
        if found is None:
            continue
        # NOTE(review): the lookup appears to yield either one string or a
        # list of strings depending on the parser; both are accepted here.
        # Confirm against parser.retrieve_in_tags.
        if isinstance(found, str):
            labels.append(found)
        else:
            labels.extend(found)

    # The previous code joined only the result of the last lookup. That
    # dropped earlier positions, split a plain string into characters and
    # raised NameError when nothing matched.
    return ' '.join(labels)
def get_team_info(team_name, team_id, season):
    """Collect the information about a team for one season.

    Args:
        team_name: Team name used for the links and stored as ``Name``.
        team_id: Team identifier used for the links, stored as ``Squad Id``.
        season: Season used for the team link and stored as ``Season``.

    Returns:
        dict: ``Name``, ``Squad Id``, ``Season``, ``Manager``,
        ``Manager Id``, ``Income``, ``Expend.`` and ``Titles`` (a
        comma-joined string, or ``None`` when no title matches the season).
    """
    team_page = crawler.get_page(
        parser.team_link_assemble(team_name, team_id, season))

    team_info = {"Name": team_name, "Squad Id": team_id, "Season": season}

    team_info["Manager"] = parser.retrieve_in_tags(
        'class="container-hauptinfo">', "</a>", team_page)
    team_info["Manager Id"] = parser.retrieve_in_tags(
        "profil/trainer/", '">', team_page)
    # A lone string result is wrapped so both fields hold lists.
    for field in ('Manager', 'Manager Id'):
        value = team_info[field]
        if isinstance(value, str):
            team_info[field] = [value]

    blanks = ['\t', '\n']
    income = parser.retrieve_in_tags('class="greentext rechts">', "</td>",
                                     team_page)
    team_info['Income'] = parser.remove_tokens(income, blanks)
    expenses = parser.retrieve_in_tags('class="redtext rechts">', "</td>",
                                       team_page)[0]
    team_info['Expend.'] = parser.remove_tokens(expenses, ['\t', '\n'])

    parsed_season = parser.parse_season(season)
    titles_page = crawler.get_page(
        parser.titles_link_assemble(team_name, team_id))
    title_blocks = parser.retrieve_in_tags("<h2", "<h2>", titles_page, False)

    # Keep the headings of the blocks that mention the requested season.
    headings = [
        parser.retrieve_in_tags(">", "</h2>", block)
        for block in title_blocks
        if parsed_season in block
    ]
    # Strip the "<count>x " prefix from each heading.
    season_titles = [re.sub(r'[\d]+x ', '', heading) for heading in headings]

    team_info['Titles'] = ','.join(season_titles) if season_titles else None
    return team_info
def get_results(league_link, season):
    """Return every row of a league's classification table.

    Args:
        league_link: League link used to assemble the results link.
        season: Season used to assemble the results link.

    Returns:
        list[dict]: One ``get_team_result`` record per table row; the
        first ``<tr>`` row is skipped.
    """
    results_url = parser.league_result_assemble(league_link, season)
    html = crawler.get_page(results_url)
    table = parser.cut_page('<div class="responsive-table">', '</table>',
                            html)
    rows = parser.retrieve_in_tags("<tr>", "</tr>", table, parse=False)
    return [get_team_result(row) for row in rows[1:]]
def _parse_skills(page):
    """Parse a skills section into a ``{skill name: value}`` dict.

    The kept fragments are read in pairs: the first of each pair is the
    value and the second is the skill name. A single leading blank on the
    name is dropped.

    Args:
        page: Text of one skills section.

    Returns:
        dict: Skill names mapped to their values.
    """
    fragments = parser.retrieve_in_tags('>', '<', page)
    kept = [
        text for text in fragments
        if text != " " and (re.match(r'[\d]+', text)
                            or re.match(r'[A-z ]+', text))
    ]

    info = {}
    # A trailing fragment without a partner is ignored.
    for start in range(0, len(kept) - 1, 2):
        value, label = kept[start], kept[start + 1]
        name = label[1:] if label.startswith(' ') else label
        info[name] = value
    return info
def get_manager_info(manager_name, manager_id):
    """Collect the profile and the history of a manager.

    Args:
        manager_name: Name used for the link; stored with ``-`` replaced
            by blanks.
        manager_id: Identifier used for the link and stored as ``Id``.

    Returns:
        dict: ``Name``, ``Id``, ``Birth Date``, ``Birth Place``,
        ``Nationality``, ``Avg. term``, ``Coaching License``,
        ``Preferred Formation`` and ``History`` (from
        ``get_manager_history``).
    """
    profile_url = parser.manager_link_assemble(manager_name, manager_id)
    profile = crawler.get_page(profile_url)

    def cell_after(label):
        # Table-cell fields end at the closing </td>.
        return parser.retrieve_in_tags(label, "</td>", profile)

    def span_after(label):
        # itemprop fields end at the closing </span>.
        return parser.retrieve_in_tags(label, "</span>", profile)

    return {
        'Name': manager_name.replace('-', ' '),
        'Id': manager_id,
        'Birth Date': cell_after("Date of Birth:"),
        'Birth Place': span_after('itemprop="birthPlace">'),
        'Nationality': span_after('itemprop="nationality">'),
        'Avg. term': cell_after("Avg. term as manager:"),
        'Coaching License': cell_after("Coaching Licence:"),
        'Preferred Formation': cell_after("Preferred Formation" + ':'),
        'History': get_manager_history(manager_name, manager_id),
    }
def player_matches(player_page):
    """Build one record for every match a player took part in.

    Rows that contain the "On matchday squad, but did not play" marker are
    left out of the result.

    Args:
        player_page: Iterable of text sections; the match rows are
            extracted from each one.

    Returns:
        list[dict]: One record per played match. Any value that still
        contains ``<`` is replaced by ``0``.
    """
    row_start = '<tr ><th scope="row"'
    row_end = '</td></tr>'
    rows_per_section = [
        parser.retrieve_in_tags(row_start, row_end, section)
        for section in player_page
    ]
    # Flatten the per-section results into a single list of rows.
    rows = []
    for section_rows in rows_per_section:
        rows.extend(section_rows)

    did_not_play = 'On matchday squad, but did not play'
    # Record key -> start pattern; every value ends at the next '<'.
    stat_patterns = (
        ('Start', r'data-stat="game_started".*?>'),
        ('Min. Played', r'data-stat="minutes".*?>'),
        ('Goals', r'data-stat="goals".*?>'),
        ('Assist.', r'data-stat="assists".*?>'),
        ('Shots', r'data-stat="shots_total".*?>'),
        ('Sh. On Target.', r'data-stat="shots_on_target".*?>'),
        ('Crosses', r'data-stat="crosses".*?>'),
        ('Fouls Drawn', r'data-stat="fouled".*?>'),
        ('Pen. Kicks', r'data-stat="pens_made".*?>'),
        ('PK attempt', r'data-stat="pens_att".*?>'),
        ('Tackles Won', r'data-stat="tackles_won".*?>'),
        ('Interceptions', r'data-stat="interceptions".*?>'),
        ('Fouls Commited', r'data-stat="fouls".*?>'),
        ('Yellow C.', r'data-stat="cards_yellow".*?>'),
        ('Red C.', r'data-stat="cards_red".*?>'),
    )

    matches = []
    for row in rows:
        record = {
            'Date': parser.retrieve_in_tags(r'href="/en/matches/.*?">', '<',
                                            row)[0],
            'Day': parser.retrieve_in_tags(r'data-stat="dayofweek".*?>', '<',
                                           row)[0],
        }

        # One link means only the round is present; otherwise the first
        # link is the competition and the second the round.
        comp_links = parser.retrieve_in_tags(r'href="/en/comps/.*?>', '<',
                                             row)
        if len(comp_links) == 1:
            record['Comp.'], record['Round'] = None, comp_links[0]
        else:
            record['Comp.'], record['Round'] = comp_links[0], comp_links[1]

        if did_not_play in row:
            continue

        outcome = parser.retrieve_in_tags(r'data-stat="result".*?>', '<', row)
        # Normalise the en dash in the score to a plain hyphen.
        record['Result'] = outcome[0].replace('–', '-')

        squads = parser.retrieve_in_tags(r'href="/en/squads/.*?>', '<', row)
        record['Squad'] = squads[0]
        record['Oponent'] = squads[1]

        for label, pattern in stat_patterns:
            record[label] = parser.retrieve_in_tags(pattern, '<', row)[0]

        for label, value in record.items():
            if value is not None and '<' in value:
                record[label] = 0

        matches.append(record)
    return matches
def get_player(player_id, player_name, season, header=False):
    """Scrape one player's profile and match log and write the record out.

    The match-log link is tried first. If the returned page has no
    ``<tbody>``, the plain log link is used instead.

    Args:
        player_id: Player identifier used for the links, stored as ``Id``.
        player_name: Player name used for the links, stored as ``Name``.
        season: Season of interest.
        header: Forwarded unchanged to ``parser.write_file``.

    Returns:
        None. The assembled record is handed to ``parser.write_file``.
    """
    print("Evaluating ", player_name, "in season:", season)

    link = parser.match_logs_link(player_id, parser.soccer_season(season),
                                  player_name)
    player_page = parser.get_page(link)
    # Fall back when the first link did not return a page with a table.
    if "<tbody>" not in player_page:
        link = parser.logs_link(player_id, str(season), player_name)
        player_page = parser.get_page(link)

    player_info = {'Name': player_name, 'Id': player_id}

    position = parser.retrieve_in_tags("Position:</strong>", '<', player_page,
                                       parse=True)
    player_info['Position'] = parse_position(position).replace('\n', '')

    player_info['Foot'] = parser.retrieve_in_tags('Footed:</strong>', '<',
                                                  player_page, parse=True)
    player_info['Height'] = parser.retrieve_in_tags('itemprop="height">', '<',
                                                    player_page, parse=True)
    player_info['Weight'] = parser.retrieve_in_tags('itemprop="weight">', '<',
                                                    player_page, parse=True)
    player_info['Season'] = parser.soccer_season(season)
    player_info['Birth Date'] = parser.retrieve_in_tags(
        'data-birth="', '">', player_page, parse=True)

    birth_place = parser.retrieve_in_tags('itemprop="birthPlace">', '</span>',
                                          player_page, parse=True)
    birth_place = parser.remove_tokens(birth_place, [' ', '\n'])
    # Only the leading "in" is dropped. The previous blanket
    # replace('in', '') also removed "in" inside place names.
    if birth_place.lstrip().startswith('in'):
        birth_place = birth_place.replace('in', '', 1)
    player_info['Birth Place'] = birth_place

    national_team = parser.retrieve_in_tags('National Team:</strong>',
                                            '<span', player_page, parse=True)
    national_team = national_team.replace(' ', '')
    # endswith() is used instead of indexing [-1] so an empty value does
    # not raise IndexError.
    if national_team.endswith(' '):
        national_team = national_team[:-1]
    player_info['National Team'] = national_team

    player_info = replace_none(player_info)

    match_tables = parser.retrieve_in_tags("<tbody>", "</tbody>", player_page)
    player_info['Matches'] = player_matches(match_tables)
    parser.write_file(player_info, header)
def get_basic_info(page):
    """Collect a player's basic information.

    Args:
        page: Raw player page text.

    Returns:
        The result of ``_set_none`` applied to a dict with the keys
        ``Complete Name``, ``Release Date``, ``Position``, ``Birth Date``,
        ``Birth Place``, ``Height``, ``Weight``, ``Value``, ``Wage``,
        ``Foot``, ``Intern. Rep.``, ``Weak Foot``, ``Skill Moves``,
        ``Work Rate``, ``Release Clause``, ``Overall`` and ``Potential``.
    """
    info = {
        'Complete Name': _get_complete_name(page),
        'Release Date': _get_edition_release(page),
        'Position': _get_position(page),
        'Birth Date': _get_birth_date(page),
        'Birth Place': _get_birth_place(page),
        'Height': _get_height(page),
        # The unit is dropped from the weight.
        'Weight': _get_weight(page).replace('lbs', ''),
    }

    # Fields read as "text between a start token and the next '<'".
    labelled_fields = (
        ('Value', r'Value [\n\t]*<span>'),
        ('Wage', r'Wage [\n\t]*<span>'),
        ('Foot', "Preferred Foot</label>"),
        ('Intern. Rep.', "International Reputation</label>"),
        ('Weak Foot', 'Weak Foot</label>'),
        ('Skill Moves', "Skill Moves</label>"),
        ('Work Rate', "Work Rate</label><span>"),
        ('Release Clause', "Release Clause</label><span>"),
    )
    for key, start in labelled_fields:
        info[key] = parser.retrieve_in_tags(start, '<', page)

    # Ratings are located by pattern first, then read from that snippet.
    rating_fields = (
        ('Overall',
         r'class="bp3-tag p p[\d]+">[\d]+</span>.* Overall Rating'),
        ('Potential',
         r'class="bp3-tag p p[\d]+">[\d]+</span>.* Potential '),
    )
    for key, pattern in rating_fields:
        snippet = parser.get_unparsed_text(page, re.compile(pattern))[0]
        info[key] = parser.retrieve_in_tags('>', '<', snippet)

    return _set_none(info)