def get_team_info(team_name, team_id, season):
    """ Get a team's info.

    Returns a dict with all team info.
    """
    link = parser.team_link_assemble(team_name, team_id, season)
    team_page = crawler.get_page(link)

    team_info = {}
    team_info["Name"] = team_name
    team_info["Squad Id"] = team_id
    team_info["Season"] = season

    token = 'class="container-hauptinfo">'
    team_info["Manager"] = parser.retrieve_in_tags(token, "</a>", team_page)
    team_info["Manager Id"] = parser.retrieve_in_tags("profil/trainer/", '">',
                                                      team_page)
    # Normalise single values to one-element lists for consistency.
    for key in ['Manager', 'Manager Id']:
        if isinstance(team_info[key], str):
            team_info[key] = [team_info[key]]

    team_info["Income"] = parser.retrieve_in_tags('class="greentext rechts">',
                                                  "</td>", team_page)
    team_info['Income'] = parser.remove_tokens(team_info['Income'],
                                               ['\t', '\n'])
    team_info["Expend."] = parser.retrieve_in_tags('class="redtext rechts">',
                                                   "</td>", team_page)[0]
    team_info['Expend.'] = parser.remove_tokens(team_info['Expend.'],
                                                ['\t', '\n'])

    parsed_season = parser.parse_season(season)
    titles_link = parser.titles_link_assemble(team_name, team_id)
    titles_page = crawler.get_page(titles_link)
    titles = parser.retrieve_in_tags("<h2", "<h2>", titles_page, False)

    season_titles = []
    for title in titles:
        if parsed_season in title:
            season_titles.append(parser.retrieve_in_tags(">", "</h2>", title))
    # Strip the "Nx " prefix, e.g. "3x German Champion" -> "German Champion".
    season_titles = list(map(lambda x: re.sub(r'[\d]+x ', '', x),
                             season_titles))

    if not season_titles:
        team_info['Titles'] = None
    else:
        team_info['Titles'] = ','.join(season_titles)

    return team_info


def get_player_info(player_name, player_id):
    """ Get the information about a player. """
    link = parser.player_link_assemble(player_name, player_id)
    player_page = crawler.get_page(link)
    if not player_page:
        # Retry, letting the crawler handle the error page.
        player_page = crawler.get_page(link, error=True)

    player_info = {}
    player_info['Transfers'] = get_player_transfer(player_page, player_id)
    player_info['Name'] = player_name.replace('-', ' ').capitalize()
    player_info['Id'] = player_id
    player_info['Full Name'] = parser.retrieve_in_tags("Full Name:</th>",
                                                       "</td>", player_page)
    player_info['Birth Date'] = parser.retrieve_in_tags(
        "Date of Birth:", "</a>", player_page)

    span = '</span>'
    player_info['Birth Place'] = parser.retrieve_in_tags(
        '"birthPlace">', span, player_page)
    token = 'itemprop="nationality">'
    player_info['Nationality'] = parser.retrieve_in_tags(
        token, span, player_page)
    player_info['Age'] = parser.retrieve_in_tags("Age:</th>", "</td>",
                                                 player_page)
    player_info['Height'] = parser.retrieve_in_tags('itemprop="height"', span,
                                                    player_page)
    player_info['Position'] = parser.retrieve_in_tags("Position:</span>",
                                                      "</p>", player_page)
    player_info['Foot'] = parser.retrieve_in_tags("Foot:", "</td>",
                                                  player_page)
    player_info['Agent'] = parser.retrieve_in_tags("Player Agents:", "</a>",
                                                   player_page)
    player_info['Joined'] = parser.retrieve_in_tags("Joined:</span>", span,
                                                    player_page)
    token = "Contract until:</span>"
    player_info['Contract Length'] = parser.retrieve_in_tags(
        token, span, player_page)
    player_info['Outfitter'] = parser.retrieve_in_tags("Outfitter:", "</td>",
                                                       player_page)

    return player_info


def get_manager_history(manager_name, manager_id):
    ''' Get all teams that a manager has worked for. '''
    link = parser.manager_detailed_link(manager_name, manager_id)
    manager_page = crawler.get_page(link)

    begin_token = '<td class="zentriert no-border-rechts">'
    end_token = '</tr>'
    stories = parser.retrieve_in_tags(begin_token, end_token, manager_page,
                                      False)
    if stories is None:
        return None

    history = []
    for story in stories:
        info = {}
        info['Manager Id'] = manager_id
        info['Team'] = parser.retrieve_in_tags('alt="', '"', story, False)[0]
        info['Id'] = set(parser.retrieve_in_tags('id="', '"', story, False))
        tokens_tag = parser.parse_in_tags(story, False)
        info['Appointed'] = tokens_tag[1].replace(" ", '')
        info['Contract'] = tokens_tag[2].replace(" ", '')
        info['Position'] = tokens_tag[3]
        info['\\# Matches'] = tokens_tag[4]
        info['Points Per Match'] = tokens_tag[5]
        history.append(info)

    return history


def get_players(team_name, team_id, season):
    """ Get the players from a team.

    Return a dict mapping player IDs to player names.
    """
    link = parser.team_detailed_link_assemble(team_name, team_id, season)
    players_page = crawler.get_page(link)

    begin_token = '<a name="zugaenge" class="anchor">'
    end_token = '<div class="werbung werbung-fullsize_contentad">'
    page = parser.cut_page(begin_token, end_token, players_page)

    begin_token = '<td class="hauptlink">'
    pages = parser.retrieve_in_tags(begin_token, '/a>', page, False)
    # Keep only the chunks that actually contain a link (href).
    pages = list(filter(lambda x: 'href' in x, pages))

    players_info = {}
    for page in pages:
        player_id = parser.retrieve_in_tags('id="', '"', page)
        player_name = parser.retrieve_in_tags(player_id + '">', '<', page)
        if player_name is not None:
            players_info[player_id] = player_name

    return players_info


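# A minimal usage sketch (not part of the original module): fetch a squad and
# then each player's profile. The team slug, squad id, and season below are
# made-up placeholders, and the names returned by get_players() may still
# need to be slugified to match the URL format expected by
# parser.player_link_assemble().
def example_dump_squad():
    players = get_players('example-fc', 9999, 2019)  # hypothetical arguments
    return [
        get_player_info(player_name, player_id)
        for player_id, player_name in players.items()
    ]

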
def get_photo_link_for_article(url):
    url += '/foto'
    content = get_page(url)
    # Now try to find the photo if it exists.
    img = re.compile(r'<img alt="" src="(.*)\?width=400"/>')
    m = img.findall(content)
    if len(m) > 0:
        return m[0]
    return None


def get_photo_link_for_article(url):
    url += '/foto'
    content = get_page(url)
    # Now try to find the photo if it exists.
    img = re.compile(r'<img src="(.*)\?width=605" alt=""/>')
    m = img.findall(content)
    if len(m) > 0:
        return m[0]
    return None


def crawler_thread(page_queue, movie_id_queue):
    while not page_queue.empty():
        page_number = page_queue.get()
        page_queue.task_done()
        html = get_page(page_number)
        if html is None:
            print("Failed to request page: " + str(page_number))
            continue
        movies = process_page(html)
        for movie in movies:
            movie_id_queue.put(movie)


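# A sketch of how crawler_thread could be driven (the names below are
# illustrative and not part of the original code): fill a queue with page
# numbers, start a few worker threads, and drain the movie ids they produce.
import queue
import threading


def run_crawler(num_pages, num_workers=4):
    page_queue = queue.Queue()
    movie_id_queue = queue.Queue()
    for page_number in range(1, num_pages + 1):
        page_queue.put(page_number)

    workers = [
        threading.Thread(target=crawler_thread,
                         args=(page_queue, movie_id_queue))
        for _ in range(num_workers)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    movie_ids = []
    while not movie_id_queue.empty():
        movie_ids.append(movie_id_queue.get())
    return movie_ids

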
def get_results(league_link, season):
    """ Get all results from the classification table of a league. """
    league_link = parser.league_result_assemble(league_link, season)
    league_page = crawler.get_page(league_link)
    league_page = parser.cut_page('<div class="responsive-table">',
                                  '</table>', league_page)
    # Skip the header row; every remaining chunk is one team's row.
    chunks = parser.retrieve_in_tags("<tr>", "</tr>", league_page,
                                     parse=False)[1:]
    info = list(map(get_team_result, chunks))
    return info


def get_teams(league_link):
    """ Return all the teams of a given league. """
    league_page = crawler.get_page(league_link)
    league_page = parser.cut_page('id="verein_select_breadcrumb"',
                                  "</select>", league_page)

    clubs_id = parser.retrieve_in_tags('value="', '">', league_page)
    clubs_name = parser.retrieve_in_tags('>', '<', league_page)

    clubs_id = parser.remove_token(clubs_id, ['', ' '])
    # Keep only the entries that are digits (the club ids).
    clubs_id = list(filter(lambda x: re.match(r'\d', x), clubs_id))
    clubs_name = parser.remove_token(clubs_name, ['\n', 'Club'])

    return {
        int(clubs_id[index]): name
        for index, name in enumerate(clubs_name)
    }


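# A sketch that chains get_teams() with get_team_info() for one season.
# The league link and season are placeholder arguments, and the club names
# returned by get_teams() may need to be slugified before being passed on to
# get_team_info(), which feeds them into parser.team_link_assemble().
def example_league_overview(league_link, season):
    teams = get_teams(league_link)
    return [
        get_team_info(team_name, team_id, season)
        for team_id, team_name in teams.items()
    ]

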
def get_manager_info(manager_name, manager_id):
    """ Get a manager's info. """
    link = parser.manager_link_assemble(manager_name, manager_id)
    manager_page = crawler.get_page(link)

    manager_info = {}
    manager_info['Name'] = manager_name.replace('-', ' ')
    manager_info['Id'] = manager_id

    token = "Date of Birth:"
    manager_info['Birth Date'] = parser.retrieve_in_tags(
        token, "</td>", manager_page)
    token = 'itemprop="birthPlace">'
    manager_info['Birth Place'] = parser.retrieve_in_tags(
        token, "</span>", manager_page)
    token = 'itemprop="nationality">'
    manager_info['Nationality'] = parser.retrieve_in_tags(
        token, "</span>", manager_page)
    token = "Avg. term as manager:"
    manager_info['Avg. term'] = parser.retrieve_in_tags(
        token, "</td>", manager_page)
    token = "Coaching Licence:"
    manager_info['Coaching License'] = parser.retrieve_in_tags(
        token, "</td>", manager_page)
    token = "Preferred Formation"
    manager_info[token] = parser.retrieve_in_tags(token + ':', "</td>",
                                                  manager_page)

    manager_info['History'] = get_manager_history(manager_name, manager_id)

    return manager_info