def create_proxied_browser_instance(proxy=None, use_proxy=False, headless=False, use_data_dir=False) -> webdriver.Chrome:
    """Build and return a configured Chrome webdriver instance.

    Args:
        proxy: proxy server address (e.g. "host:port"); used only when use_proxy is True.
        use_proxy: when True and a proxy is given, route browser traffic through it.
        headless: run Chrome without a UI and skip image loading.
        use_data_dir: reuse the persistent browser profile stored under BROWSER_DATA.

    Returns:
        A ready-to-use webdriver.Chrome instance.

    Raises:
        Exception: when no chromedriver executable can be located.
    """
    chrome_options = webdriver.ChromeOptions()
    capabilities = webdriver.DesiredCapabilities.CHROME
    prefs = {'disk-cache-size': 4096}
    if headless:
        chrome_options.headless = True
        # Images are useless without a UI; disabling them speeds up page loads.
        prefs["profile.managed_default_content_settings.images"] = 2
    chrome_options.add_experimental_option('prefs', prefs)
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("log-level=3")
    chrome_options.add_argument("ignore-certificate-errors")
    # Fix: proxy/use_proxy were previously accepted but never used.
    if use_proxy and proxy:
        chrome_options.add_argument("--proxy-server={0}".format(proxy))
    if use_data_dir:
        chrome_options.add_argument('user-data-dir={D}'.format(D=BROWSER_DATA))
    # NOTE(review): helper is named create_firefox_extension but is loaded into
    # Chrome here — presumably it builds a cross-browser extension; confirm.
    try:
        extension = create_firefox_extension()
    except Exception as a:
        print(str(a))
        extension = None
    if extension:
        chrome_options.add_extension(extension)
    driver_path = get_chrome_driver()
    if not driver_path:
        raise Exception("Chrome driver not found")
    driver: webdriver.Chrome = webdriver.Chrome(
        executable_path=driver_path,
        options=chrome_options,
        desired_capabilities=capabilities
    )
    return driver
def scrap_player_name_flashscore(flash_id, flash_url):
    """Scrape a player's display name from their flashscore.com profile page.

    Args:
        flash_id: flashscore player identifier.
        flash_url: flashscore URL slug for the player.

    Returns:
        The player name shown in the page's team header.
    """
    driver = get_chrome_driver()
    match_url = "https://www.flashscore.com/player/{0}/{1}/".format(
        flash_url, flash_id)
    try:
        driver.get(match_url)
        time.sleep(1)  # let the dynamic page render before querying the DOM
        player_name = driver.find_element_by_class_name("teamHeader__name").text
    finally:
        # Fix: quit the browser even when scraping raises, so it never leaks.
        driver.quit()
    return player_name
def scrap_matches_at_date(matches_date):
    """Scrape every tennis match listed on flashscore.com for a given date.

    Walks the rows of the daily schedule: header rows set the current
    tournament, the other rows are matches processed under that tournament.
    Matches appearing before any recognized tournament header are skipped.

    Args:
        matches_date: date whose schedule should be scraped.
    """
    driver = get_chrome_driver()
    try:
        match_url = "https://www.flashscore.com/tennis"
        driver.get(match_url)
        navigate_to_date(driver, matches_date)
        tournament = None
        elements = driver.find_elements_by_xpath("//div[@class='sportName tennis']/div")
        for elem in elements:
            if element_has_class(elem, "event__header"):
                # Tournament header: matches below belong to this tournament.
                tournament = get_tournament_from_row(driver, elem, matches_date)
            else:
                # Match row.
                if tournament is None:
                    # Match is not to be retrieved.
                    continue
                process_match_row(elem, matches_date)
    finally:
        # Fix: ensure the browser is closed even if scraping fails midway.
        driver.quit()
def scrap_player_id(player_name):
    """Look up a player's canonical name and id via atptour.com's search API.

    When the full name (more than two words) yields no result, retries once
    with only the first and last word. When nothing matches exactly, falls
    back to the first suggestion; when nothing is found at all, logs and
    returns (None, None).

    Args:
        player_name: player name to search for.

    Returns:
        Tuple (atptour_name, atptour_id); both None when the player is unknown.
    """
    atptour_name = atptour_id = None
    driver = get_chrome_driver()
    try:
        match_url = ('https://www.atptour.com/en/-/ajax/playersearch/'
                     'PlayerUrlSearch?searchTerm={}'.format(player_name))
        driver.get(match_url)
        time.sleep(1)  # give the endpoint time to respond
        html = driver.find_element_by_tag_name("pre").get_attribute('innerHTML')
        json_obj = json.loads(html)
        elements = json_obj["items"]
        player_element = None
        if len(elements) == 0:
            names = player_name.split()
            if len(names) > 2:
                # Middle names often break the search: retry with first + last only.
                minimized_name = names[0] + " " + names[-1]
                return scrap_player_id(minimized_name)
            msg = "'{0}' not found on atptour website".format(player_name)
            log_to_file(msg, PLAYER_LOGS)
            log("players", msg)
        else:
            for element in elements:
                if element["Key"].lower() == player_name.lower():
                    player_element = element
                    break
            if player_element is None:
                # No exact match: fall back to the first suggestion.
                player_element = elements[0]
            atptour_name = player_element["Key"]
            href = player_element["Value"]
            href_regex = re.search(".+/(.*)/overview$", href)
            atptour_id = href_regex.group(1)
    finally:
        # Fix: quit the browser on every path (including the recursive retry
        # and any exception), so instances never leak.
        driver.quit()
    return atptour_name, atptour_id
def scrap_match_flashscore(match_id, status):
    """Scrape one match page from flashscore.com into a pandas Series.

    Collects tournament, round, players, datetime, outcome, per-set game
    counts, tie-break scores and the serve/return statistics table.

    Args:
        match_id: flashscore match identifier (appended to the match URL).
        status: a MatchStatus enum member describing the match state.

    Returns:
        A pd.Series with all scraped fields, or None when any step fails
        (the failure is logged to MATCHES_ERROR_LOGS).
    """
    match = pd.Series([match_id], index=["match_id"])
    driver = get_chrome_driver()
    try:
        match["match_id"] = match_id
        match_url = "https://www.flashscore.com/match/" + match_id
        driver.get(match_url)
        time.sleep(1)  # let the dynamic page render before querying the DOM
        # Tournament link in the header; its href embeds the tournament id.
        tournament_elem = driver.find_element_by_xpath(
            "//div[contains(@class, 'tournamentHeaderDescription')]/div[1]/span[3]/a"
        )
        tournament_regex = re.search("atp-singles/(.*)/", tournament_elem.get_attribute("href"))
        match["tournament_id"] = tournament_regex.group(1)
        add_tournament_info(match)
        # Round name follows "- " in the header text; absent for group stages.
        round_regex = re.search(",.*- (.*)$", tournament_elem.text)
        if round_regex:
            match["round"] = round_regex.group(1)
        else:
            match["round"] = "Group"
        match["p1_id"], match["p1_url"], match["p2_id"], match["p2_url"] = scrap_player_ids(driver)
        add_player_info(match)
        # URLs were only needed by add_player_info; drop them from the result.
        match.drop(columns=["p1_url", "p2_url"], inplace=True)
        match_date = None
        try:
            # Header date is formatted "DD.MM.YYYY HH:MM".
            match_date_elem = driver.find_element_by_xpath("//div[@id='detail']/div[4]/div[1]").text
            match_date_regex = re.search(r"^([0-9]+)\.([0-9]+)\.([0-9]+) ([0-9]+):([0-9]+)$", match_date_elem)
            day = int(match_date_regex.group(1))
            month = int(match_date_regex.group(2))
            year = int(match_date_regex.group(3))
            hour = int(match_date_regex.group(4))
            minute = int(match_date_regex.group(5))
            # NOTE(review): page time is treated as UTC here — confirm flashscore
            # serves times in UTC for this scraper's session/locale.
            match_date = pd.to_datetime("{0} {1} {2} {3} {4}".format(year, month, day, hour, minute)
                                        , format='%Y %m %d %H %M', utc=True)
        except Exception as ex:
            msg = "Error with date format - scraping match '{}'".format(match_id)
            log_to_file(msg, MATCHES_ERROR_LOGS)
            log("scrap_match", msg, type(ex).__name__)
            # Re-raise so the outer handler returns None for this match.
            raise Exception
        match["datetime"] = match_date
        '''
        Section usefull for scrap_tournament_matches()
        if status is None:
            status_elem = driver.find_element_by_xpath("//div[@id='detail']/div[4]/div[3]/div[1]/div[2]/span[1]").text
            if status_elem == "Finished":
                status = MatchStatus.Finished
            else:
                retired_regex = re.search("retired", status_elem)
                if retired_regex:
                    status = MatchStatus.Retired
                else:
                    msg = "status_error - match '{}'".format(match_id)
                    log_to_file(msg, MATCHES_ERROR_LOGS)
                    log("scrap_match", msg)
                    driver.quit()
                    return None
        '''
        match["status"] = status.name
        # Scores and statistics only exist once the match has started.
        if status in [MatchStatus.Finished, MatchStatus.Retired, MatchStatus.Live,
                      MatchStatus.Awarded, MatchStatus.Interrupted]:
            if status != MatchStatus.Live:
                # Set match winner only if match has already finished:
                # the winner's name is rendered in <strong>.
                participant_elems = driver.find_elements_by_xpath("//a[starts-with(@class, 'participantName___')]")
                if len(participant_elems[-1].find_elements_by_xpath("strong")) == 1:
                    match["p1_wins"] = False
                else:
                    match["p1_wins"] = True
                # Overall duration shown as "H:MM"; stored in minutes.
                duration_elem = driver.find_element_by_xpath("//div[contains(@class, 'time--overall')]").text
                duration_regex = re.search("([0-9]+):([0-9]+)", duration_elem)
                match["minutes"] = int(duration_regex.group(1)) * 60 + int(duration_regex.group(2))
            # Games won and tie-break score for each of up to 5 sets, per player.
            match["p1_s1_gms"], match["p1_tb1_score"] = find_gms_value(1, 1, driver)
            match["p1_s2_gms"], match["p1_tb2_score"] = find_gms_value(1, 2, driver)
            match["p1_s3_gms"], match["p1_tb3_score"] = find_gms_value(1, 3, driver)
            match["p1_s4_gms"], match["p1_tb4_score"] = find_gms_value(1, 4, driver)
            match["p1_s5_gms"], match["p1_tb5_score"] = find_gms_value(1, 5, driver)
            match["p2_s1_gms"], match["p2_tb1_score"] = find_gms_value(2, 1, driver)
            match["p2_s2_gms"], match["p2_tb2_score"] = find_gms_value(2, 2, driver)
            match["p2_s3_gms"], match["p2_tb3_score"] = find_gms_value(2, 3, driver)
            match["p2_s4_gms"], match["p2_tb4_score"] = find_gms_value(2, 4, driver)
            match["p2_s5_gms"], match["p2_tb5_score"] = find_gms_value(2, 5, driver)
            # Open the "Statistics" tab and read the label/p1/p2 rows into a frame.
            driver.find_element_by_link_text("Statistics").click()
            time.sleep(0.5)
            row_elements = driver.find_elements_by_xpath("//div[starts-with(@class, 'statRow___')]")
            # stat_elem.find_elements_by_class_name("statRow")
            stat_labels = []
            p1_stats = []
            p2_stats = []
            for row_elem in row_elements:
                stat_labels.append(row_elem.find_element_by_xpath("div[1]/div[2]").text)
                p1_stats.append(row_elem.find_element_by_xpath("div[1]/div[1]").text)
                p2_stats.append(row_elem.find_element_by_xpath("div[1]/div[3]").text)
            stats_dataframe = pd.DataFrame({"label": stat_labels, "p1": p1_stats, "p2": p2_stats})
            # Each percentage stat is rendered as "NN% (won/total)"; the regex
            # captures ratio, numerator and denominator in that order.
            match["p1_ace"] = int(stats_dataframe[stats_dataframe["label"] == "Aces"].iloc[0]["p1"])
            match["p1_df"] = int(stats_dataframe[stats_dataframe["label"] == "Double Faults"].iloc[0]["p1"])
            p1_svpt_elem = stats_dataframe[stats_dataframe["label"] == "Service Points Won"].iloc[0]["p1"]
            p1_svpt_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p1_svpt_elem)
            match["p1_svpt"] = int(p1_svpt_regex.group(3))
            match["p1_svpt_won"] = int(p1_svpt_regex.group(2))
            match["p1_svpt_ratio"] = int(p1_svpt_regex.group(1)) / 100
            p1_1st_elem = stats_dataframe[stats_dataframe["label"] == "1st Serve Points Won"].iloc[0]["p1"]
            p1_1st_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p1_1st_elem)
            match["p1_1st_in"] = int(p1_1st_regex.group(3))
            match["p1_1st_won"] = int(p1_1st_regex.group(2))
            match["p1_1st_won_ratio"] = int(p1_1st_regex.group(1)) / 100
            p1_2nd_elem = stats_dataframe[stats_dataframe["label"] == "2nd Serve Points Won"].iloc[0]["p1"]
            p1_2nd_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p1_2nd_elem)
            match["p1_2nd_pts"] = int(p1_2nd_regex.group(3))
            match["p1_2nd_won"] = int(p1_2nd_regex.group(2))
            match["p1_2nd_won_ratio"] = int(p1_2nd_regex.group(1)) / 100
            p1_bp_elem = stats_dataframe[stats_dataframe["label"] == "Break Points Saved"].iloc[0]["p1"]
            p1_bp_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p1_bp_elem)
            match["p1_bp_faced"] = int(p1_bp_regex.group(3))
            match["p1_bp_saved"] = int(p1_bp_regex.group(2))
            match["p1_bp_saved_ratio"] = int(p1_bp_regex.group(1)) / 100
            match["p2_ace"] = int(stats_dataframe[stats_dataframe["label"] == "Aces"].iloc[0]["p2"])
            match["p2_df"] = int(stats_dataframe[stats_dataframe["label"] == "Double Faults"].iloc[0]["p2"])
            p2_svpt_elem = stats_dataframe[stats_dataframe["label"] == "Service Points Won"].iloc[0]["p2"]
            p2_svpt_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p2_svpt_elem)
            match["p2_svpt"] = int(p2_svpt_regex.group(3))
            match["p2_svpt_won"] = int(p2_svpt_regex.group(2))
            match["p2_svpt_ratio"] = int(p2_svpt_regex.group(1)) / 100
            p2_1st_elem = stats_dataframe[stats_dataframe["label"] == "1st Serve Points Won"].iloc[0]["p2"]
            p2_1st_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p2_1st_elem)
            match["p2_1st_in"] = int(p2_1st_regex.group(3))
            match["p2_1st_won"] = int(p2_1st_regex.group(2))
            match["p2_1st_won_ratio"] = int(p2_1st_regex.group(1)) / 100
            p2_2nd_elem = stats_dataframe[stats_dataframe["label"] == "2nd Serve Points Won"].iloc[0]["p2"]
            p2_2nd_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p2_2nd_elem)
            match["p2_2nd_pts"] = int(p2_2nd_regex.group(3))
            match["p2_2nd_won"] = int(p2_2nd_regex.group(2))
            match["p2_2nd_won_ratio"] = int(p2_2nd_regex.group(1)) / 100
            p2_bp_elem = stats_dataframe[stats_dataframe["label"] == "Break Points Saved"].iloc[0]["p2"]
            p2_bp_regex = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p2_bp_elem)
            match["p2_bp_faced"] = int(p2_bp_regex.group(3))
            match["p2_bp_saved"] = int(p2_bp_regex.group(2))
            match["p2_bp_saved_ratio"] = int(p2_bp_regex.group(1)) / 100
            p1_sv_gms_elem = stats_dataframe[stats_dataframe["label"] == "Service Games Won"].iloc[0]["p1"]
            p1_sv_gms_rgx = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p1_sv_gms_elem)
            match["p1_sv_gms"] = int(p1_sv_gms_rgx.group(3))
            match["p1_sv_gms_won"] = int(p1_sv_gms_rgx.group(2))
            match["p1_sv_gms_won_ratio"] = int(p1_sv_gms_rgx.group(1)) / 100
            p2_sv_gms_elem = stats_dataframe[stats_dataframe["label"] == "Service Games Won"].iloc[0]["p2"]
            p2_sv_gms_rgx = re.search(r"([0-9]+)% \(([0-9]+)/([0-9]+)", p2_sv_gms_elem)
            match["p2_sv_gms"] = int(p2_sv_gms_rgx.group(3))
            match["p2_sv_gms_won"] = int(p2_sv_gms_rgx.group(2))
            match["p2_sv_gms_won_ratio"] = int(p2_sv_gms_rgx.group(1)) / 100
            # Derived: share of service points that were first serves in.
            match["p1_1st_serve_ratio"] = match["p1_1st_in"] / match["p1_svpt"] if match["p1_svpt"] > 0 else None
            match["p2_1st_serve_ratio"] = match["p2_1st_in"] / match["p2_svpt"] if match["p2_svpt"] > 0 else None
    except Exception as ex:
        # Any failure invalidates the whole match record.
        msg = "Error while scraping match id '{}'".format(match_id)
        log_to_file(msg, MATCHES_ERROR_LOGS)
        log("scrap_match", msg, type(ex).__name__)
        match = None
    driver.quit()
    return match
def scrap_player(atp_id):
    """Scrape a player's profile from atptour.com into a pandas Series.

    Collects names, birth date/place, turned-pro year, weight, height,
    nationality flag code, residence and handedness/backhand.

    Args:
        atp_id: atptour player identifier used to build the overview URL.

    Returns:
        A pd.Series with the scraped attributes (missing ones left as None),
        or None when the page could not be parsed at all.
    """
    driver = get_chrome_driver()
    match_url = 'https://www.atptour.com/en/players/player/{}/overview'.format(
        atp_id)
    driver.get(match_url)
    time.sleep(0.5)  # let the page render before querying the DOM
    player = pd.Series(dtype='float64')
    try:
        player["first_name"] = driver.find_element_by_xpath(
            "//div[@class='player-profile-hero-name']/div[1]").text
        player["last_name"] = driver.find_element_by_xpath(
            "//div[@class='player-profile-hero-name']/div[2]").text
        player["first_initial"] = player["first_name"][0] if player["first_name"] is not None \
            and player["first_name"] != "" else None
        player["full_name"] = "{0} {1}".format(player["last_name"], player["first_initial"])
        # Birth date shown as "(YYYY.MM.DD)"; left as None when absent/unparsable.
        birth_date = None
        try:
            birth_date_search = driver.find_element_by_xpath(
                "//span[@class='table-birthday']").text
            birth_regex = re.search(r"^\(([0-9]*)\.([0-9]*)\.([0-9]*)\)$", birth_date_search)
            birth_year = birth_regex.group(1)
            birth_month = birth_regex.group(2)
            birth_day = birth_regex.group(3)
            birth_date = datetime(int(birth_year), int(birth_month), int(birth_day))
        except Exception as exc:
            print("problem date")
        player["birth_date"] = birth_date
        turned_pro = None
        try:
            turned_pro_str = driver.find_element_by_xpath(
                "//div[@class='player-profile-hero-overflow']/div[2]/div[1]/table/tbody/tr[1]/td[2]/div/div[2]"
            ).text
            turned_pro = int(turned_pro_str)
        except (NoSuchElementException, ValueError):
            pass
        player["turned_pro"] = turned_pro
        # Weight read from the lbs span — units as displayed by atptour;
        # NOTE(review): confirm downstream code expects pounds, not kg.
        weight = None
        try:
            weight_str = driver.find_element_by_xpath(
                "//span[@class='table-weight-lbs']").text
            weight = int(weight_str)
        except (NoSuchElementException, ValueError):
            pass
        # Height parsed from the "(NNNcm)" wrapper, i.e. centimetres.
        height = None
        try:
            height_str = driver.find_element_by_xpath(
                "//span[@class='table-height-cm-wrapper']").text
            height_regex = re.search(r"^\(([0-9]*)cm\)$", height_str)
            if height_regex:
                height = int(height_regex.group(1))
        except (NoSuchElementException, ValueError, TypeError):
            pass
        player["weight"] = weight
        player["height"] = height
        flag_code = driver.find_element_by_xpath(
            "//div[@class='player-flag-code']").text
        player["flag_code"] = flag_code
        # Birth place is "City, Country"; the country is validated against the
        # known-countries table, and the flag code is the fallback source.
        birth_city = birth_country = None
        try:
            birth_place = driver.find_element_by_xpath(
                "//div[@class='player-profile-hero-overflow']/div[2]/div["
                "1]/table/tbody/tr[2]/td[1]/div/div[2]").text
            b_matched_location = birth_place.split(", ")
            if len(b_matched_location) > 1:
                birth_city = b_matched_location[0]
                birth_country = b_matched_location[-1]
                if not country_exists(birth_country):
                    raise NoSuchElementException("birth_country_not_found")
            else:
                raise NoSuchElementException("birth_country_not_found")
        except NoSuchElementException:
            pass
            # Couldn't find player birth place, Setting birth_country with flag_code
            birth_country = find_country_with_flag_code(flag_code)
            if birth_country is None:
                msg = "Couldn't find birth country for player '{0}'".format(
                    atp_id)
                log_to_file(msg, PLAYER_LOGS)
                log("players", msg)
        player["birth_city"] = birth_city
        player["birth_country"] = birth_country
        residence_city = residence_country = None
        try:
            residence = driver.find_element_by_xpath(
                "//div[@class='player-profile-hero-overflow']/div[2]/div["
                "1]/table/tbody/tr[2]/td[2]/div/div[2]").text
            r_matched_location = residence.split(", ")
            if len(r_matched_location) > 1:
                residence_city = r_matched_location[0]
                residence_country = r_matched_location[-1]
        except NoSuchElementException:
            pass
        player["residence_city"] = residence_city
        player["residence_country"] = residence_country
        # Plays cell is "Right-Handed, Two-Handed Backhand"-style text.
        handedness = backhand = None
        try:
            hands = driver.find_element_by_xpath(
                "//div[@class='player-profile-hero-overflow']/div[2]/div["
                "1]/table/tbody/tr[2]/td[3]/div/div[2]").text
            hands_matched = hands.split(", ")
            if len(hands_matched) > 1:
                handedness = hands_matched[0]
                backhand = hands_matched[-1]
        except NoSuchElementException:
            pass
        player["handedness"] = handedness
        player["backhand"] = backhand
    except Exception as ex:
        # Any unexpected failure invalidates the whole player record.
        player = None
        msg = "Couldn't scrap player : atp_id= '{}'".format(atp_id)
        log_to_file(msg, PLAYER_LOGS)
        log("players", msg)
        print(type(ex))
    driver.quit()
    return player
def main():
    """Extract and save the text of every raw HTML file, then clean up ACLs."""
    # A single non-visible browser instance is reused for the whole batch.
    browser = get_chrome_driver(show=False)
    for html_path in yield_filelist(RAW_HTML_DIR):
        _get_text_and_save_it(browser, html_path)
    _remove_acl()
def scrap_all_player_ranks(log_file_path, pickle_db_path):
    """Scrape the full ATP singles ranking (1-5000) and persist it.

    Runs only when atptour's latest ranking date equals today's date;
    otherwise the update is silently skipped. On success the ranking date
    is recorded in the pickledb under "update_player_ranks_date".

    Args:
        log_file_path: path of the log file for status/error messages.
        pickle_db_path: path of the pickledb used to remember the update date.
    """
    driver = get_chrome_driver()
    try:
        driver.get("https://www.atptour.com/en/rankings/singles")
        date_str = driver.find_element_by_xpath(
            "//div[@class='dropdown-wrapper']/div[1]/div/div").text
        last_ranking_date = datetime.strptime(date_str, '%Y.%m.%d').date()
        today = date.today()
        # Check if last ranking date on atptour match current date. If not, do not scrap
        if last_ranking_date != today:
            raise ValueError()
        driver = get_chrome_driver(driver)
        driver.get(
            "https://www.atptour.com/en/rankings/singles?rankDate={0}&rankRange=1-5000"
            .format(date_str.replace(".", "-")))
        # Some low-level players has rank suffixed with T because they are ex-aequo
        ranks = [int(cell.text.replace("T", ""))
                 for cell in driver.find_elements_by_class_name("rank-cell")]
        rank_points = [int(points.text.replace(",", ""))
                       for points in driver.find_elements_by_xpath(
                           "//td[@class='points-cell']/a")]
        player_ids = []
        for link in driver.find_elements_by_xpath(
                "//td[@class='player-cell']/span[1]/a[1]"):
            href = link.get_attribute("href")
            player_ids.append(
                re.search("players/.*/(.*)/overview", href).group(1))
        player_ranks = pd.DataFrame({
            "rank": ranks,
            "player_id": player_ids,
            "rank_points": rank_points
        })
        if not record_all_player_ranks(player_ranks):
            raise Exception('Player ranks not recorded')
        log_to_file("Player ranks successfully updated", log_file_path)
        db = pickledb.load(pickle_db_path, True)
        db.set("update_player_ranks_date", date_str)
    except ValueError:
        # Ranking on atptour is not for today: nothing to do.
        pass
    except Exception as ex:
        log_to_file("player_ranks update error", log_file_path)
        log("Player_ranks", str(ex))
    driver.quit()
def scrap_tournament(tournament, date):
    """Scrape tournament details (name, location, dates, level, surface) from atptour.com.

    Args:
        tournament: tournament descriptor used to locate the atptour entry.
        date: date used to disambiguate the tournament edition.

    Returns:
        The enriched tournament Series, or None when the tournament cannot
        be matched on atptour. Fields scraped before a failure are kept.
    """
    tournament = search_tournament_atptour(tournament, date)
    if tournament is None:
        return None
    tournament_id = tournament["atp_id"]
    tournament_formatted_name = tournament["atp_formatted_name"]
    driver = get_chrome_driver()
    driver.maximize_window()
    # Fix: removed the dead `url` local — it was always None, so the
    # conditional expression always built the URL from the format() call.
    match_url = 'https://www.atptour.com/en/tournaments/{0}/{1}/overview'.format(
        tournament_formatted_name, tournament_id)
    driver.get(match_url)
    time.sleep(1)  # Wait 1 sec to avoid IP being banned for scrapping
    try:
        name = driver.find_element_by_xpath(
            "//div[@class='player-profile-hero-name']/div[1]").text
        if name == "":
            name = tournament_formatted_name
        tournament["tourney_name"] = name
        location = driver.find_element_by_xpath(
            "//div[@class='player-profile-hero-dash']/div/div[2]").text
        matched_location = location.split(", ")
        tournament["city"] = matched_location[0]
        tournament["country"] = matched_location[-1]
        date_elem = driver.find_element_by_xpath(
            "//div[@class='player-profile-hero-dash']/div/div[3]").text
        # Date range like "January 1 - January 8 2023": keep start day + year.
        date_regex = re.search("^(.*) - .* (.*)$", date_elem)
        try:
            # Month names on the page are English; force an English locale to parse them.
            locale.setlocale(locale.LC_ALL, 'en_US.utf8')
            full_date = "{0} {1}".format(date_regex.group(1), date_regex.group(2))
            tournament["start_date"] = pd.to_datetime(full_date, format='%B %d %Y', utc=True)
        except Exception as ex:
            print(ex)
        if "tourney_level" not in tournament.index:
            # Find tourney level from the category-stamp image in the logo.
            img = driver.find_element_by_xpath(
                "//div[@class='tournmanet-logo']/img")
            img_src = img.get_attribute("src")
            level_matched = re.search("categorystamps_(.+)_", img_src)
            level = level_matched.group(1) if level_matched else None
            tournament["tourney_level"] = "M" if level == "1000" else "A"
        if "best_of" not in tournament.index:
            # Maximum number of sets; regular tour events are best of 3.
            tournament["best_of"] = 3
        try:
            number_of_competitors = int(
                driver.find_element_by_xpath(
                    "//div[@class='bracket-sgl']/div[2]").text)
            tournament["number_of_competitors"] = number_of_competitors
        except ValueError:
            pass
        tournament["surface"] = driver.find_element_by_xpath(
            "//div[@class='surface-bottom']/div[2]").text
    except Exception:
        # Best effort: return whatever fields were scraped before the failure.
        pass
    driver.quit()
    return tournament
def search_all_tournaments_atptour():
    """Scrape the full tournament list from atptour.com.

    Reads three parallel column sets from the tournaments table — name/URL,
    location, and date range — keeping the lists index-aligned by appending
    None placeholders for rows that fail to parse.

    Returns:
        A pd.DataFrame with columns atp_id, atp_name, atp_formatted_name,
        city, country, start_date, end_date; or None when the page itself
        could not be read.
    """
    tournaments_atptour = None
    driver = get_chrome_driver()
    driver.get("https://www.atptour.com/en/tournaments")
    time.sleep(3)  # the tournaments table is rendered client-side; wait for it
    try:
        # Pass 1: tournament links carry the formatted name and numeric id.
        atp_names = []
        atp_formatted_names = []
        atp_ids = []
        elements = driver.find_elements_by_xpath(
            "//tr[@class='tourney-result']/td[2]/a")
        for elem in elements:
            try:
                url = elem.get_attribute("href")
                url_regex = re.search("/tournaments/(.*)/(.*)/overview$", url)
                atp_formatted_name = url_regex.group(1)
                atp_id = int(url_regex.group(2))
                atp_name = elem.text
                atp_formatted_names.append(atp_formatted_name)
                atp_ids.append(atp_id)
                atp_names.append(atp_name)
            except Exception as ex:
                # Keep the lists aligned: placeholder row for the failed entry.
                atp_formatted_names.append(None)
                atp_ids.append(None)
                atp_names.append(None)
                msg = "atp tournaments retrieval error, tournament '{0}'".format(
                    elem.text)
                log_to_file(msg, TOURNAMENT_LOGS)
                log("tournaments", msg, type(ex).__name__)
        # Pass 2: location cells formatted "City, Country".
        cities = []
        countries = []
        elements = driver.find_elements_by_xpath(
            "//tr[@class='tourney-result']/td[2]/span[1]")
        for elem in elements:
            location = elem.text
            try:
                matched_location = location.split(", ")
                city = matched_location[0]
                country = matched_location[-1]
                cities.append(city)
                countries.append(country)
            except Exception as ex:
                cities.append(None)
                countries.append(None)
                msg = "atp tournaments retrieval error, location '{0}'".format(
                    location)
                log_to_file(msg, TOURNAMENT_LOGS)
                log("tournaments", msg, type(ex).__name__)
        # Pass 3: date cells formatted "YYYY.MM.DD - YYYY.MM.DD"; the end date
        # is shifted by one day so it behaves as an exclusive upper bound.
        start_dates = []
        end_dates = []
        elements = driver.find_elements_by_xpath(
            "//tr[@class='tourney-result']/td[2]/span[2]")
        for elem in elements:
            date_elem = elem.text
            try:
                date_regex = re.search("^(.*) - (.*)$", date_elem)
                start_date_str = date_regex.group(1)
                start_date = datetime.strptime(start_date_str, '%Y.%m.%d')
                end_date_str = date_regex.group(2)
                end_date = datetime.strptime(end_date_str, '%Y.%m.%d')
                end_date += timedelta(days=1)
                start_dates.append(start_date)
                end_dates.append(end_date)
            except Exception as ex:
                start_dates.append(None)
                end_dates.append(None)
                #print(type(ex).__name__)
                #print("atp tournaments retrieval error, date_elem: '{0}'".format(date_elem))
        tournaments_atptour = pd.DataFrame({
            "atp_id": atp_ids,
            "atp_name": atp_names,
            "atp_formatted_name": atp_formatted_names,
            "city": cities,
            "country": countries,
            "start_date": start_dates,
            "end_date": end_dates
        })
    except Exception as ex:
        msg = "Tournament header retrieval error"
        log_to_file(msg, TOURNAMENT_LOGS)
        log("tournaments", msg, type(ex).__name__)
    driver.quit()
    return tournaments_atptour