# Match-stats scraper (visible part): takes a season year and a starting
# tournament index from the command line, collects the tournament detail
# links from the ATP results archive for that year, and starts walking the
# tournaments from the requested index.
import sys

from functions import (
    html_parse_tree,
    xpath_parse,
    synchronous,
    asynchronous,
    scrape_match_stats,
    array2csv,
    format_spacing,
)

# Command line input
year = str(sys.argv[1])
# Kept as text here; converted with int() where the loop needs a number.
start_index = str(sys.argv[2])

# Setup
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# print() with a single argument emits the same text on Python 2 and 3,
# unlike the print statement, which is a syntax error on Python 3.
# The tournament count is wrapped in ANSI escape sequences.
print('')
print('Collecting match stats data for ' + '\x1b[0;32;40m' + str(tourney_count) + '\x1b[0m' + ' tournaments:')
print('')
print('Index Tourney slug Matches')
print('----- ------------ -------')

# Iterate through each tournament
match_stats_data_scrape = []
# range() exists on both Python 2 and 3; xrange() is Python 2 only.
for i in range(int(start_index), tourney_count):
    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
" weeks...") print("") print("Index Week") print("----- ----") #for h in xrange(index, 1): #for h in xrange(index, len(weeks_list)): for h in range(start_index, end_index + 1): week = weeks_list[h][0] week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000" week_tree = html_parse_tree(week_url) player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()" player_count_parsed = xpath_parse(week_tree, player_count_xpath) player_count_cleaned = regex_strip_array(player_count_parsed) player_count = len(player_count_cleaned) rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()" rank_parsed = xpath_parse(week_tree, rank_xpath) rank_cleaned = regex_strip_array(rank_parsed) player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/text()" player_name_parsed = xpath_parse(week_tree, player_name_xpath) player_name_cleaned = regex_strip_array(player_name_parsed) country_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='country-cell']/div/div/img/@alt" country_parsed = xpath_parse(week_tree, country_xpath) country_cleaned = regex_strip_array(country_parsed)
# Collect the ranking-week values listed on the ATP singles rankings page
# and hand them to array2csv under the name 'weeks'.
from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"

# Load the page, read every <li> data-value under the rankDate list, clean it
weeks_tree = html_parse_tree(weeks_url)
weeks_cleaned = regex_strip_array(xpath_parse(weeks_tree, weeks_xpath))

# One single-element row per week
weeks_list = [[week] for week in weeks_cleaned]

# Output to CSV
array2csv(weeks_list, 'weeks')
# Match-stats scraper (visible part): reads a season year and a starting
# tournament index from the command line, gathers the tournament detail
# links from the ATP results archive page for that year, and begins
# iterating over the tournaments from the starting index.
import sys

from functions import (
    html_parse_tree,
    xpath_parse,
    synchronous,
    asynchronous,
    scrape_match_stats,
    array2csv,
    format_spacing,
)

# Command line input
year = str(sys.argv[1])
# Stored as a string; the loop below converts it with int().
start_index = str(sys.argv[2])

# Setup
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Single-argument print() calls produce identical output on Python 2 and 3;
# the print statement form does not parse on Python 3.
# The tournament count is surrounded by ANSI escape sequences.
print('')
print('Collecting match stats data for ' + '\x1b[0;32;40m' + str(tourney_count) + '\x1b[0m' + ' tournaments:')
print('')
print('Index Tourney slug Matches')
print('----- ------------ -------')

# Iterate through each tournament
match_stats_data_scrape = []
# range() is available on both Python 2 and 3, whereas xrange() is not.
for i in range(int(start_index), tourney_count):
    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
# Column headings for the per-week progress output
print("Index Week")
print("----- ----")

# Walk the requested slice of weeks_list (both bounds inclusive) and pull
# the ranking table columns for each week.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]

    # First three "-"-separated components of the week value
    dateList = week.split("-")
    yearRanking, monthRanking, dayRanking = dateList[0], dateList[1], dateList[2]

    # Rankings page for this week, requesting ranks 1-3000
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"
    week_tree = html_parse_tree(week_url)

    # Rank column
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player name, read from the link's data-ga-label attribute
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@data-ga-label"
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Move column
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)

    # Age column
    age_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='age-cell']/text()"
    age_parsed = xpath_parse(week_tree, age_xpath)
    age_cleaned = regex_strip_array(age_parsed)

    # Points column (link text)
    points_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='points-cell']/a/text()"
# Timestamp taken before the CSV pass begins
start = time.time()

# Read the rankings CSV, add a 'profile_picture' column to the header, and
# for every data row fetch the player's profile page to look up the
# thumbnail URL plus further profile fields.
# NOTE(review): on Python 3 the csv docs recommend open(..., newline='');
# confirm the target interpreter before changing the call.
with open('rankings_0_2019-07-01.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')

    # Header row: extended in place, then stored as the first output row
    row1 = next(readCSV)
    row1.append('profile_picture')
    new_row = row1
    new_rows.append(new_row)

    for row in readCSV:
        print(row[5])
        new_row = row

        # Column 11 holds the path that is appended to the site root
        profile_url = "http://www.atptour.com" + row[11]
        profile_tree = html_parse_tree(profile_url)

        # Thumbnail comes from the page's <meta name="thumbnail"> tag.
        # NOTE(review): "thumbnaul" is a misspelling of "thumbnail"; the name
        # is kept because code outside this view may reference it.
        player_thumbnaul_xpath = "//meta[@name='thumbnail']/@content"
        # Fall back to a single empty string when the xpath result is empty
        player_thumbnail_parsed = xpath_parse(profile_tree, player_thumbnaul_xpath) or ['']
        player_thumbnail_cleaned = regex_strip_array(player_thumbnail_parsed)

        new_row.append(player_thumbnail_cleaned[0])
        new_rows.append(new_row)

        # Get profile data
        player_id = row[15]
        player_slug = row[12]

        first_name_xpath = "//div[@id='playerProfileHero']/div[@class='player-profile-hero-overflow']/div/div/div/div[@class='first-name']/text()"
        first_name_parsed = xpath_parse(profile_tree, first_name_xpath)
        first_name_cleaned = regex_strip_array(first_name_parsed)
# Progress banner. Single-argument print() calls give the same output on
# Python 2 and 3; the print statement form is a syntax error on Python 3.
print("Collecting weekly rankings data from " + str(len(weeks_list)) + " weeks...")
print("")
print("Index Week")
print("----- ----")

# Walk the requested slice of weeks_list (both bounds inclusive).
# range() works on Python 2 and 3; xrange() exists only on Python 2.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]

    # Rankings page for this week, requesting ranks 1-3000
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"
    week_tree = html_parse_tree(week_url)

    # Number of players = number of cleaned rank cells on the page.
    # NOTE(review): this xpath is identical to rank_xpath below, so the same
    # cells are parsed twice; left as-is because later code outside this
    # view may use the player_count_* names.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Rank column
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player profile link (href of the anchor in the player cell)
    player_url_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@href"
    player_url_parsed = xpath_parse(week_tree, player_url_xpath)
    player_url_cleaned = regex_strip_array(player_url_parsed)

    # Move column
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)
# Print every ranking-week value listed on the ATP singles rankings page,
# one per line.
from functions import html_parse_tree, xpath_parse, regex_strip_array

weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)

# data-value of each <li> under the list whose data-value is 'rankDate'
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

for row in weeks_cleaned:
    # print() with one argument behaves the same on Python 2 and 3;
    # the bare print statement does not parse on Python 3.
    print(row)