from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

# Collect every week value listed on the ATP singles rankings page and
# write the values out through array2csv, one week per row.

# Page to fetch and the XPath picking the data-value attribute of each
# <li> under the 'rankDate' list.
weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"

# Fetch -> select -> clean.
weeks_tree = html_parse_tree(weeks_url)
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

# Wrap each week in its own single-element list so that every week
# becomes one row of the output.
weeks_list = [[rank_date] for rank_date in weeks_cleaned]

# Output to CSV
filename = 'weeks'
array2csv(weeks_list, filename)
# Heading for the per-week listing: a blank line, the column titles and
# an underline, emitted as three output lines.
print("\n".join(("", "Index Week", "----- ----")))

# Walk the selected slice of weeks_list (both bounds inclusive) and scrape
# the rankings table published for each week.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]

    # Rankings page for this week, requesting ranks 1-3000 in one response.
    week_url = (
        "http://www.atpworldtour.com/en/rankings/singles?rankDate="
        + week
        + "&rankRange=1-3000"
    )
    week_tree = html_parse_tree(week_url)

    # Selectors into the 'mega-table' rankings table, one per column.
    # NOTE(review): player_count_xpath and rank_xpath are the same query.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/text()"
    country_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='country-cell']/div/div/img/@alt"
    player_url_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@href"

    # Number of players = number of cleaned rank cells on the page.
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Rank column.
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player name (link text of the player cell).
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Country (alt text of the image in the country cell).
    country_parsed = xpath_parse(week_tree, country_xpath)
    country_cleaned = regex_strip_array(country_parsed)
print("----- ----")

# Scrape the rankings table for every week in the inclusive index range
# start_index..end_index of weeks_list.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]

    # Keep the first three '-'-separated fields of the week value.
    # NOTE(review): presumably year-month-day -- confirm against weeks_list.
    dateList = week.split("-")
    yearRanking, monthRanking, dayRanking = dateList[0], dateList[1], dateList[2]

    # Rankings page for this week, requesting ranks 1-3000 in one response.
    week_url = (
        "http://www.atpworldtour.com/en/rankings/singles?rankDate="
        + week
        + "&rankRange=1-3000"
    )
    week_tree = html_parse_tree(week_url)

    # Selectors into the 'mega-table' rankings table, one per column.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@data-ga-label"
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    age_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='age-cell']/text()"
    points_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='points-cell']/a/text()"

    # Rank column.
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player name, read from the data-ga-label attribute of the player link.
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Text of the 'move-text' element in the move cell.
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)

    # Age column.
    age_parsed = xpath_parse(week_tree, age_xpath)
    age_cleaned = regex_strip_array(age_parsed)

    # Points column (link text inside the points cell).
    points_parsed = xpath_parse(week_tree, points_xpath)
# Header row: add the new column title and queue the row for output.
# new_row is the same list object as row1, so row1 itself is extended.
new_row = row1
row1.append('profile_picture')
new_rows.append(row1)

# For every remaining CSV row, fetch the player's profile page, append the
# thumbnail address to the row and then read further profile fields.
for row in readCSV:
    print(row[5])
    new_row = row

    # row[11] is appended to the site root to form the profile page address.
    profile_url = "http://www.atptour.com" + row[11]
    profile_tree = html_parse_tree(profile_url)

    # Content of the page's <meta name="thumbnail"> tag; when the query
    # returns nothing, fall back to a single empty string so that the
    # [0] lookup below still has an element to read.
    player_thumbnail_xpath = "//meta[@name='thumbnail']/@content"
    player_thumbnail_parsed = xpath_parse(profile_tree, player_thumbnail_xpath) or ['']
    player_thumbnail_cleaned = regex_strip_array(player_thumbnail_parsed)

    # new_row aliases row, so this extends the original row in place.
    row.append(player_thumbnail_cleaned[0])
    new_rows.append(row)

    # Get profile data
    player_id = row[15]
    player_slug = row[12]

    # First and last name come from sibling divs inside the profile hero.
    first_name_xpath = "//div[@id='playerProfileHero']/div[@class='player-profile-hero-overflow']/div/div/div/div[@class='first-name']/text()"
    last_name_xpath = "//div[@id='playerProfileHero']/div[@class='player-profile-hero-overflow']/div/div/div/div[@class='last-name']/text()"

    first_name_parsed = xpath_parse(profile_tree, first_name_xpath)
    first_name_cleaned = regex_strip_array(first_name_parsed)

    last_name_parsed = xpath_parse(profile_tree, last_name_xpath)
    last_name_cleaned = regex_strip_array(last_name_parsed)
# Heading for the per-week listing.
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement used here before.
print("")
print("Index Week")
print("----- ----")

# Scrape the rankings table for every week in the inclusive index range
# start_index..end_index of weeks_list. range() replaces the Python 2-only
# xrange(); the values iterated are the same.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]

    # Rankings page for this week, requesting ranks 1-3000 in one response.
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"
    week_tree = html_parse_tree(week_url)

    # Number of players = number of cleaned rank cells on the page.
    # NOTE(review): this is the same query as rank_xpath below.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Rank column.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # href of the player link in the player cell.
    player_url_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@href"
    player_url_parsed = xpath_parse(week_tree, player_url_xpath)
    player_url_cleaned = regex_strip_array(player_url_parsed)

    # Text of the 'move-text' element in the move cell.
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)

    # Age column selector.
    age_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='age-cell']/text()"
from functions import html_parse_tree, xpath_parse, regex_strip_array

# List every week value found on the ATP singles rankings page.

# Fetch the rankings page.
weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)

# Select the data-value attribute of each <li> under the 'rankDate' list,
# then clean the selected values.
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

# One week per output line. print(row) replaces the Python 2-only
# `print row` statement and produces the same output on Python 2 and 3.
for row in weeks_cleaned:
    print(row)