Example #1
0
import sys
from functions import html_parse_tree, xpath_parse, synchronous, asynchronous, scrape_match_stats, array2csv, format_spacing

# Command line input
# argv[1] = season year to scrape, argv[2] = tournament index to resume from.
year = str(sys.argv[1])
start_index = str(sys.argv[2])  # kept as a string here; converted with int() at the loop below

# Setup
# NOTE(review): Python 2 script (print statements, xrange). atpworldtour.com
# has since moved to atptour.com -- confirm these URLs still resolve.
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
# Collect the per-tournament "results" link (column 8) from every row of the
# season archive table; these are site-relative URL suffixes.
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Progress header (the ANSI escapes render the tournament count in green).
print ''
print 'Collecting match stats data for ' + '\x1b[0;32;40m' + str(
    tourney_count) + '\x1b[0m' + ' tournaments:'
print ''
print 'Index    Tourney slug       Matches'
print '-----    ------------       -------'

# Iterate through each tournament
# (the loop body continues beyond this excerpt)
match_stats_data_scrape = []
for i in xrange(int(start_index), tourney_count):

    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
Example #2
0
      " weeks...")  # NOTE(review): dangling continuation -- the start of this print(...) call is missing from this excerpt

print("")
print("Index    Week")
print("-----    ----")

# Earlier loop bounds, kept commented out for reference:
#for h in xrange(index, 1):
#for h in xrange(index, len(weeks_list)):
# start_index, end_index and weeks_list are defined outside this excerpt.
for h in range(start_index, end_index + 1):
    # Each weeks_list entry is a one-element list holding a ranking-date string.
    week = weeks_list[h][0]
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Player count = number of rank cells in the rankings table for this week.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Rank column (same xpath as the player count above).
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player display names from the player cells.
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/text()"
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Country codes come from the flag image's alt attribute.
    # (loop body continues beyond this excerpt)
    country_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='country-cell']/div/div/img/@alt"
    country_parsed = xpath_parse(week_tree, country_xpath)
    country_cleaned = regex_strip_array(country_parsed)
from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

# Scrape every available ranking week (date strings) from the ATP singles
# rankings page's rankDate dropdown and persist them as a CSV named 'weeks'.
rankings_page_url = "http://www.atpworldtour.com/en/rankings/singles"
rankings_page_tree = html_parse_tree(rankings_page_url)
date_options_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
raw_dates = xpath_parse(rankings_page_tree, date_options_xpath)
stripped_dates = regex_strip_array(raw_dates)

# Wrap each week string in its own row so array2csv writes a one-column table.
weeks_list = []
for ranking_date in stripped_dates:
    weeks_list.append([ranking_date])

# Output to CSV
array2csv(weeks_list, 'weeks')
import sys
from functions import html_parse_tree, xpath_parse, synchronous, asynchronous, scrape_match_stats, array2csv, format_spacing

# Command line input
# argv[1] = season year to scrape, argv[2] = tournament index to resume from.
year = str(sys.argv[1])
start_index = str(sys.argv[2])  # kept as a string here; converted with int() at the loop below

# Setup
# NOTE(review): Python 2 script (print statements, xrange). atpworldtour.com
# has since moved to atptour.com -- confirm these URLs still resolve.
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
# Collect the per-tournament "results" link (column 8) from every row of the
# season archive table; these are site-relative URL suffixes.
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Progress header (the ANSI escapes render the tournament count in green).
print ''
print 'Collecting match stats data for ' + '\x1b[0;32;40m' + str(tourney_count) + '\x1b[0m' + ' tournaments:'
print ''
print 'Index    Tourney slug       Matches'
print '-----    ------------       -------'

# Iterate through each tournament
# (the loop body continues beyond this excerpt)
match_stats_data_scrape = []
for i in xrange(int(start_index), tourney_count):

    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
Example #5
0
print("Index    Week")
print("-----    ----")

# start_index, end_index and weeks_list are defined outside this excerpt.
for h in range(start_index, end_index + 1):
    week = weeks_list[h][0]  # ranking-date string; split below assumes YYYY-MM-DD -- TODO confirm
    dateList = week.split("-")
    yearRanking = dateList[0]
    monthRanking = dateList[1]
    dayRanking = dateList[2]

    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Rank column of the weekly rankings table.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Player names taken from the analytics label attribute rather than the
    # anchor text (unlike the sibling scraper that reads a/text()).
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@data-ga-label"
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Week-over-week ranking movement text.
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)

    # Player age column.
    age_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='age-cell']/text()"
    age_parsed = xpath_parse(week_tree, age_xpath)
    age_cleaned = regex_strip_array(age_parsed)

    # Ranking points column.
    # (the loop body continues beyond this excerpt)
    points_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='points-cell']/a/text()"
# Timestamp marking the start of the augmentation run.
# time, csv and new_rows are defined/imported outside this excerpt.
start = time.time()
# Read an existing rankings CSV and, for every player row, scrape extra
# profile data (thumbnail, names, ...) from the player's ATP profile page.
with open('rankings_0_2019-07-01.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    # First row is the header; append the name of the new column.
    row1 = next(readCSV)
    new_row = row1
    new_row.append('profile_picture')
    new_rows.append(new_row)
    for row in readCSV:
        print(row[5])

        new_row = row
        # presumably row[11] holds the site-relative profile URL path -- verify against the CSV schema
        profile_url = "http://www.atptour.com" + row[11]
        profile_tree = html_parse_tree(profile_url)

        # NOTE(review): "thumbnaul" is a typo but is spelled consistently, so it works.
        player_thumbnaul_xpath = "//meta[@name='thumbnail']/@content"
        player_thumbnail_parsed = xpath_parse(profile_tree,
                                              player_thumbnaul_xpath)
        # Fall back to an empty string when the profile has no thumbnail meta tag.
        if not player_thumbnail_parsed:
            player_thumbnail_parsed = ['']
        player_thumbnail_cleaned = regex_strip_array(player_thumbnail_parsed)
        new_row.append(player_thumbnail_cleaned[0])
        new_rows.append(new_row)

        # Get profile data
        player_id = row[15]

        player_slug = row[12]

        # First name from the profile hero banner.
        # (the loop body continues beyond this excerpt)
        first_name_xpath = "//div[@id='playerProfileHero']/div[@class='player-profile-hero-overflow']/div/div/div/div[@class='first-name']/text()"
        first_name_parsed = xpath_parse(profile_tree, first_name_xpath)
        first_name_cleaned = regex_strip_array(first_name_parsed)
# Python 2 fragment (print statements, xrange).
# weeks_list, start_index and end_index are defined outside this excerpt.
print "Collecting weekly rankings data from " + str(len(weeks_list)) + " weeks..."

print ""
print "Index    Week"
print "-----    ----"

# Earlier loop bounds, kept commented out for reference:
#for h in xrange(index, 1):
#for h in xrange(index, len(weeks_list)):
for h in xrange(start_index, end_index + 1):
    # Each weeks_list entry is a one-element list holding a ranking-date string.
    week = weeks_list[h][0]
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Player count = number of rank cells in the rankings table for this week.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Rank column (same xpath as the player count above).
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)   # trailing spaces preserved from original

    # Site-relative profile URL of each ranked player.
    player_url_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@href"
    player_url_parsed = xpath_parse(week_tree, player_url_xpath)
    player_url_cleaned = regex_strip_array(player_url_parsed)

    # Week-over-week ranking movement text.
    # (the loop body continues beyond this excerpt)
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    move_parsed = xpath_parse(week_tree, move_xpath)
    move_cleaned = regex_strip_array(move_parsed)
from functions import html_parse_tree, xpath_parse, regex_strip_array

weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)
for row in weeks_cleaned:
    print row