Code example #1
0
# Scrape the list of available ATP ranking week dates and write them to a CSV.
from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)

# Each <li data-value="..."> entry under the rankDate dropdown is one ranking week.
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_cleaned = regex_strip_array(xpath_parse(weeks_tree, weeks_xpath))

# Output to CSV: array2csv takes a 2-D array, so wrap each date in its own row.
weeks_list = [[week] for week in weeks_cleaned]
array2csv(weeks_list, 'weeks')
Code example #2
0
import sys
from functions import html_parse_tree, xpath_parse, synchronous, asynchronous, scrape_match_stats, array2csv, format_spacing

# Command line input
# Usage: <script> <year> <start_index>
# year: season to scrape (e.g. "2017"); start_index: tournament index to resume from.
year = str(sys.argv[1])
start_index = str(sys.argv[2])  # kept as str here; converted with int() at the loop below

# Setup
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
year_tree = html_parse_tree(year_url)
# Column 8 of each results-archive row links to the tournament's detail page.
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Python 2 print statements; the '\x1b[...m' sequences are ANSI colour codes
# (green for the tournament count, then reset).
print ''
print 'Collecting match stats data for ' + '\x1b[0;32;40m' + str(
    tourney_count) + '\x1b[0m' + ' tournaments:'
print ''
print 'Index    Tourney slug       Matches'
print '-----    ------------       -------'

# Iterate through each tournament, resuming at start_index
match_stats_data_scrape = []
for i in xrange(int(start_index), tourney_count):

    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
    # NOTE(review): the loop body continues beyond this excerpt.
Code example #3
0
# Scrape the weekly ATP rankings pages for the selected range of weeks.
# Relies on weeks_list, start_index, end_index and the functions helpers
# being defined earlier in the file.
print("")
print("Collecting weekly rankings data from " + str(len(weeks_list)) +
      " weeks...")

print("")
print("Index    Week")
print("-----    ----")

#for h in xrange(index, 1):
#for h in xrange(index, len(weeks_list)):
for h in range(start_index, end_index + 1):
    # weeks_list rows are single-element lists: [["YYYY-MM-DD"], ...]
    week = weeks_list[h][0]
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Player count = number of rank-cell entries in the rankings table.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # NOTE(review): identical xpath to player_count_xpath above — the same
    # cells are parsed twice; player_count_cleaned could be reused here.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/text()"
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # Country code comes from the flag image's alt attribute.
    country_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='country-cell']/div/div/img/@alt"
    # NOTE(review): the loop body continues beyond this excerpt.
Code example #4
0
import sys
from functions import html_parse_tree, xpath_parse, synchronous, asynchronous, scrape_match_stats, array2csv, format_spacing

# Command line input
# Usage: <script> <year> <start_index>
# year: season to scrape (e.g. "2017"); start_index: tournament index to resume from.
year = str(sys.argv[1])
start_index = str(sys.argv[2])  # kept as str here; converted with int() at the loop below

# Setup
year_url = "http://www.atpworldtour.com/en/scores/results-archive?year=" + year
url_prefix = "http://www.atpworldtour.com"

# STEP 1: Parse tourney URLs
year_tree = html_parse_tree(year_url)
# Column 8 of each results-archive row links to the tournament's detail page.
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Python 2 print statements; the '\x1b[...m' sequences are ANSI colour codes
# (green for the tournament count, then reset).
print ''
print 'Collecting match stats data for ' + '\x1b[0;32;40m' + str(tourney_count) + '\x1b[0m' + ' tournaments:'
print ''
print 'Index    Tourney slug       Matches'
print '-----    ------------       -------'

# Iterate through each tournament, resuming at start_index
match_stats_data_scrape = []
for i in xrange(int(start_index), tourney_count):

    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
    # NOTE(review): the loop body continues beyond this excerpt.
Code example #5
0
    # Tail of a header-row list literal that starts above this excerpt.
    'win_loss_year', 'win_loss'
]]

start = time.time()
# Read the rankings CSV and append a 'profile_picture' column to every row,
# accumulating the widened rows into new_rows (defined above this excerpt).
with open('rankings_0_2019-07-01.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    row1 = next(readCSV)  # header row
    new_row = row1
    new_row.append('profile_picture')
    new_rows.append(new_row)
    for row in readCSV:
        print(row[5])  # progress output; presumably a player-identifying column — TODO confirm

        new_row = row  # NOTE: aliases row — append below mutates the original list, not a copy
        profile_url = "http://www.atptour.com" + row[11]
        profile_tree = html_parse_tree(profile_url)

        # NOTE(review): 'thumbnaul' is a typo, but it is used consistently below.
        player_thumbnaul_xpath = "//meta[@name='thumbnail']/@content"
        player_thumbnail_parsed = xpath_parse(profile_tree,
                                              player_thumbnaul_xpath)
        if not player_thumbnail_parsed:
            # No thumbnail meta tag on the profile page: fall back to an empty URL.
            player_thumbnail_parsed = ['']
        player_thumbnail_cleaned = regex_strip_array(player_thumbnail_parsed)
        new_row.append(player_thumbnail_cleaned[0])
        new_rows.append(new_row)

        # Get profile data
        player_id = row[15]

        player_slug = row[12]
        # NOTE(review): the loop body continues beyond this excerpt.
Code example #6
0
# Load the previously scraped ranking week dates into weeks_list.
# (read_csv, weeks_list, csv_file, start_index and end_index are defined
# above this excerpt.)
read_csv(weeks_list, csv_file)

print ""
print "Collecting weekly rankings data from " + str(len(weeks_list)) + " weeks..."

print ""
print "Index    Week"
print "-----    ----"

#for h in xrange(index, 1):
#for h in xrange(index, len(weeks_list)):
for h in xrange(start_index, end_index + 1):
    # weeks_list rows are single-element lists: [["YYYY-MM-DD"], ...]
    week = weeks_list[h][0]
    week_url = "http://www.atpworldtour.com/en/rankings/singles?rankDate=" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Player count = number of rank-cell entries in the rankings table.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # NOTE(review): identical xpath to player_count_xpath above — the same
    # cells are parsed twice; player_count_cleaned could be reused here.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    player_url_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/@href"
    player_url_parsed = xpath_parse(week_tree, player_url_xpath)
    player_url_cleaned = regex_strip_array(player_url_parsed)

    # Ranking movement since the previous week, taken from the move-cell text.
    move_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='move-cell']/div[@class='move-text']/text()"
    # NOTE(review): the loop body continues beyond this excerpt.
Code example #7
0
# Scrape the list of available ATP ranking week dates and print one per line.
from functions import html_parse_tree, xpath_parse, regex_strip_array

weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)

# Each <li data-value="..."> entry under the rankDate dropdown is one ranking week.
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)
for row in weeks_cleaned:
    # Function-call form: single-argument print(row) produces identical output
    # under Python 2 and is valid Python 3, matching the Py3 examples in this file.
    print(row)