from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

# Collect the ranking-week values from the weeks page and dump them to CSV.
filename = 'weeks'
weeks_url = ""
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"

# Build the parse tree, pull every <li> data-value under the rankDate list,
# then clean the raw values with regex_strip_array.
weeks_tree = html_parse_tree(weeks_url)
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

# Wrap each week in its own single-element list before handing the
# collection to array2csv.
weeks_list = [[entry] for entry in weeks_cleaned]
array2csv(weeks_list, filename)
# Example no. 2
import sys
from functions import html_parse_tree, xpath_parse, synchronous, asynchronous, scrape_match_stats, array2csv, format_spacing

# Command line input: argv[1] is the year, argv[2] the index of the first
# tournament to process. Both stay strings; start_index is converted with
# int() where the loop range is built.
year = sys.argv[1]
start_index = sys.argv[2]

# Setup
year_url = "" + year
url_prefix = ""

# STEP 1: Parse tourney URLs
# The XPath takes the href of the link in the 8th cell of every
# 'tourney-result' row.
year_tree = html_parse_tree(year_url)
tourney_details_url_xpath = "//tr[contains(@class, 'tourney-result')][*]/td[8]/a/@href"
tourney_url_suffixes = xpath_parse(year_tree, tourney_details_url_xpath)
tourney_count = len(tourney_url_suffixes)

# Progress banner. Each print() is called with a single parenthesised
# argument so the statement is valid, and prints the same text, on both
# Python 2 and Python 3. The escape sequences colour the count green.
print('')
print('Collecting match stats data for ' + '\x1b[0;32;40m' +
      str(tourney_count) + '\x1b[0m' + ' tournaments:')
print('')
print('Index    Tourney slug       Matches')
print('-----    ------------       -------')

# Iterate through each tournament, starting at the requested index.
# range() replaces the Python 2-only xrange().
match_stats_data_scrape = []
for i in range(int(start_index), tourney_count):

    # Parse tourney tree
    tourney_url = url_prefix + tourney_url_suffixes[i]
    tourney_tree = html_parse_tree(tourney_url)
# Example no. 3
# Scrape the rankings table for each selected week.
# NOTE(review): `weeks_list`, `start_index` and `end_index` are expected to be
# defined before this point; their setup is not part of the visible lines.
print("Collecting weekly rankings data from " + str(len(weeks_list)) +
      " weeks...")

print("Index    Week")
print("-----    ----")

# The loop covers start_index through end_index inclusive (hence the + 1).
for h in range(start_index, end_index + 1):
    # Each weeks_list entry is indexed with [0] to get the week string that
    # is spliced into the URL; the query asks for ranks 1-3000.
    week = weeks_list[h][0]
    week_url = "" + week + "&rankRange=1-3000"

    week_tree = html_parse_tree(week_url)

    # Player count = number of rank cells found in the mega-table.
    player_count_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    player_count_parsed = xpath_parse(week_tree, player_count_xpath)
    player_count_cleaned = regex_strip_array(player_count_parsed)
    player_count = len(player_count_cleaned)

    # Same XPath as the player count above; here the cleaned cell text itself
    # is kept as the rank values.
    rank_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='rank-cell']/text()"
    rank_parsed = xpath_parse(week_tree, rank_xpath)
    rank_cleaned = regex_strip_array(rank_parsed)

    # Text of the link inside each player cell.
    player_name_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='player-cell']/a/text()"
    player_name_parsed = xpath_parse(week_tree, player_name_xpath)
    player_name_cleaned = regex_strip_array(player_name_parsed)

    # alt attribute of the image in each country cell. The XPath is only
    # defined here; it is not evaluated in the visible lines.
    country_xpath = "//table[@class='mega-table']/tbody/tr/td[@class='country-cell']/div/div/img/@alt"
    # NOTE(review): the next line is a bare tuple expression with no effect;
    # it looks like the tail of a truncated column-name list -- confirm.
    'win_loss_year', 'win_loss'

# Walk the rankings CSV and parse each player's profile page.
# `start` records the wall-clock time before the scrape begins.
start = time.time()
with open('rankings_0_2019-07-01.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    # Consume the first row separately so the loop starts at the second row.
    # NOTE(review): presumably the first row is a header -- confirm.
    row1 = next(readCSV)
    new_row = row1
    for row in readCSV:

        # new_row refers to the same list object as row (no copy is made).
        new_row = row
        # Column 11 supplies the profile URL suffix.
        profile_url = "" + row[11]
        profile_tree = html_parse_tree(profile_url)

        # Thumbnail URL from the page's <meta name="thumbnail"> tag.
        # The original call was cut off after its first argument; it is
        # completed with the XPath defined on the line above.
        player_thumbnail_xpath = "//meta[@name='thumbnail']/@content"
        player_thumbnail_parsed = xpath_parse(profile_tree,
                                              player_thumbnail_xpath)
        # Fall back to a single empty string when no thumbnail tag matched.
        if not player_thumbnail_parsed:
            player_thumbnail_parsed = ['']
        player_thumbnail_cleaned = regex_strip_array(player_thumbnail_parsed)

        # Get profile data
        player_id = row[15]

        player_slug = row[12]
# NOTE(review): this call looks like the tail of a separate script;
# `read_csv` and `csv_file` are not defined in the visible lines -- confirm.
read_csv(weeks_list, csv_file)

from functions import html_parse_tree, xpath_parse, regex_strip_array

# Parse the weeks page and print every cleaned week value, one per line.
weeks_url = ""
weeks_tree = html_parse_tree(weeks_url)
# Selects the data-value attribute of each <li> under the rankDate list.
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)
for row in weeks_cleaned:
    # Single-argument print() is valid on both Python 2 and Python 3; the
    # bare `print row` statement is a SyntaxError on Python 3.
    print(row)