import sys

from functions import array2csv, tournaments

# Year range to scrape, taken from the command line:
#   argv[1] = first year, argv[2] = last year (inclusive).
start_year = str(sys.argv[1])
end_year = str(sys.argv[2])

# Scrape tournament data for every year in the range, echoing a header first.
print('')
print('Year Tournaments')
print('---- -----------')

tourney_data = []
for year_number in range(int(start_year), int(end_year) + 1):
    year = str(year_number)
    tourney_data += tournaments(year)

# Write the accumulated rows to a single CSV covering the whole range.
filename = 'tournaments_' + start_year + '-' + end_year
array2csv(tourney_data, filename)
from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

# Pull the list of available ranking dates from the ATP singles rankings page:
# the values live in the 'rankDate' dropdown's <li data-value="..."> entries.
weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_tree = html_parse_tree(weeks_url)
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

# One single-column row per ranking week, ready for CSV output.
weeks_list = []
for week in weeks_cleaned:
    weeks_list.append([week])

# Output to CSV
filename = 'weeks'
array2csv(weeks_list, filename)
# Match stats URL XPath match_stats_url_xpath = tourney_match_count_xpath = "//table[contains(@class, 'day-table')]/tbody[*]/tr[*]/td[contains(@class, 'day-table-score')]/a/@href" match_stats_url_cleaned = xpath_parse(tourney_tree, match_stats_url_xpath) # Filter problematic URL's match_stats_url_suffixes = [] for foo in match_stats_url_cleaned: if foo.find('//') == -1: match_stats_url_suffixes.append(foo) # STEP 2: Parse match stats if len(match_stats_url_suffixes) > 0: # Parse match stats asynchronously match_stats_data_scrape += asynchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug) # Parse match stats synchronously #match_stats_data_scrape += synchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug) else: spacing1 = format_spacing(5, tourney_index) spacing2 = format_spacing(15, tourney_slug) print tourney_index + spacing1 + ' ' + tourney_slug + spacing2 + ' Match stats URL problems' # STEP 3: Output to CSV filename = "match_stats_" + year + "_" + start_index array2csv(match_stats_data_scrape, filename)
# NOTE(review): loop-interior fragment — `tourney_url_suffixes`, `i`,
# `tourney_tree`, `tourney_index`, `year`, `start_index`,
# `match_stats_data_scrape` and the helpers come from outside this view;
# indentation reconstructed.

# Tournament slug is the 5th path segment of the tourney URL suffix
# (e.g. /en/scores/archive/<slug>/...). TODO confirm against a sample URL.
tourney_url_suffix_split = tourney_url_suffixes[i].split('/')
tourney_slug = tourney_url_suffix_split[4]

# Match stats URL XPath
# NOTE(review): chained assignment also rebinds tourney_match_count_xpath —
# looks like a copy-paste leftover.
match_stats_url_xpath = tourney_match_count_xpath = "//table[contains(@class, 'day-table')]/tbody[*]/tr[*]/td[contains(@class, 'day-table-score')]/a/@href"
match_stats_url_cleaned = xpath_parse(tourney_tree, match_stats_url_xpath)

# Filter problematic URL's
# Keep only hrefs that do NOT contain '//'.
match_stats_url_suffixes = []
for foo in match_stats_url_cleaned:
    if foo.find('//') == -1:
        match_stats_url_suffixes.append(foo)

# STEP 2: Parse match stats
if len(match_stats_url_suffixes) > 0:
    # Parse match stats asynchronously
    match_stats_data_scrape += asynchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug)
    # Parse match stats synchronously
    #match_stats_data_scrape += synchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug)
else:
    # No usable URLs for this tournament — log a fixed-width warning line.
    spacing1 = format_spacing(5, tourney_index)
    spacing2 = format_spacing(15, tourney_slug)
    print tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    ' + 'Match stats URL problems'

# STEP 3: Output to CSV
filename = "match_stats_" + year + "_" + start_index
array2csv(match_stats_data_scrape, filename)
elif len(move_down_parsed) > 0: move_direction = 'down' else: move_direction = '' age = age_cleaned[i] points = int(points_parsed[i].replace(',', '')) tourneys = tourneys_parsed[i] try: name_text.encode('ascii') except UnicodeEncodeError: name_text = unidecode.unidecode(name_text) data = [ week_title, week_year, week_month, week_day, rank_text, rank_number, move, move_direction, age, points, tourneys, player_url, player_slug, name_text, country_text, player_id ] rankings.append(data) filename = 'rankings_' + str(h) + '_' + week """" with codecs.open(filename + '2.csv', 'w', encoding='utf8') as f: writer = csv.writer(f, delimiter=',') for row in rankings: writer.writerow(row) """ array2csv(rankings, filename) print(str(h) + " " + week)
for i in xrange(0 , len(tourney_urls_scrape)): if len(tourney_urls_scrape[i]) > 0: # STEP 2: Scrape tournament page match_data_scrape = [] match_urls_scrape = [] scrape_tourney_output = scrape_tourney(tourney_urls_scrape[i]) match_data_scrape = scrape_tourney_output[0] match_urls_scrape = scrape_tourney_output[1] #match_counter += len(match_data_scrape) # STEP 3: tourney_data + match_data for match in match_data_scrape: foo = tourney_data_scrape[i] + match tourney_match.append(foo) spacing_count1 = len('Order') - len(str(tourney_data_scrape[i][1])) spacing1 = '' for j in xrange(0, spacing_count1): spacing1 += ' ' spacing_count2 = 41 - len(tourney_data_scrape[i][2]) spacing2 = '' for j in xrange(0, spacing_count2): spacing2 += ' ' print year + ' ' + str(tourney_data_scrape[i][1]) + spacing1 + ' ' + tourney_data_scrape[i][2] + spacing2 + ' ' + str(len(match_data_scrape)) filename = "match_scores_" + start_year + "-" + end_year array2csv(tourney_match, filename)
    # NOTE(review): tail of a list passed to an .append(...) call whose start
    # is outside this view; every name here is defined earlier in the file.
    # Column order below defines the profiles CSV schema — do not reorder.
    checkIfEmptyReturnFirst(first_name_cleaned),
    checkIfEmptyReturnFirst(last_name_cleaned),
    rank,
    player_url,
    profile_picture,
    flag_code,
    checkIfEmptyReturnFirst(residence_cleaned),
    checkIfEmptyReturnFirst(birthplace_cleaned),
    checkIfEmptyReturnFirst(birthdate_cleaned),
    birth_year,
    birth_month,
    birth_day,
    checkIfEmptyReturnFirst(turned_pro_cleaned),
    checkIfEmptyReturnFirst(weight_lbs_cleaned),
    weight_kg_cleaned_extracted,
    height_ft_cleaned_extracted,
    height_inches,
    height_cm_cleaned_extracted,
    handedness_cleaned_extracted,
    backhand,
    checkIfEmptyReturnFirst(coach_cleaned),
    checkIfEmptyReturnFirst(career_high_cleaned),
    career_high_date_cleaned_extracted,
    checkIfEmptyReturnFirst(prize_money_year_cleaned),
    checkIfEmptyReturnFirst(prize_money_cleaned),
    checkIfEmptyReturnFirst(titles_year_cleaned),
    checkIfEmptyReturnFirst(titles_cleaned),
    checkIfEmptyReturnFirst(win_loss_year_cleaned),
    checkIfEmptyReturnFirst(win_loss_cleaned)
])

# Write outputs: rankings rows, then profiles via the UTF-8 CSV writer
# (the plain array2csv call for profiles is intentionally disabled).
array2csv(new_rows, 'rankings_0_2019-07-01')
# array2csv(profiles, 'profiles')
array2csv8utf(profiles, 'profiles')

# Report total wall-clock runtime; `start` is set near the top of the file.
end = time.time()
print(end - start)
import sys

from functions import tournaments, array2csv

# Command line input: argv[1] = first year, argv[2] = last year (inclusive).
start_year = str(sys.argv[1])
end_year = str(sys.argv[2])

# Iterate through the years and scrape tourney data.
# FIX: converted Python-2-only `print` statements and `xrange` to the
# `print(...)` / `range` forms used by the identical script at the top of
# this file — output is unchanged and the script now runs under Python 3.
print('')
print('Year Tournaments')
print('---- -----------')

tourney_data = []
for h in range(int(start_year), int(end_year) + 1):
    year = str(h)
    tourney_data += tournaments(year)

# Output to CSV
filename = 'tournaments_' + start_year + '-' + end_year
array2csv(tourney_data, filename)
# NOTE(review): Python 2 loop-interior fragment — `week`, `i`, `week_tree`,
# `move_cleaned`, `age_cleaned`, `rank_text`, `rankings`, `h` and the helpers
# are defined outside this view; indentation reconstructed.

# Ranking week comes in as 'YYYY-MM-DD'; split it into numeric parts and a
# dotted 'YYYY.MM.DD' display title.
week_split = week.split('-')
week_year = int(week_split[0])
week_month = int(week_split[1])
week_day = int(week_split[2])
week_title = week.replace('-','.')

move = move_cleaned[i]
# Direction of the rank move: the row's move-cell contains either a
# 'move-up' or a 'move-down' div (or neither for no change).
move_up_xpath = "//table[@class='mega-table']/tbody/tr[" + str(i + 1) + "]/td[@class='move-cell']/div[@class='move-up']"
move_up_parsed = xpath_parse(week_tree, move_up_xpath)
move_down_xpath = "//table[@class='mega-table']/tbody/tr[" + str(i + 1) + "]/td[@class='move-cell']/div[@class='move-down']"
move_down_parsed = xpath_parse(week_tree, move_down_xpath)
if len(move_up_parsed) > 0:
    move_direction = 'up'
elif len(move_down_parsed) > 0:
    move_direction = 'down'
else:
    move_direction = ''

age = age_cleaned[i]
# Points are formatted with thousands separators on the page, e.g. "1,234".
points = int(points_parsed[i].replace(',', ''))
tourneys = tourneys_parsed[i]

# One output row per player per ranking week.
data = [week_title, week_year, week_month, week_day, rank_text, rank_number, move, move_direction, age, points, tourneys, player_url, player_slug, player_id]
rankings.append(data)

# Write this week's rankings and log progress.
filename = 'rankings_' + str(h) + '_' + week
array2csv(rankings, filename)
print str(h) + " " + week