def __init__(self, tournament_db):
    """Uploader for strokes-gained (SG) collection objects into MongoDB.

    :param tournament_db: handle to the MongoDB tournament database
    """
    self._tournament_db = tournament_db
    # running tallies: successful uploads vs. documents attempted
    self._sg_stats_upload = 0
    self._sg_stats_overall = 0
    self._logger = MyLogger('MongoDB SG',
                            'tournaments/SG/logs/sg_mongodb.log',
                            logging.INFO).getLogger()
def __init__(self, mongo_obj, tournament_df, raw_sg_df, distances_df=None):
    """Handler around the strokes-gained DataFrames.

    :param mongo_obj: MongoDB initialization/connection object
    :param tournament_df: tournament-level DataFrame
    :param raw_sg_df: raw strokes-gained DataFrame
    :param distances_df: optional DataFrame of distances
    """
    self._mongo_obj = mongo_obj
    self._tournament_df = tournament_df
    self._raw_sg_df = raw_sg_df
    self._distances_df = distances_df
    # per-category SG DataFrames, filled in later
    self._sg_df_dict = {}
    self._logger = MyLogger('sgHandler', 'Analysis/logs/sgHandler.log',
                            logging.INFO).getLogger()
class MongoInitialization:
    """Connects to MongoDB and ensures the collections (with their unique
    indexes) required by the calling component exist."""

    def __init__(self, called_from):
        """For connecting and set up to MongoDB

        :param called_from: one of 'scraper', 'sg' or 'df'; selects which
            collections are created
        """
        self.connection_str = '{}'.format(MY_MONGO_DB_KEY)
        self._logger = MyLogger('MongoDB', 'MongoDB/logs/mongodb.log',
                                logging.INFO).getLogger()
        self._logger.info('Connecting to MongoDB...\n')
        self._client = pymongo.MongoClient(self.connection_str)
        self._tournament_db = self._client.tournament_db
        self._logger.info('Client description {}\n'.format(self._client))
        self._logger.info('Tournament DB description {}\n'.format(
            self._tournament_db))
        # list_collection_names() replaces collection_names(), which was
        # deprecated in pymongo 3.7 and removed in pymongo 4.0
        col_names = self._tournament_db.list_collection_names()
        self._logger.info(
            'TournamentDB has the following collections {}\n'.format(
                col_names))
        if called_from == 'scraper':
            self._createCollection('tournament_detail', [('tournamentID', 1),
                                                         ('pgaYear', -1)])
            self._createCollection('player_metadata', [('playerID', 1)])
            self._createCollection('player_round', [('playerID', 1),
                                                    ('tournamentID', 1),
                                                    ('pgaYear', -1),
                                                    ('roundNumber', 1)])
            self._createCollection('course_metadata', [('courseID', 1),
                                                       ('tournamentID', 1),
                                                       ('pgaYear', -1)])
            self._createCollection('tournament_scrape_status',
                                   [('tournamentName', 1), ('pgaYear', -1)])
        elif called_from == 'sg':
            self._createCollection('sg_stats', [('tournamentName', 1),
                                                ('pgaYear', -1),
                                                ('playerName', 1)])
        elif called_from == 'df':
            self._createCollection('tournament_df', [('tournamentName', 1),
                                                     ('courseID', 1),
                                                     ('pgaYear', -1),
                                                     ('roundNum', 1)])
            self._createCollection('raw_sg_df', [('tournamentName', 1),
                                                 ('pgaYear', -1)])

    def _createCollection(self, collection_name, index_dict):
        """Create collection_name with a unique index (index_dict is a list of
        (field, direction) pairs) if it does not already exist."""
        if collection_name not in self._tournament_db.list_collection_names():
            new_col = self._tournament_db[collection_name]
            idx = new_col.create_index(index_dict, unique=True)
            self._logger.info('Created {} Collection with index {}\n'.format(
                collection_name, idx))

    def __repr__(self):
        return 'MongoDB Client is {}\nTournament DB is {}\n'.format(
            self._client, self._tournament_db)

    def getTournamentDB(self):
        """Return the tournament database handle."""
        return self._tournament_db

    def getLogger(self):
        """Return the shared logger."""
        return self._logger
def __init__(self, tournament_db, tournament_name):
    """Uploader for tournament DataFrames into MongoDB.

    :param tournament_db: handle to the MongoDB tournament database
    :param tournament_name: name of the tournament being uploaded
    """
    self._name = tournament_name
    self._tournament_db = tournament_db
    # flipped to True once the corresponding upload succeeds
    self._tournament_df_upload = False
    self._raw_sg_df_upload = False
    self._logger = MyLogger(
        'MongoDB Tournament DF {}'.format(self._name),
        'tournaments/DFs/{}/logs/tournament_mongodb.log'.format(self._name),
        logging.INFO).getLogger()
def __init__(self):
    """Set up the SG scraper: target URL, result holder, log file, logger
    and web driver."""
    # page holding the historic strokes-gained event data
    self._sg_url = 'https://datagolf.com/historic-event-data'
    # scraped SG rows are collected here
    self._tournament_sg_col = []
    # all I/O done in tournaments/'pga_year'_'tournament_name' directory
    self._file_handler = 'tournaments/SG/logs/sg_scape.log'
    self._logger = MyLogger(self.__class__.__name__, self._file_handler,
                            logging.INFO, 'w').getLogger()
    self.web_driver = WebDriver(self._logger)
    self.year_options = None
def __init__(self, tournament_db, tournament_year, tournament_name):
    """Uploader for tournament scrape collections into MongoDB.

    :param tournament_db: handle to the MongoDB tournament database
    :param tournament_year: PGA year of the tournament
    :param tournament_name: name of the tournament
    """
    self._tournament_db = tournament_db
    self._year = tournament_year
    self._name = tournament_name
    # boolean flags for one-shot uploads, counters for per-document uploads
    self._tournament_detail_upload = False
    self._player_metadata_upload = 0
    self._player_metadata_overall = 0
    self._player_round_upload = 0
    self._player_round_overall = 0
    self._course_metadata_upload = 0
    self._course_metadata_overall = 0
    self._tournament_scrape_status_upload = False
    self._sg_stats_upload = False
    self._logger = MyLogger(
        'MongoDB {} {}'.format(self._year, self._name),
        'tournaments/{}_{}/logs/tournament_mongodb.log'.format(
            self._year, self._name), logging.INFO).getLogger()
class MongoUploadSG:
    """Uploads strokes-gained documents into the sg_stats collection,
    upserting on (playerName, tournamentName, pgaYear)."""

    def __init__(self, tournament_db):
        """For uploading SG collection objects to MongoDB"""
        self._tournament_db = tournament_db
        # tallies of successful vs. attempted SG uploads
        self._sg_stats_upload = 0
        self._sg_stats_overall = 0
        self._logger = MyLogger('MongoDB SG',
                                'tournaments/SG/logs/sg_mongodb.log',
                                logging.INFO).getLogger()

    def __repr__(self):
        return 'MongoDB SG Upload Status: {}'.format(self._getUploadStatus())

    def uploadSGStats(self, sg_stats_list):
        """Upsert every SG stats document in sg_stats_list."""
        for sg_stats in sg_stats_list:
            self._sg_stats_overall += 1
            # natural key for one SG stats document
            key = {
                'playerName': sg_stats['playerName'],
                'tournamentName': sg_stats['tournamentName'],
                'pgaYear': sg_stats['pgaYear']
            }
            result = self._tournament_db.sg_stats.replace_one(
                key, sg_stats, upsert=True)
            if result is None:
                continue
            if result.upserted_id is not None:
                self._logger.info(
                    'Inserted SG stats into collection with id {}\n'.format(
                        result.upserted_id))
            else:
                self._logger.info(
                    'Updated existing sg stats with key {}\n'.format(key))
            self._sg_stats_upload += 1

    def _getUploadStatus(self):
        """Summarize how many SG documents were uploaded."""
        return 'SG Stats Uploaded: {} of {} possible\n'.format(
            self._sg_stats_upload, self._sg_stats_overall)
def __init__(self, pga_tournament, pga_year, driver=None):
    """Set up the tournament scraper: identifiers, URL, result holders,
    wire-request templates, logger and web driver.

    :param pga_tournament: tournament name used in pgatour.com URLs
    :param pga_year: PGA year as a string
    :param driver: optional pre-built WebDriver to reuse
    """
    self._pga_tournament = pga_tournament
    self._pga_year = pga_year
    self._tournament_url = 'https://www.pgatour.com/competition/' + pga_year + \
                           '/' + pga_tournament + '/leaderboard.html'
    self.successfully_scraped = 0
    self.tournament_id = None
    # create place holder dictionaries for data once scraped
    self._course_ids = set()
    self._tournament_info_dict = {}
    self._player_meta_dict = {}
    self._course_general_dict = {}
    self._course_meta_dict = {}
    self._player_round_dict = {}
    self._unsuccessful_player_round_scrape = {}
    self._course_requests = {}
    self._row_dict = {}
    # use this default dictionary as template for wire requests
    self.template_wire_html_dict = {
        'tournament_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/leaderboard.json',
        'course_general': 'https://statdata.pgatour.com/r/TOURNAMENT_ID/course.json',
        'course_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/courseC_ID',
        'round_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/drawer/rROUND_NUM-mMAIN_PLAYER_ID'
    }
    # all I/O done in tournaments/'pga_year'_'tournament_name' directory
    self.dir = 'tournaments/' + self._pga_year + '_' + self._pga_tournament + '/'
    self._file_handler = self.dir + 'logs/tournament_scape.log'
    # initialize logger
    self._logger = MyLogger(
        self.__class__.__name__ + ' ' + self._pga_year + ' ' + self._pga_tournament,
        self._file_handler, logging.INFO, 'a').getLogger()
    # initialize driver (reuse the caller's if given)
    if driver is None:
        self.web_driver = WebDriver(self._logger)
    else:
        self.web_driver = driver
    self.web_driver.updateLogLocations(
        ' ' + self._pga_year + ' ' + self._pga_tournament, self._file_handler)
def __init__(self, mongo_obj, tournament_name_scrape, tournament_name_sg,
             force_create_sg=False, force_create_tournament=False):
    """Load (or rebuild) the tournament and raw strokes-gained DataFrames.

    :param mongo_obj: MongoDB initialization/connection object
    :param tournament_name_scrape: tournament name as used by the scraper
    :param tournament_name_sg: tournament name as used by the SG source
    :param force_create_sg: rebuild the raw SG DataFrame even if stored
    :param force_create_tournament: rebuild the tournament DataFrame even if stored
    """
    self._logger = MyLogger('dfHandler', 'Analysis/logs/dfHandler.log',
                            logging.INFO).getLogger()
    self._tournament_name = tournament_name_scrape
    self._mongo_obj = mongo_obj
    downloader = MongoDownload(self._mongo_obj)
    self._mongo_upload_df = MongoUploadDF(self._mongo_obj.getTournamentDB(),
                                          self._tournament_name)
    # raw SG DataFrame: reuse the stored copy unless empty or forced
    self._raw_sg_df = pd.DataFrame(downloader.getRawSG_DF(tournament_name_scrape))
    if self._raw_sg_df.empty or force_create_sg:
        self._logger.info('Creating New Raw SG DF')
        self._raw_sg_df = pd.DataFrame()
        self._createRawSG_DF(
            downloader.getSGStatsForTournament(tournament_name_scrape,
                                               tournament_name_sg),
            downloader.getPlayerNames())
    # tournament DataFrame: reuse the stored copy unless empty or forced
    self._tournament_df = pd.DataFrame(
        downloader.getTournamentDF(tournament_name_scrape))
    if self._tournament_df.empty or force_create_tournament:
        self._logger.info('Creating New Tournament DF')
        self._tournament_df = pd.DataFrame()
        self._createTournamentDF(
            downloader.consolidateTournamentInfo(tournament_name_scrape),
            downloader.getPlayerNames())
def __init__(self, called_from):
    """For connecting and set up to MongoDB

    :param called_from: one of 'scraper', 'sg' or 'df'; selects which
        collections are created
    """
    self.connection_str = '{}'.format(MY_MONGO_DB_KEY)
    self._logger = MyLogger('MongoDB', 'MongoDB/logs/mongodb.log',
                            logging.INFO).getLogger()
    self._logger.info('Connecting to MongoDB...\n')
    self._client = pymongo.MongoClient(self.connection_str)
    self._tournament_db = self._client.tournament_db
    self._logger.info('Client description {}\n'.format(self._client))
    self._logger.info('Tournament DB description {}\n'.format(
        self._tournament_db))
    # list_collection_names() replaces collection_names(), which was
    # deprecated in pymongo 3.7 and removed in pymongo 4.0
    col_names = self._tournament_db.list_collection_names()
    self._logger.info(
        'TournamentDB has the following collections {}\n'.format(
            col_names))
    if called_from == 'scraper':
        self._createCollection('tournament_detail', [('tournamentID', 1),
                                                     ('pgaYear', -1)])
        self._createCollection('player_metadata', [('playerID', 1)])
        self._createCollection('player_round', [('playerID', 1),
                                                ('tournamentID', 1),
                                                ('pgaYear', -1),
                                                ('roundNumber', 1)])
        self._createCollection('course_metadata', [('courseID', 1),
                                                   ('tournamentID', 1),
                                                   ('pgaYear', -1)])
        self._createCollection('tournament_scrape_status',
                               [('tournamentName', 1), ('pgaYear', -1)])
    elif called_from == 'sg':
        self._createCollection('sg_stats', [('tournamentName', 1),
                                            ('pgaYear', -1),
                                            ('playerName', 1)])
    elif called_from == 'df':
        self._createCollection('tournament_df', [('tournamentName', 1),
                                                 ('courseID', 1),
                                                 ('pgaYear', -1),
                                                 ('roundNum', 1)])
        self._createCollection('raw_sg_df', [('tournamentName', 1),
                                             ('pgaYear', -1)])
class TournamentScraper:
    """Given a tournament and year, this scrapes pgatour.com tournament result page
    to create json files containing data on tournament info and player
    course_hole by course_hole shots"""

    def __init__(self, pga_tournament, pga_year, driver=None):
        """Initialize scraper with tournament, year, optional logger name, wire requests dict,
        web driver

        :param pga_tournament: tournament name used in pgatour.com URLs
        :param pga_year: PGA year as a string
        :param driver: optional pre-built WebDriver to reuse
        """
        self._pga_tournament = pga_tournament
        self._pga_year = pga_year
        self._tournament_url = 'https://www.pgatour.com/competition/' + pga_year + '/' + pga_tournament + \
                               '/leaderboard.html'
        self.successfully_scraped = 0
        self.tournament_id = None
        # create place holder dictionaries for data once scraped
        self._course_ids = set()
        self._tournament_info_dict = {}
        self._player_meta_dict = {}
        self._course_general_dict = {}
        self._course_meta_dict = {}
        self._player_round_dict = {}
        self._unsuccessful_player_round_scrape = {}
        self._course_requests = {}
        self._row_dict = {}
        # use this default dictionary as template for wire requests
        self.template_wire_html_dict = {
            'tournament_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/leaderboard.json',
            'course_general': 'https://statdata.pgatour.com/r/TOURNAMENT_ID/course.json',
            'course_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/courseC_ID',
            'round_detail': 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/drawer/rROUND_NUM-mMAIN_PLAYER_ID'
        }
        # all I/O done in tournaments/'pga_year'_'tournament_name' directory
        self.dir = 'tournaments/' + self._pga_year + '_' + self._pga_tournament + '/'
        self._file_handler = self.dir + 'logs/tournament_scape.log'
        # initialize logger
        self._logger = MyLogger(self.__class__.__name__ + ' ' + self._pga_year + ' ' + self._pga_tournament,
                                self._file_handler, logging.INFO, 'a').getLogger()
        # initialize driver
        if driver is None:
            self.web_driver = WebDriver(self._logger)
        else:
            self.web_driver = driver
        self.web_driver.updateLogLocations(' ' + self._pga_year + ' ' + self._pga_tournament,
                                           self._file_handler)

    def __repr__(self):
        """Print Scraper Class with year, tournament and scraped status"""
        return (self.__class__.__name__ + ' ' + self._pga_year + ' ' + self._pga_tournament +
                '\nScrape Status: Scraped {:.2f}% of potential data'.format(self.successfully_scraped))

    def _scrapeTournamentJSON(self, tournament_detail_json):
        """Insert into dictionaries from the detailed tournament info JSON"""
        # make sure pga years match
        if self._pga_year != findKeyInJSON(tournament_detail_json, 'year'):
            self._logger.warning('Error: Non-matching PGA years. User Input {}; JSON {}'
                                 .format(self._pga_year, findKeyInJSON(tournament_detail_json, 'year')))
        # cut line data
        cut_line_info = findKeyInJSON(tournament_detail_json, 'cutLines')
        cut_dict = {'cuts': []}
        for i, cut in enumerate(cut_line_info, start=1):
            cut_dict['cuts'].append({
                'cutNumber': i,
                'cutCount': cut['cut_count'],
                'cutScore': cut['cut_line_score'],
                'cutPaidCount': cut['paid_players_making_cut']
            })
        self._tournament_info_dict.update(cut_dict)
        # all other tournament data
        self._tournament_info_dict.update({
            'tournamentID': self.tournament_id,
            'tournamentName': self._pga_tournament,
            'multiCourse': findKeyInJSON(tournament_detail_json, 'multiCourse'),
            'totalRounds': findKeyInJSON(tournament_detail_json, 'totalRounds'),
            'format': findKeyInJSON(tournament_detail_json, 'format'),
            'pgaYear': findKeyInJSON(tournament_detail_json, 'year'),
            'status': findKeyInJSON(tournament_detail_json, 'roundState'),
            'playoff': findKeyInJSON(tournament_detail_json, 'playoffPresent'),
            'dates': self.web_driver.findElementByXPath('.//span[@class = "dates"]'),
            'location': self.web_driver.findElementByXPath('.//span[@class = "name"]')
        })
        # create player name dictionary
        player_rows = findKeyInJSON(tournament_detail_json, 'rows')
        for row in player_rows:
            self._player_meta_dict[row['playerId']] = {}
            self._player_meta_dict[row['playerId']]['firstName'] = row['playerNames']['firstName']
            self._player_meta_dict[row['playerId']]['lastName'] = row['playerNames']['lastName']

    def _scrapeCourseGeneral(self, course_general_json):
        """Insert into dictionaries from the general course information JSON"""
        for course_desc in course_general_json['courses']:
            course_id = findKeyInJSON(course_desc, 'number')
            self._course_general_dict[course_id] = {
                'description': findKeyInJSON(course_desc, 'body'),
                'name': findKeyInJSON(course_desc, 'name'),
                'totalYards': findKeyInJSON(course_desc, 'yards')
            }

    def _scrapePlayerDetail(self, main_player_id, round_num, round_detail_json):
        """Insert into dictionaries the data from the player round detail JSON"""
        if main_player_id in self._player_round_dict and round_num in self._player_round_dict[main_player_id]:
            self._logger.info(
                'Previously downloaded JSON for round {} from player ID {}'.format(round_num, main_player_id))
            return
        self._logger.info('Downloading JSON from round {} for player ID {}'.format(round_num, main_player_id))
        course_id = findKeyInJSON(round_detail_json, 'courseId')
        # only add if course hasn't been added to course ids yet
        if course_id not in self._course_ids:
            # add course to wire requests
            self._course_requests[course_id] = self.template_wire_html_dict['course_detail'] \
                .replace('PGA_YEAR', self._pga_year) \
                .replace('TOURNAMENT_ID', self.tournament_id) \
                .replace('C_ID', course_id)
            self._course_ids.add(course_id)
        play_by_play = findKeyInJSON(round_detail_json, 'playersHoles')
        player_hole_dict = {}
        # get shot level data
        for hole in play_by_play:
            hole_id = hole['holeId']
            for player in hole['players']:
                player_id = player['playerId']
                if player_id not in player_hole_dict:
                    player_hole_dict[player_id] = {}
                player_hole_dict[player_id][hole_id] = player['shots']
        # check to see if main player id is indeed contained in json data
        if main_player_id not in player_hole_dict:
            self._logger.warning('Main Player ID is {}, player IDs in JSON File {}'.format(
                main_player_id, player_hole_dict.keys()))
        # assign shot data and create metadata for round
        for player_id in player_hole_dict.keys():
            if player_id not in self._player_round_dict:
                self._player_round_dict[player_id] = {}
            if round_num not in self._player_round_dict[player_id]:
                self._player_round_dict[player_id][round_num] = {}
            self._player_round_dict[player_id][round_num]['play-by-play'] = player_hole_dict[player_id]
            self._player_round_dict[player_id][round_num]['metadata'] = {
                'completedRound': findKeyInJSON(round_detail_json, 'roundComplete'),
                'groupId': findKeyInJSON(round_detail_json, 'groupId'),
                'startingHoleId': findKeyInJSON(round_detail_json, 'startingHoleId'),
                'courseId': findKeyInJSON(round_detail_json, 'courseId'),
                'playedWith': [other_id for other_id in player_hole_dict.keys() if other_id != player_id]
            }
        # round retrieved successfully, so drop it from the retry book-keeping
        self._unsuccessful_player_round_scrape.pop(' '.join([main_player_id, round_num]), None)

    def _scrapeCourseDetail(self, c_id, course_detail_json):
        """Insert into dictionaries from the course detail JSON"""
        self._logger.info('Downloading JSON for course {}'.format(c_id))
        course_id = findKeyInJSON(course_detail_json, 'courseId')
        # check if this is a mismatch from c_id
        if c_id != course_id:
            self._logger.warning(
                'Course ID {} from course detail JSON mismatches the player round course ID {}'.format(course_id,
                                                                                                       c_id))
        # check if course exists from earlier general json scrape
        if len(self._course_ids) > 0 and course_id not in self._course_ids:
            self._logger.warning(
                'Course ID {} came through the wire but did not exist in the general course JSON'.format(course_id))
            self._course_ids.add(course_id)
        hole_detail_dict = {}
        # course_hole by course_hole data
        for hole in findKeyInJSON(course_detail_json, 'holes'):
            round_info = {'rounds': []}
            for round_details in hole['rounds']:
                round_detail = {
                    'round_Id': round_details['roundId'],
                    'distance': round_details['distance'],
                    'par': round_details['par'],
                    'stimp': round_details.get('stimp')
                }
                round_info['rounds'].append(round_detail)
            hole_detail_dict[hole['holeId']] = round_info
        # add metadata
        self._course_meta_dict[course_id] = {
            'courseCode': findKeyInJSON(course_detail_json, 'courseCode'),
            'parIn': findKeyInJSON(course_detail_json, 'parIn'),
            'parOut': findKeyInJSON(course_detail_json, 'parOut'),
            'parTotal': findKeyInJSON(course_detail_json, 'parTotal'),
            'holes': hole_detail_dict
        }
        # add data from course general dict if exists
        if course_id in self._course_general_dict:
            self._course_meta_dict[course_id].update(self._course_general_dict[course_id])

    def _getTournamentJSON(self, req_str):
        """Get tournament details from the JSON request string, rerun scrape if this isn't working"""
        tournament_detail_json = self.web_driver.wireRequestToJSON(req_str)
        if tournament_detail_json:
            self._scrapeTournamentJSON(tournament_detail_json)
            return True
        else:
            return False

    def _getCourseGeneralJSON(self, req_str):
        """Get course general details from the JSON request string"""
        course_general_json = self.web_driver.wireRequestToJSON(req_str)
        if course_general_json:
            self._scrapeCourseGeneral(course_general_json)

    def _getPlayerLevelJSON(self, req_str, main_player_id, round_num):
        """Get player level details from the JSON request string"""
        round_detail_json = self.web_driver.wireRequestToJSON(req_str)
        if round_detail_json:
            self._scrapePlayerDetail(main_player_id, round_num, round_detail_json)
            return True
        else:
            return False

    def _getCourseDetailJSON(self):
        """Get course details from the JSON request string"""
        for c_id, req_str in self._course_requests.items():
            course_detail_json = self.web_driver.wireRequestToJSON(req_str)
            if course_detail_json:
                self._scrapeCourseDetail(c_id, course_detail_json)

    def _getTournamentID(self):
        """Get tournament ID from Xpath"""
        tournament_xpath = self.web_driver.webDriverWait(
            self.web_driver.getDriver(),
            EC.presence_of_element_located(
                (By.XPATH, "//meta[@name='branch:deeplink:tournament_id']")),
            'Error getting tournament_id\n{}')
        if tournament_xpath is None:
            self._logger.error('Could not get a tournament ID out of {}\n'.format(tournament_xpath))
            return False
        self.tournament_id = re.findall(r'\d+', tournament_xpath.get_attribute('content'))[0]
        if not self.tournament_id:
            self._logger.error('Could not get a tournament ID out of string {}\n'.format(self.tournament_id))
            return False
        self._logger.info('Tournament ID is {}'.format(self.tournament_id))
        return True

    def _scrapeThroughPlayerRow(self, row):
        """Each player row will need to be clicked and then each round will need to show play by
        play data"""
        player_reqs = []
        # get player's shot information chart open on url
        _ = row.location_once_scrolled_into_view
        main_player_id = re.findall(r'\d+', row.get_attribute('class'))[0]
        player_name_col_button = self.web_driver.webDriverWait(
            row, EC.element_to_be_clickable((By.CLASS_NAME, 'player-name-col')),
            'Error getting player column to click\n{}')
        if player_name_col_button is None:
            return player_reqs
        _ = player_name_col_button.location_once_scrolled_into_view
        player_name_col_button.click()
        # get the player drawer that opens
        player_drawer = self.web_driver.webDriverWait(
            row.parent, EC.visibility_of_element_located(
                (By.ID, 'playerDrawer{}'.format(main_player_id))),
            'Error getting player drawer\n{}')
        if player_drawer is None:
            return player_reqs
        # get round by round data by clicking player round buttons
        round_selector = self.web_driver.webDriverWait(
            player_drawer, EC.visibility_of_element_located(
                (By.CLASS_NAME, 'round-selector')),
            'Error getting round selector\n{}')
        if round_selector is None:
            return player_reqs
        last_round = round_selector.find_element_by_class_name('round.active').text
        self.web_driver.webDriverWait(round_selector, EC.element_to_be_clickable(
            (By.CLASS_NAME, 'round')), 'Error getting round button to click\n{}')
        rounds = round_selector.find_elements_by_class_name('round')
        # go round by round to scrape data
        for round_button in rounds:
            round_num = round_button.text
            if main_player_id in self._player_round_dict and round_num in self._player_round_dict[main_player_id]:
                self._logger.info(
                    'Previously scraped data for round {} from player ID {}'.format(round_num, main_player_id))
                continue
            self._logger.info('Getting JSON wire for round {} from player ID {}'.format(round_num, main_player_id))
            player_reqs.append(
                {'PlayerID': main_player_id, 'RoundNum': round_num,
                 'Wire': self.template_wire_html_dict['round_detail']
                     .replace('PGA_YEAR', self._pga_year)
                     .replace('TOURNAMENT_ID', self.tournament_id)
                     .replace('ROUND_NUM', round_num)
                     .replace('MAIN_PLAYER_ID', main_player_id)})
            if round_num != last_round:
                self.web_driver.getDriver().implicitly_wait(.1)
                round_button.click()
        # this closes the player's shot information chart
        # player_name_col_button.click()
        return player_reqs

    def _checkScrapeResults(self):
        """After getting all JSON and converting to dictionaries, check to see how we did"""
        if len(self._player_round_dict) == len(self._player_meta_dict):
            self.successfully_scraped = 100
            self._logger.info('Successfully scraped data for all players in tournament {} {}'
                              .format(self._pga_year, self._pga_tournament))
        elif len(self._player_round_dict) == 0:
            self._logger.info(
                'Unsuccessfully scraped data for tournament {} {}'.format(self._pga_year, self._pga_tournament))
        elif len(self._player_round_dict) < len(self._player_meta_dict):
            self.successfully_scraped = (len(self._player_round_dict) / len(self._player_meta_dict)) * 100
            self._logger.info('Only scraped data for {:.2f}% of players in tournament {} {}'.
                              format(self.successfully_scraped, self._pga_year, self._pga_tournament))
            self._logger.info(
                'Player rows unsuccessfully scraped are:\n{}'.format(self._unsuccessful_player_round_scrape.keys()))

    def runScrape(self):
        """Main function for running the scrape, get all necessary info from the page, iterate
        through players shot charts, try to scrape as much as possible from the JSON requests."""
        self._logger.info(
            '\nRunning Scrape for {} {}\nURL is {}\n'.format(self._pga_year, self._pga_tournament,
                                                             self._tournament_url))
        self.web_driver.goToURL(self._tournament_url)
        if not self._getTournamentID():
            return False
        row_lines = self.web_driver.webDriverWait(
            self.web_driver.getDriver(),
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, 'tr.line-row.line-row')),
            'Error locating player elements on page\n{}')
        if row_lines is None:
            return False
        # request string for tournament detail
        tournament_req_str = self.template_wire_html_dict['tournament_detail'].replace(
            'PGA_YEAR', self._pga_year).replace('TOURNAMENT_ID', self.tournament_id)
        # scrape JSON of tournament detail
        if not self._getTournamentJSON(tournament_req_str):
            self._logger.error('Failed getting tournament details.')
            return False
        # request string for course general info
        course_gen_req_str = self.template_wire_html_dict['course_general'].replace(
            'TOURNAMENT_ID', self.tournament_id)
        # scrape JSON of course general
        self._getCourseGeneralJSON(course_gen_req_str)
        successive_failures = 0
        # split up player JSON requests because some data overlaps in the play by play JSON
        for i in range(3):
            remove_rows = []
            # run first time through and keep track of unsuccessful scrapes
            for row_num, row in enumerate(row_lines[i::3]):
                row_num = i + (row_num * 3)
                # if row_num > 9:
                #     continue
                if row_num not in self._row_dict:
                    self._logger.info('Iterating over row {}'.format(row_num))
                    self._row_dict[row_num] = self._scrapeThroughPlayerRow(row)
            for row_num, player_requests in self._row_dict.items():
                for request in player_requests:
                    req_str = request['Wire']
                    main_player_id = request['PlayerID']
                    round_num = request['RoundNum']
                    if not self._getPlayerLevelJSON(req_str, main_player_id, round_num):
                        self._unsuccessful_player_round_scrape[' '.join([main_player_id, round_num])] = req_str
                        self._logger.warning(
                            'Unsuccessfully retrieved JSON for player ID {} -- round '
                            'number {}. Will retry this row later.\n'.format(main_player_id, round_num))
                        successive_failures += 1
                        break
                    else:
                        successive_failures = 0
                else:
                    remove_rows.append(row_num)
                # Something's wrong
                if successive_failures > 5:
                    # warning() replaces the deprecated Logger.warn() alias
                    self._logger.warning(
                        'Had 5 successive failures while getting player round JSON, exiting scrape')
                    return False
            # remove successful rows
            for row_num in remove_rows:
                del self._row_dict[row_num]
        # can get course detail data once all players have been added with the courses they played
        self._getCourseDetailJSON()
        # run through a second time with all the rows that were unsuccessful at first
        for row_num in self._row_dict.keys():
            self._logger.info('Iterating over row {}'.format(row_num))
            self._row_dict[row_num] = self._scrapeThroughPlayerRow(row_lines[row_num])
        for row_num, player_requests in self._row_dict.items():
            for request in player_requests:
                req_str = request['Wire']
                main_player_id = request['PlayerID']
                round_num = request['RoundNum']
                if not self._getPlayerLevelJSON(req_str, main_player_id, round_num):
                    self._logger.warning(
                        'Unsuccessfully retrieved JSON for player ID {} -- round '
                        'number {} Final attempt.\n'.format(main_player_id, round_num))
        self._checkScrapeResults()
        return True

    def __convertPlayerRoundToMongoDBCollection(self):
        """Flatten the nested player round dict into a list of MongoDB documents."""
        player_round_collection = []
        for player_id, round_num in self._player_round_dict.items():
            for round_key, round_values in round_num.items():
                player_round_level = {'playerID': player_id, 'roundNumber': round_key,
                                      'tournamentID': self.tournament_id, 'pgaYear': self._pga_year}
                player_round_level.update(round_values['metadata'])
                player_round_level['holes'] = []
                for hole_key, hole_values in round_values['play-by-play'].items():
                    hole_level = {'holeNumber': hole_key, 'shots': []}
                    for shot in hole_values:
                        hole_level['shots'].append(shot)
                    player_round_level['holes'].append(hole_level)
                player_round_collection.append(player_round_level)
        return player_round_collection

    def __convertPlayerMetaToMongoDBCollection(self):
        """Flatten the player metadata dict into a list of MongoDB documents."""
        player_meta_collection = []
        for player_id, meta_values in self._player_meta_dict.items():
            player_meta = {'playerID': player_id}
            player_meta.update(meta_values)
            player_meta_collection.append(player_meta)
        return player_meta_collection

    def __convertCourseMetaToMongoDBCollection(self):
        """Flatten the course metadata dict into a list of MongoDB documents."""
        course_meta_collection = []
        for course_id, course_details in self._course_meta_dict.items():
            course_meta = {'courseID': course_id, 'pgaYear': self._pga_year,
                           'tournamentID': self.tournament_id}
            course_meta.update(course_details)
            hole_level_list = []
            for hole_key, round_info in course_meta['holes'].items():
                hole_level = {'holeNumber': hole_key}
                hole_level.update(round_info)
                hole_level_list.append(hole_level)
            course_meta['holes'] = hole_level_list
            course_meta_collection.append(course_meta)
        return course_meta_collection

    def convertDictsToMongoDBCollection(self):
        """General method for converting all class dictionaries to MongoDB Collections"""
        mongoDB_collections = [self.__convertPlayerRoundToMongoDBCollection(),
                               self.__convertPlayerMetaToMongoDBCollection(),
                               self.__convertCourseMetaToMongoDBCollection(),
                               self._tournament_info_dict]
        return mongoDB_collections

    def uploadDictsToJSON(self):
        """Upload the dictionaries to json files for debugging purposes"""
        with open(self.dir + 'player_round.json', 'w') as f:
            json.dump(self._player_round_dict, f)
        with open(self.dir + 'player_meta.json', 'w') as f:
            json.dump(self._player_meta_dict, f)
        with open(self.dir + 'tournament_info.json', 'w') as f:
            json.dump(self._tournament_info_dict, f)
        with open(self.dir + 'course_meta.json', 'w') as f:
            json.dump(self._course_meta_dict, f)

    def downloadDictsFromJSON(self):
        """Download the JSON files to dictionaries for debugging purposes"""
        with open(self.dir + 'player_round.json', 'r') as f:
            self._player_round_dict = json.load(f)
        with open(self.dir + 'player_meta.json', 'r') as f:
            self._player_meta_dict = json.load(f)
        with open(self.dir + 'tournament_info.json', 'r') as f:
            self._tournament_info_dict = json.load(f)
        with open(self.dir + 'course_meta.json', 'r') as f:
            self._course_meta_dict = json.load(f)
import itertools import logging import pandas as pd from Logging.MyLogger import MyLogger from MongoDB.MongoDownload import MongoDownload from MongoDB.MongoInitialization import MongoInitialization from TournamentRun import TournamentRun # tournaments_path = 'tournaments/FailedTournamentList.csv' tournaments_path = 'tournaments/TournamentList.csv' if __name__ == '__main__': max_drivers = 2 main_logger = MyLogger('Main', 'Main/logs/main.log', logging.INFO).getLogger() mongo_obj = MongoInitialization('scraper') tournament_df = pd.read_csv(tournaments_path, delimiter=',') tournament_df.columns = tournament_df.columns.str.strip() tournament_df['Name'] = tournament_df['Name'].str.strip() mongo_download = MongoDownload(mongo_obj) tournaments_scraped = mongo_download.getTournamentsScraped() filter_tournaments = tournament_df[~tournament_df[['Name', 'Year']].apply(tuple, 1).isin(tournaments_scraped)] tournaments = filter_tournaments.apply(lambda row: TournamentRun(row[0], row[1], mongo_obj, main_logger), axis=1).tolist() iter_tournaments = iter(tournaments) with concurrent.futures.ThreadPoolExecutor(max_workers=max_drivers) as executor: # Only schedule max_drivers amount of futures to start futures = { executor.submit(tournament.runTournament, None, True): tournament
class MongoUploadDF:
    """Uploads tournament and raw strokes-gained DataFrame documents to MongoDB.

    Wraps the 'tournament_df' and 'raw_sg_df' collections of the given
    database, upserting one document per (tournament, course, year, round)
    or (tournament, year) key and tracking upload success per collection.
    """

    def __init__(self, tournament_db, tournament_name):
        """For uploading tournament DF to MongoDB"""
        self._tournament_db = tournament_db
        self._name = tournament_name
        self._logger = MyLogger(
            'MongoDB Tournament DF {}'.format(self._name),
            'tournaments/DFs/{}/logs/tournament_mongodb.log'.format(
                self._name), logging.INFO).getLogger()
        # Success flags reported by _getUploadStatus()
        self._tournament_df_upload = False
        self._raw_sg_df_upload = False

    def __repr__(self):
        return 'MongoDB DF Upload Status: {}'.format(self._getUploadStatus())

    def _upsert(self, collection, query, upload_dict, label):
        """Upsert upload_dict into collection under query's key and log the outcome.

        Uses update_one: Collection.update() was deprecated in PyMongo 3.0 and
        removed in 4.0. Returns True once the write call has completed.
        """
        result = collection.update_one(query, {'$set': upload_dict}, upsert=True)
        if result is not None:
            if result.upserted_id is not None:
                self._logger.info(
                    'Inserted {} into collection with id {}\n'.format(
                        label, result.upserted_id))
            else:
                self._logger.info(
                    'Updated existing {} with key {}\n'.format(label, query))
        return True

    def uploadTournamentDF(self, upload_dict):
        """Upsert one tournament DF document keyed by
        tournamentName/courseID/pgaYear/roundNum."""
        try:
            tournament_name = upload_dict['tournamentName']
            pga_year = upload_dict['pgaYear']
            course_id = upload_dict['courseID']
            round_num = upload_dict['roundNum']
            self._logger.info(
                'Attempting to upload {} {}, course {}, round #{}'.format(
                    pga_year, tournament_name, course_id, round_num))
            query = {
                'tournamentName': tournament_name,
                'courseID': course_id,
                'pgaYear': pga_year,
                'roundNum': round_num
            }
            self._upsert(self._tournament_db.tournament_df, query,
                         upload_dict, 'Tournament DF')
        except Exception as e:
            self._logger.error('Problem uploading DF {}'.format(e),
                               exc_info=True)
        else:
            # Only mark success when no exception escaped the upsert.
            self._tournament_df_upload = True

    def uploadRawSG_DF(self, upload_dict):
        """Upsert one raw strokes-gained DF document keyed by
        tournamentName/pgaYear."""
        try:
            tournament_name = upload_dict['tournamentName']
            pga_year = upload_dict['pgaYear']
            self._logger.info('Attempting to upload {} {}'.format(
                pga_year, tournament_name))
            query = {'tournamentName': tournament_name, 'pgaYear': pga_year}
            self._upsert(self._tournament_db.raw_sg_df, query,
                         upload_dict, 'Raw SG DF')
        except Exception as e:
            self._logger.error('Problem uploading DF {}'.format(e),
                               exc_info=True)
        else:
            self._raw_sg_df_upload = True

    def _getUploadStatus(self):
        """Human-readable status line for both DataFrame uploads."""
        return 'Tournament DF Upload: {}\nRaw SG DF Upload: {}\n'.format(
            self._tournament_df_upload, self._raw_sg_df_upload)
class MongoUploadTournament:
    """Uploads tournament scrape collection objects to MongoDB.

    Handles the tournament_detail, player_metadata, player_round,
    course_metadata and tournament_scrape_status collections, tracking
    per-collection success flags/counters for _getUploadStatus().
    """

    def __init__(self, tournament_db, tournament_year, tournament_name):
        """For uploading tournament scrape collection objects to MongoDB"""
        self._tournament_db = tournament_db
        self._year = tournament_year
        self._name = tournament_name
        self._logger = MyLogger(
            'MongoDB {} {}'.format(self._year, self._name),
            'tournaments/{}_{}/logs/tournament_mongodb.log'.format(
                self._year, self._name), logging.INFO).getLogger()
        # Status flags / counters reported by _getUploadStatus()
        self._tournament_detail_upload = False
        self._player_metadata_upload = 0
        self._player_metadata_overall = 0
        self._player_round_upload = 0
        self._player_round_overall = 0
        self._course_metadata_upload = 0
        self._course_metadata_overall = 0
        self._tournament_scrape_status_upload = False
        self._sg_stats_upload = False

    def __repr__(self):
        return 'MongoDB Tournament Upload Status: {}'.format(
            self._getUploadStatus())

    def _upsertDocument(self, collection, key, document, label):
        """Replace-or-insert document in collection under key; log the outcome.

        Shared by the upload methods below so insert/update logging stays
        consistent. Returns True when the write was acknowledged.
        """
        result = collection.replace_one(key, document, upsert=True)
        if result is None:
            return False
        if result.upserted_id is not None:
            self._logger.info(
                'Inserted {} into collection with id {}\n'.format(
                    label, result.upserted_id))
        else:
            self._logger.info(
                'Updated existing {} with key {}\n'.format(label, key))
        return True

    def uploadTournamentDetails(self, tournament_details):
        """Upsert the tournament detail document keyed by tournamentID/pgaYear."""
        key = {
            'tournamentID': tournament_details['tournamentID'],
            'pgaYear': tournament_details['pgaYear']
        }
        if self._upsertDocument(self._tournament_db.tournament_detail, key,
                                tournament_details, 'tournament details'):
            self._tournament_detail_upload = True

    def uploadPlayerMetadata(self, player_metadata):
        """Insert player metadata documents that are not already present
        (players are immutable here, so existing documents are left as-is)."""
        for player in player_metadata:
            self._player_metadata_overall += 1
            if self._tournament_db.player_metadata.find_one(
                    {"playerID": player['playerID']}) is None:
                result = self._tournament_db.player_metadata.insert_one(player)
                if result is not None:
                    self._logger.info(
                        'Inserted player metadata into collection with id {}\n'
                        .format(result.inserted_id))
                    self._player_metadata_upload += 1

    def uploadPlayerRounds(self, player_rounds):
        """Upsert each player round keyed by player/tournament/year/round."""
        for player in player_rounds:
            self._player_round_overall += 1
            key = {
                'playerID': player['playerID'],
                'tournamentID': player['tournamentID'],
                'pgaYear': player['pgaYear'],
                'roundNumber': player['roundNumber']
            }
            if self._upsertDocument(self._tournament_db.player_round, key,
                                    player, 'player rounds'):
                self._player_round_upload += 1

    def uploadCourseMetadata(self, course_metadata):
        """Upsert each course document keyed by course/tournament/year."""
        for course in course_metadata:
            self._course_metadata_overall += 1
            key = {
                'courseID': course['courseID'],
                'tournamentID': course['tournamentID'],
                'pgaYear': course['pgaYear']
            }
            if self._upsertDocument(self._tournament_db.course_metadata, key,
                                    course, 'course metadata'):
                self._course_metadata_upload += 1

    def uploadTournamentScrapeStatus(self, scrape_status):
        """Upsert the scrape status document keyed by tournamentID/pgaYear."""
        key = {
            'tournamentID': scrape_status['tournamentID'],
            'pgaYear': scrape_status['pgaYear']
        }
        if self._upsertDocument(self._tournament_db.tournament_scrape_status,
                                key, scrape_status,
                                'tournament scrape status'):
            self._tournament_scrape_status_upload = True

    def _getUploadStatus(self):
        """Multi-line summary of what was uploaded for this tournament."""
        return '{} {}\n'.format(self._year, self._name) + \
               'Tournament Details Uploaded: {}\n'.format(self._tournament_detail_upload) + \
               'Player Metadata Uploaded: {} new players of {} total players\n'.format(
                   self._player_metadata_upload, self._player_metadata_overall) + \
               'Player Rounds Uploaded: {} of {} possible\n'.format(
                   self._player_round_upload, self._player_round_overall) + \
               'Course Metadata Uploaded: {} of {} possible\n'.format(
                   self._course_metadata_upload, self._course_metadata_overall) + \
               'Tournament Scrape Status Uploaded: {}\n'.format(self._tournament_scrape_status_upload)
class dfHandler:
    """Builds the hole/shot-level tournament DataFrame and the raw
    strokes-gained DataFrame from MongoDB scrape data, and uploads both."""

    # Distance thresholds. Shot distances in the data are stored in inches
    # (yards * 36, feet * 12) — conversions appear where these are used.
    max_hole_dist = 700   # yds: upper bound for tee-shot distance bins
    max_green_dist = 50   # yds: upper bound for approach/green bins
    arg_green_dist = 30   # yds: around-the-green cutoff separating APP vs ARG
    long_putt_dist = 12   # ft: long vs short putt cutoff
    adv_pct = .5          # fraction of start distance marking an 'advanced' shot
    pd.set_option('display.max_columns', None)

    @staticmethod
    def getNameAbbr(row):
        # e.g. firstName='Tiger', lastName='Woods' -> 'T. Woods'
        return '. '.join([row.firstName[0], row.lastName])

    @staticmethod
    def getShotType(row):
        """Classify one shot row into Penalty/TEE/LNG_PUTT/SHT_PUTT/APP/ARG
        based on its origin surface code and start distance (inches)."""
        if row.shotDistance == 0:
            val = 'Penalty'
        elif row.fromSurface == 'OTB' and row.par in [4, 5]:
            val = 'TEE'
        elif row.fromSurface in ['OGR', 'OCO']:
            # long_putt_dist is in feet; startDistance is inches (12 in/ft)
            if row.startDistance > dfHandler.long_putt_dist * 12:
                if row.fromSurface == 'OCO':
                    val = 'ARG'
                else:
                    val = 'LNG_PUTT'
            else:
                val = 'SHT_PUTT'
        elif row.fromSurface in ['OFW', 'ORO', 'OST', 'OIR', 'ONA', 'OTH', 'OTB', 'OWL', 'OBR', 'OWA'] \
                and row.startDistance > (36 * dfHandler.arg_green_dist):
            val = 'APP'
        elif row.fromSurface in ['OFW', 'ORO', 'OST', 'OIR', 'ONA', 'OTH',
                                 'OGS', 'OWL', 'OBR', 'OWA']:
            val = 'ARG'
        else:
            print('Unidentified from val {}'.format(row.fromSurface))
            val = 'Unknown'
        return val

    @staticmethod
    def getEndLocation(row):
        """Map a shot's end code to a (direction, surface) pair, e.g.
        ('Left', 'Rough'). Direction is '' when not left/right/penalty."""
        if row.to in ['ELI', 'ELF', 'ELR', 'EG5', 'EG6', 'EG7']:
            direction = 'Left'
        elif row.to in ['ERI', 'ERF', 'ERR', 'EG2', 'EG1', 'EG3']:
            direction = 'Right'
        elif row.toSurface == 'Penalty':
            direction = 'Penalty'
        else:
            direction = ''
        if row.to == 'OGR':
            val = 'Green'
        elif row.to == 'hole':
            val = 'Hole'
        elif row.to in ['ELF', 'ERF', 'ERI', 'ELI', 'OFW', 'OIR', 'OCO']:
            val = 'Fairway'
        elif row.to in ['ERR', 'ELR', 'ORO', 'OCA', 'OWL', 'OBR']:
            val = 'Rough'
        elif row.to in ['OST', 'EG2', 'EG5', 'EG6', 'EG1', 'EG4', 'EG3',
                        'EG7', 'OGS', 'EG8']:
            val = 'Bunker'
        elif row.to in ['ONA', 'OTH', 'OUK', 'OTB']:
            val = 'Trouble'
        elif row.to == 'OWA':
            val = 'Water'
        else:
            print('Unidentified to val {}'.format(row.to))
            val = 'Unknown'
        return direction, val

    @staticmethod
    def getDateTimes(dates_str):
        """Parse a 'Thursday Feb 1 - Sunday Feb 4, 2018'-style string into
        (first_day, last_day) datetimes."""
        dates, year = dates_str.strip().split(',')
        first_day, last_day = dates.strip().split('-')
        return datetime.strptime('{} {}'.format(first_day.strip(), year),
                                 '%A %b %d %Y'), datetime.strptime(
            '{} {}'.format(last_day.strip(), year), '%A %b %d %Y')

    @staticmethod
    def getQuantiles(df, grouping='shotType', cut_on='distanceLeft'):
        """Add a quantile-bin column of cut_on per grouping value, shrinking
        the quantile count (from 20) until the bin edges are unique."""
        shot_types = df.groupby(by=grouping)
        for name, group in shot_types:
            quantile = 20
            for i in range(20):
                # Reduce quantile count while there are too few rows or
                # duplicate quantile edges (qcut would fail otherwise).
                if (group[cut_on].count() <= quantile) or \
                        (len(np.unique(
                            np.quantile(group[cut_on],
                                        np.linspace(0, 1, quantile,
                                                    endpoint=False)))) < quantile):
                    quantile -= 1
                else:
                    break
            pct_labels = []
            for x in np.linspace(0, 100, quantile, endpoint=False):
                pct_labels.append('({:.2f}% to {:.2f}%]'.format(
                    x, x + 100 / quantile))
            pct_labels.reverse()
            df['distanceLeftQuantileBin{}'.format(name)] = pd.qcut(
                group[cut_on], q=quantile, precision=0, labels=pct_labels)
        return df

    @staticmethod
    def getBinValues(cut_on, end_bin, interval, yds_or_feet):
        """Cut the cut_on series (inches) into fixed-width labelled bins of
        'interval' yards or feet, up to end_bin yards."""
        if yds_or_feet == 'ft':
            multiplier = 3
        else:
            multiplier = 1
        labels = []
        for x in range(0, end_bin * multiplier, interval):
            labels.append('({} to {}] {}'.format(x, x + interval, yds_or_feet))
        # end_bin * 36 converts yards to inches for the bin edges.
        return pd.cut(x=cut_on,
                      bins=np.linspace(0, end_bin * 36,
                                       int((end_bin * multiplier) / interval) + 1),
                      precision=0, labels=labels, include_lowest=True,
                      right=True)

    @staticmethod
    def createHoleLevelDict(tournament_year_dict):
        """Re-nest the scraped data as
        {year: {course: {hole: {round: {...round info, playerShots}}}}},
        attaching a round date and each player's shot list per hole/round."""
        year_course_hole_round = {}
        for pga_year in tournament_year_dict.keys():
            dates_str = tournament_year_dict[pga_year]['dates']
            first_dt, last_dt = dfHandler.getDateTimes(dates_str)
            course_dict = {}
            for course in tournament_year_dict[pga_year]['courses']:
                hole_based_dict = {}
                course_id = course['courseID']
                for course_hole in course['holes']:
                    hole_based_dict[course_hole['holeNumber']] = {}
                    for i, round_info in enumerate(course_hole['rounds']):
                        hole_based_dict[course_hole['holeNumber']][round_info['round_Id']] \
                            = {k: round_info[k] for k in round_info if k != 'round_Id'}
                        # Round i is assumed played i days after the first day.
                        hole_based_dict[course_hole['holeNumber']][round_info['round_Id']].update(
                            {'roundDate': first_dt + timedelta(days=i),
                             'playerShots': {}})
                for player_round in tournament_year_dict[pga_year]['playerRounds']:
                    if course_id != player_round['courseId']:
                        continue
                    for player_hole in player_round['holes']:
                        hole_based_dict[player_hole['holeNumber']][player_round['roundNumber']][
                            'playerShots'][player_round['playerID']] = player_hole['shots']
                course_dict[course_id] = hole_based_dict
            year_course_hole_round[pga_year] = course_dict
        return year_course_hole_round

    def __init__(self, mongo_obj, tournament_name_scrape, tournament_name_sg,
                 force_create_sg=False, force_create_tournament=False):
        """Load (or rebuild, when forced or missing) the raw SG DataFrame and
        the hole-level tournament DataFrame for one tournament."""
        self._logger = MyLogger('dfHandler', 'Analysis/logs/dfHandler.log',
                                logging.INFO).getLogger()
        self._tournament_name = tournament_name_scrape
        self._mongo_obj = mongo_obj
        mongo_download = MongoDownload(self._mongo_obj)
        self._mongo_upload_df = MongoUploadDF(self._mongo_obj.getTournamentDB(),
                                              self._tournament_name)
        self._raw_sg_df = pd.DataFrame(mongo_download.getRawSG_DF(tournament_name_scrape))
        if self._raw_sg_df.empty or force_create_sg:
            self._logger.info('Creating New Raw SG DF')
            self._raw_sg_df = pd.DataFrame()
            self._createRawSG_DF(
                mongo_download.getSGStatsForTournament(tournament_name_scrape,
                                                       tournament_name_sg),
                mongo_download.getPlayerNames())
        self._tournament_df = pd.DataFrame(mongo_download.getTournamentDF(tournament_name_scrape))
        if self._tournament_df.empty or force_create_tournament:
            self._logger.info('Creating New Tournament DF')
            self._tournament_df = pd.DataFrame()
            self._createTournamentDF(
                mongo_download.consolidateTournamentInfo(tournament_name_scrape),
                mongo_download.getPlayerNames())

    def __repr__(self):
        success = True
        if self._tournament_df.empty:
            success = False
        return 'Tournament {} DF successfully created {}\n'.format(
            self._tournament_name, success)

    def _dfLogic(self, hole_df, year, course, hole_num, round_num):
        """Transform one hole/round's dict-of-rounds frame into shot-level
        rows with derived columns (shot type, end location, distances, bins)."""
        hole_df = hole_df.rename(columns={'distance': 'holeDistance'})
        # Convert hole distance from yards to inches to match shot distances.
        hole_df['holeDistance'] = hole_df.holeDistance.astype(int) * 36
        hole_df['par'] = hole_df.par.astype(int)
        hole_df['stimp'] = hole_df.stimp.astype(np.float16)
        hole_df['roundDate'] = pd.to_datetime(hole_df.roundDate)
        hole_df['pgaYear'] = year
        hole_df['courseID'] = course
        hole_df['holeNum'] = hole_num
        hole_df['roundNum'] = round_num
        # Drop players with no recorded shots, then expand one row per shot.
        hole_df = hole_df[hole_df.playerShots.map(lambda l: len(l)) > 0]
        hole_df = hole_df.explode('playerShots')
        temp_df = pd.json_normalize(hole_df.playerShots)
        hole_df = pd.concat([hole_df.reset_index().drop(columns='playerShots'),
                             temp_df], axis=1)
        del temp_df
        hole_df = hole_df.rename(columns={'distance': 'shotDistance',
                                          'from': 'fromSurface',
                                          'left': 'distanceLeft',
                                          'index': 'playerID'})
        # startDistance: hole length for tee shots, else previous shot's
        # remaining distance.
        hole_df['startDistance'] = np.nan
        hole_df.loc[hole_df.fromSurface == 'OTB', 'startDistance'] = hole_df.holeDistance
        hole_df.drop(columns='holeDistance', inplace=True)
        hole_df['startDistance'] = hole_df.startDistance.fillna(
            value=hole_df.distanceLeft.shift(1))
        # Drop consecutive duplicate shot_ids within a player's sequence.
        player_group = hole_df.groupby(by='playerID', group_keys=False)
        hole_df = hole_df[player_group.apply(lambda x: x.shot_id != x.shot_id.shift(1))]
        player_group = hole_df.groupby(by='playerID')
        hole_df['playerScore'] = player_group.shot_id.transform('max')
        hole_df['holeAvg'] = player_group.shot_id.max().mean()
        hole_df['shotsRemaining'] = player_group.cumcount(ascending=False)
        hole_df['shotType'] = hole_df.apply(dfHandler.getShotType, axis=1)
        # 'Advanced' approach: more than adv_pct of the start distance remains.
        hole_df['isAdvanced'] = (hole_df['shotType'] == 'APP') & \
                                (hole_df.distanceLeft > (self.adv_pct * hole_df.startDistance))
        hole_df['toSurface'] = hole_df.shotType.shift(-1)
        hole_df[['toLocation', 'toSurface']] = hole_df.apply(
            dfHandler.getEndLocation, axis=1, result_type='expand')
        hole_df.drop(hole_df[hole_df.shotType == 'Penalty'].index, inplace=True)
        # After a penalty, distance left is taken from the following shot.
        hole_df.loc[hole_df.toLocation == 'Penalty', 'distanceLeft'] = \
            hole_df.startDistance.shift(-1).fillna(0)
        hole_df['isReTee'] = hole_df.apply(
            lambda x: x['startDistance'] == x['distanceLeft'] and x['shotType'] == 'TEE',
            axis=1)
        hole_df = self._getDistanceBins(hole_df)
        # self._logger.info('\nHole DF description\n{}'.
        #                   format(hole_df.describe(percentiles=[.5]).T))
        return hole_df

    def _getDistanceBins(self, hole_df):
        """Attach fixed-width distance-bin columns per shot type."""
        hole_df.loc[hole_df['shotType'] == 'TEE',
                    'startDistance10ydBin'] = dfHandler.getBinValues(
            hole_df[hole_df['shotType'] == 'TEE'].startDistance,
            self.max_hole_dist, 10, 'yds')
        hole_df.loc[(hole_df['shotType'] == 'TEE') | (hole_df['shotType'] == 'APP'),
                    'distanceLeft5ydBin'] = \
            dfHandler.getBinValues(
                hole_df[(hole_df['shotType'] == 'TEE') | (hole_df['shotType'] == 'APP')].
                distanceLeft, self.max_hole_dist, 5, 'yds')
        hole_df.loc[hole_df['shotType'] == 'APP', 'distanceLeft1ydBin'] = \
            dfHandler.getBinValues(hole_df[hole_df['shotType'] == 'APP'].
                                   distanceLeft, self.max_green_dist, 1, 'yd')
        hole_df.loc[hole_df['shotType'] != 'TEE',
                    'distanceLeft1ftBin'] = dfHandler.getBinValues(
            hole_df[hole_df['shotType'] != 'TEE'].distanceLeft,
            self.max_green_dist, 1, 'ft')
        return hole_df

    def _createTournamentDF(self, tournament_year_dict, player_names):
        """Build self._tournament_df by running _dfLogic over every
        year/course/hole/round that has player shots, then merging names."""
        year_course_hole_round = dfHandler.createHoleLevelDict(tournament_year_dict)
        for year in year_course_hole_round.keys():
            for course in year_course_hole_round[year].keys():
                for hole_num in year_course_hole_round[year][course].keys():
                    for round_num in year_course_hole_round[year][course][hole_num].keys():
                        if not year_course_hole_round[year][course][hole_num][round_num]['playerShots']:
                            continue
                        self._logger.info(
                            'Creating hole level DF for tournament {}, year {}, course {}, hole {}, round {}\n'
                            .format(self._tournament_name, year, course,
                                    hole_num, round_num))
                        hole_df = pd.DataFrame.from_dict(
                            year_course_hole_round[year][course][hole_num][round_num])
                        # NOTE(review): DataFrame.append is deprecated in
                        # pandas >= 1.4 (removed in 2.0) — confirm the pinned
                        # pandas version, or migrate to pd.concat.
                        self._tournament_df = self._tournament_df.append(
                            self._dfLogic(hole_df, year, course, hole_num, round_num))
        player_name_df = pd.DataFrame(player_names)
        self._tournament_df = pd.merge(self._tournament_df, player_name_df,
                                       on='playerID', how='left')
        self._tournament_df.reset_index()

    def _createRawSG_DF(self, sg_dict, player_names):
        """Build self._raw_sg_df from scraped SG stats, converting the SG
        columns to numeric and mapping abbreviated names to playerIDs."""
        player_name_df = pd.DataFrame(player_names)
        # noinspection PyTypeChecker
        player_name_df['playerName'] = player_name_df.apply(dfHandler.getNameAbbr, axis=1)
        for year in sg_dict.keys():
            self._raw_sg_df = self._raw_sg_df.append(sg_dict[year]['sgStats'])
        numeric_cols = ['sgPUTT', 'sgARG', 'sgAPP', 'sgOTT', 'sgT2G', 'sgTOT']
        self._raw_sg_df[numeric_cols] = self._raw_sg_df[numeric_cols].apply(
            pd.to_numeric, axis=1)
        self._raw_sg_df = pd.merge(self._raw_sg_df, player_name_df,
                                   on='playerName', how='left')
        self._raw_sg_df.drop(columns=['playerName', 'tournamentName'], inplace=True)

    def getTournamentDF(self):
        return self._tournament_df

    def getRawSG_DF(self):
        return self._raw_sg_df

    def uploadTournamentDF(self):
        """Upload one document per (course, year, round) slice of the
        tournament DF; the 'shottext' column is dropped before upload."""
        for course, course_tournament_df in self._tournament_df.groupby('courseID'):
            for year, year_tournament_df in course_tournament_df.groupby('pgaYear'):
                for round_num, round_tournament_df in year_tournament_df.groupby('roundNum'):
                    df_dict = round_tournament_df.drop(columns='shottext').to_dict('records')
                    upload_dict = {'tournamentName': self._tournament_name,
                                   'courseID': course,
                                   'pgaYear': year,
                                   'roundNum': round_num,
                                   'df': df_dict}
                    self._mongo_upload_df.uploadTournamentDF(upload_dict)

    def uploadRawSG_DF(self):
        """Upload one raw SG document per pgaYear."""
        for year, year_tournament_df in self._raw_sg_df.groupby('pgaYear'):
            df_dict = year_tournament_df.to_dict('records')
            upload_dict = {'tournamentName': self._tournament_name,
                           'pgaYear': year,
                           'df': df_dict}
            self._mongo_upload_df.uploadRawSG_DF(upload_dict)
import logging import pandas as pd from Analysis.dfHandler import dfHandler from Analysis.sgHandler import sgHandler from Logging.MyLogger import MyLogger from MongoDB.MongoInitialization import MongoInitialization if __name__ == '__main__': analysis_logger = MyLogger('Analysis', 'Analysis/logs/hole_df.log', logging.INFO).getLogger() mongo_init = MongoInitialization('df') df_handler = dfHandler(mongo_init, 'waste-management-phoenix-open', 'Waste Management Phoenix Open', False, False) tournament_df = df_handler.getTournamentDF() sg_df = df_handler.getRawSG_DF() # df_handler.uploadTournamentDF() # df_handler.uploadRawSG_DF() sg_handler = sgHandler(mongo_init, tournament_df, sg_df) sg_handler.applySGLogicToGroups(True) # sg_handler.getSGTee(False) sg_df_dict = sg_handler.getSG_DF_Dict() # combine = pd.merge(sg_df_dict['Tee']['RawSGMatch'], sg_df, how='left', on=['playerID', 'pgaYear'])
class sgHandler:
    """Computes strokes-gained (SG) statistics from the hole-level tournament
    DataFrame and compares them against the raw scraped SG numbers."""

    # (label, groupby-columns) pairs consumed by applySGLogicToGroups.
    grouping_list = [('Course', ['courseID']), ('Year', ['pgaYear']),
                     ('Round', ['roundNum']), ('Hole', ['holeNum']),
                     ('YearRound', ['pgaYear', 'roundNum']),
                     ('YearHole', ['pgaYear', 'holeNum']),
                     ('YearRoundHole', ['pgaYear', 'roundNum', 'holeNum'])]
    pd.set_option('display.max_columns', None)

    @staticmethod
    def lowessExpectedShotsByDistance(distance_shots):
        """LOWESS-smoothed expected shots from a 'distance/shots' string series."""
        new_df = distance_shots.str.split('/', expand=True)
        endog = new_df[[1]].values.ravel()
        exog = new_df[[0]].values.ravel()
        return pd.Series(lowess(endog=endog, exog=exog, return_sorted=False))

    @staticmethod
    def lmExpectedShotsByDistance(distance_shots):
        """Linear-regression expected shots from a 'distance/shots' string series."""
        new_df = distance_shots.str.split('/', expand=True)
        lm = LinearRegression()
        lm.fit(new_df[[0]], new_df[[1]])
        return lm.predict(new_df[[0]]).flatten()

    @staticmethod
    def lmExpectedRemainingShotsGroup(df, group, name):
        """Per-group linear-model expected shots remaining column."""
        df['lmExpectedShotsRemaining{}'.format(name)] = df.groupby(group)['distance/shots']. \
            transform(sgHandler.lmExpectedShotsByDistance)
        return df

    @staticmethod
    def lowessExpectedRemainingShotsColumn(df, group, name):
        """Per-group LOWESS expected shots remaining column."""
        df['lowessExpectedShotsRemaining{}'.format(name)] = df.groupby(group)['distance/shots']. \
            transform(sgHandler.lowessExpectedShotsByDistance)
        return df

    @staticmethod
    def fiveYdBinExpectedRemainingShotsColumn(df, group, name):
        """Mean shots remaining within each 5-yd distance bin of the group."""
        bin_group = group + ['distanceLeft5ydBin']
        df['5ydBinAvgExpectedShotsRemaining{}'.format(name)] = df.groupby(
            bin_group)['shotsRemaining'].transform('mean')
        return df

    @staticmethod
    def oneFtBinExpectedRemainingShotsColumn(df, group, name):
        """Mean shots remaining within each 1-ft distance bin of the group."""
        bin_group = group + ['distanceLeft1ftBin']
        df['1ftBinAvgExpectedShotsRemaining{}'.format(name)] = df.groupby(
            bin_group)['shotsRemaining'].transform('mean')
        return df

    @staticmethod
    def getGroupAveragesAndSGOverAvg(df, name, group, column_to_avg, sg_type,
                                     column_to_subtract):
        """Add the group mean of column_to_avg plus
        SG = group mean - column_to_subtract."""
        df['{}{}'.format(name, column_to_avg)] = \
            df.groupby(group).transform('mean')[column_to_avg]
        df['SG{}Over_{}'.format(sg_type, column_to_avg)] = df['{}{}'.format(name, column_to_avg)] - \
                                                           df[column_to_subtract]
        return df

    @staticmethod
    def getGroupSTDofSGOverAvg(df, name, group_by_cols, column_to_avg, sg_type):
        """Add the group std of the SG column and |SG| expressed in std units."""
        df['{}STDofSG{}'.format(name, sg_type)] = \
            df.groupby(group_by_cols)['SG{}Over_{}'.format(sg_type, column_to_avg)].transform('std')
        df['NumSTDFromSG{}Over_{}'.format(sg_type, column_to_avg)] = \
            abs(df['SG{}Over_{}'.format(sg_type, column_to_avg)] / df['{}STDofSG{}'.format(name, sg_type)])
        return df

    # @staticmethod
    # def createSGTeeColumns(df, name):
    #     df['SGTeeOverLM{}'.format(name)] = df['lmExpectedShotsRemaining{}'.format(name)] - df['shotsRemaining']
    #     df['SGTeeOverLowess{}'.format(name)] = df['lowessExpectedShotsRemaining{}'.format(name)] - df[
    #         'shotsRemaining']
    #     df['SGTeeOverBinAvg{}'.format(name)] = df['5ydBinAvgExpectedShotsRemaining{}'.format(name)] - \
    #                                            df['shotsRemaining']
    #     return df

    # @staticmethod
    # def visualizeDistanceLeft(df, title):
    #     _ = sns.lmplot(data=df, x='distanceLeft', y='shotsRemaining', hue='toSurface')
    #     plt.title(title + ' LM')
    #     plt.show()
    #     _ = sns.lmplot(data=df, x='distanceLeft', y='shotsRemaining', hue='toSurface', lowess=True)
    #     plt.title(title + ' Lowess')
    #     plt.show()
    #     distance_grouped = df.groupby(['distanceLeft5ydBin', 'toSurface']).mean().reset_index()
    #     _ = sns.scatterplot(data=distance_grouped, x='distanceLeft', y='shotsRemaining', hue='toSurface')
    #     plt.title(title + ' 5ydBin')
    #     plt.show()

    # @staticmethod
    # def visualizeStartDistance(df, group, title):
    #     _ = sns.lmplot(data=df, x='startDistance', y='shotsTaken', hue=group)
    #     plt.title(title + ' LM')
    #     plt.show()
    #     _ = sns.lmplot(data=df, x='startDistance', y='shotsTaken', hue=group, lowess=True)
    #     plt.title(title + ' Lowess')
    #     plt.show()

    @staticmethod
    def getStartingExpectedShots(tee_shots_df, visualize):
        """Fit expected shots taken as a function of starting distance and
        store it in 'lmExpectedShotsStarting'."""
        tee_shots_df['shotsTaken'] = tee_shots_df['shotsRemaining'] + 1
        tee_shots_df['distance/shots'] = tee_shots_df.apply(
            lambda x: str(x['startDistance']) + '/' + str(x['shotsTaken']), axis=1)
        tee_shots_df['lmExpectedShotsStarting'] = tee_shots_df['distance/shots']. \
            transform(sgHandler.lmExpectedShotsByDistance)
        if visualize:
            _ = sns.lmplot(data=tee_shots_df, x='startDistance',
                           y='shotsTaken', lowess=True)
            plt.title('Expected Shots From Start Distance Lowess Model')
            plt.show()
        return tee_shots_df

    @staticmethod
    def getRemainingExpectedShots(tee_shots_df, visualize=False):
        """Fit expected shots remaining by distance left, grouped by end
        surface; re-teed shots are excluded so retries don't skew the fit."""
        tee_shots_df['distance/shots'] = tee_shots_df.apply(
            lambda x: str(x['distanceLeft']) + '/' + str(x['shotsRemaining']), axis=1)
        no_retee_df = tee_shots_df[~tee_shots_df['isReTee']].copy()
        tee_shots_df['lmExpectedShotsRemainingBySurface'] = no_retee_df.groupby('toSurface')['distance/shots']. \
            transform(sgHandler.lowessExpectedShotsByDistance) if False else no_retee_df.groupby('toSurface')['distance/shots']. \
            transform(sgHandler.lmExpectedShotsByDistance)
        if visualize:
            _ = sns.lmplot(data=tee_shots_df, x='distanceLeft',
                           y='shotsRemaining', hue='toSurface', lowess=True)
            plt.title(
                'Expected Shots For Distance Left Grouped By Surface Lowess Model'
            )
            plt.show()
        return tee_shots_df

    # @staticmethod
    # def getSGMeasure(df, sg_measure, starting_col, shots_remain_col, add_stroke):
    #     df['SG{}Over{}'.format(sg_measure, starting_col)] = df[starting_col] - df[shots_remain_col] - add_stroke
    #     return df

    # @staticmethod
    # def getSGReTee(df, sg_measure, starting_col):
    #     df['SG{}Over{}'.format(sg_measure, starting_col)] = -1
    #     return df

    def __init__(self, mongo_obj, tournament_df, raw_sg_df, distances_df=None):
        """Hold the tournament and raw SG frames; results accumulate in
        self._sg_df_dict keyed by grouping label."""
        self._sg_df_dict = {}
        self._logger = MyLogger('sgHandler', 'Analysis/logs/sgHandler.log',
                                logging.INFO).getLogger()
        self._tournament_df = tournament_df
        self._raw_sg_df = raw_sg_df
        self._mongo_obj = mongo_obj
        self._distances_df = distances_df

    def __repr__(self):
        return 'SG DF Dictionary has keys {}\n'.format(self._sg_df_dict.keys())

    def applySGLogicToGroups(self, visualize=False):
        """Compute SG totals for every grouping in grouping_list."""
        for name, group in sgHandler.grouping_list:
            self._logger.info('Creating SG Stats for group {}'.format(name))
            self._sg_df_dict[name] = {}
            # NOTE(review): the two assignments below are identical — the
            # 'Hole' branch appears redundant (or its body was meant to
            # differ). Confirm the intended behavior for hole groupings.
            if 'Hole' in name:
                self._sg_df_dict[name]['Total'] = self.getSGOverall(
                    name, group, visualize)
            self._sg_df_dict[name]['Total'] = self.getSGOverall(
                name, group, visualize)

    def getSGOverall(self, name, group_by_cols, visualize):
        """SG-total per hole: group average score minus the player's score,
        computed on the first shot row of each hole."""
        self._logger.info('Getting SG Overall {} Stats'.format(name))
        relevant_cols = [
            'pgaYear', 'courseID', 'holeNum', 'roundNum', 'holeAvg',
            'playerScore'
        ]
        sg_tot_df = self._tournament_df.loc[self._tournament_df['shot_id'] == 1,
                                            relevant_cols].copy()
        # sg_tot_df['SGTotOverHoleAvg'] = sg_tot_df['holeAvg'] - sg_tot_df['playerScore']
        self._logger.info(
            'Getting SG Tot For Grouping by {}'.format(group_by_cols))
        sg_tot_df = sgHandler.getGroupAveragesAndSGOverAvg(
            sg_tot_df, name, group_by_cols, 'holeAvg', 'Tot', 'playerScore')
        sg_tot_df = sgHandler.getGroupSTDofSGOverAvg(sg_tot_df, name,
                                                     group_by_cols, 'holeAvg',
                                                     'Tot')
        if visualize:
            # NOTE(review): .format(name) on a literal with no placeholders is
            # a no-op — presumably the column names were meant to embed name.
            _ = sns.histplot(data=sg_tot_df,
                             x='SGTotOver_holeAvg'.format(name),
                             kde=True, hue='holeNum', binwidth=.25,
                             kde_kws={'bw_adjust': 4})
            plt.show()
            _ = sns.histplot(data=sg_tot_df,
                             x='NumSTDFromSGTotOver_holeAvg'.format(name),
                             kde=True, hue='holeNum', binwidth=.25,
                             kde_kws={'bw_adjust': 4})
            plt.show()
        return sg_tot_df

    def sumSGTotalDFs(self, sg_tot_df):
        """Aggregate SG totals per player/year/round and per player/year.

        NOTE(review): writes into self._sg_df_dict['Total'], but
        applySGLogicToGroups only creates keys from grouping_list labels —
        'Total' may not exist; confirm the intended key.
        """
        self._sg_df_dict['Total']['SumByRound'] = sg_tot_df.groupby(['playerID', 'pgaYear', 'roundNum']).sum(). \
            reset_index()
        sg_cols = [col for col in sg_tot_df if 'SG' in col]
        self._sg_df_dict['Total']['RawSGMatch'] = sg_tot_df.groupby(['playerID', 'pgaYear']). \
            apply(lambda x: x[sg_cols].sum() / x['roundNum'].nunique()).reset_index()

    def getSGTee(self, visualize=False):
        """Compute SG off the tee per shot and store round-based and
        aggregated frames under self._sg_df_dict['Tee']."""
        relevant_cols = [
            'playerID', 'firstName', 'lastName', 'pgaYear', 'courseID',
            'holeNum', 'roundNum', 'par', 'startDistance',
            'startDistance10ydBin', 'distanceLeft', 'distanceLeft5ydBin',
            'distanceLeft1ydBin', 'distanceLeft1ftBin', 'toSurface',
            'shotsRemaining', 'isReTee'
        ]
        tee_shots_df = self._tournament_df[(
            self._tournament_df['shotType'] == 'TEE')][relevant_cols].copy()
        tee_shots_df = sgHandler.getStartingExpectedShots(
            tee_shots_df, visualize)
        tee_shots_df = sgHandler.getRemainingExpectedShots(
            tee_shots_df, visualize)
        tee_shots_df.drop(columns='distance/shots', inplace=True)
        # NOTE(review): 'lmExpectedShotsStartingGrouped' is never created in
        # this view — getStartingExpectedShots writes 'lmExpectedShotsStarting'.
        # This line likely raises KeyError; confirm the intended column.
        tee_shots_df['SGTeeByLowess'] = tee_shots_df['lmExpectedShotsStartingGrouped'] - \
                                        tee_shots_df['lmExpectedShotsRemainingBySurface'] - 1
        # Re-tees have no fitted remaining value; treated as SG of -2.
        tee_shots_df['SGTeeByLowess'].fillna(-2, inplace=True)
        if visualize:
            _ = sns.histplot(data=tee_shots_df, x='SGTeeByLowess', kde=True,
                             hue='holeNum', binwidth=.25,
                             kde_kws={'bw_adjust': 4})
            plt.show()
        self._sg_df_dict['Tee'] = {}
        tee_shots_df['NumSTDFromSGTeeByLowess'] = abs(
            tee_shots_df['SGTeeByLowess'] / tee_shots_df.groupby(
                ['pgaYear', 'roundNum'])['SGTeeByLowess'].transform('std'))
        tee_shots_df['AvgSGTeeByLowess'] = tee_shots_df.groupby(['pgaYear', 'roundNum'])['SGTeeByLowess']. \
            transform('mean')
        self._sg_df_dict['Tee']['RoundBased'] = tee_shots_df
        self._sg_df_dict['Tee']['SumByRound'] = tee_shots_df.groupby(['playerID', 'pgaYear', 'roundNum']).sum(). \
            reset_index()
        sg_cols = [col for col in tee_shots_df if 'SG' in col]
        self._sg_df_dict['Tee']['RawSGMatch'] = tee_shots_df.groupby(['playerID', 'pgaYear']). \
            apply(lambda x: x[sg_cols].sum() / x['roundNum'].nunique()).reset_index()

    def getSG_DF_Dict(self):
        return self._sg_df_dict
class SGScraper:
    """Given a tournament and year, this scrapes pgatour.com tournament result
    page to create json files containing data on tournament info and player
    course_hole by course_hole shots"""

    def __init__(self):
        """Initialize SG Scraper"""
        self._sg_url = 'https://datagolf.com/historic-event-data'
        # create place holder dictionaries for data once scraped
        self._tournament_sg_col = []
        # all I/O done in tournaments/'pga_year'_'tournament_name' directory
        # NOTE(review): 'sg_scape.log' looks like a typo for 'sg_scrape.log' —
        # runtime path, left untouched; confirm before renaming.
        self._file_handler = 'tournaments/SG/logs/sg_scape.log'
        # initialize logger
        self._logger = MyLogger(self.__class__.__name__, self._file_handler,
                                logging.INFO, 'w').getLogger()
        # initialize driver
        self.web_driver = WebDriver(self._logger)
        self.year_options = None

    def __repr__(self):
        """Print Scraper Class with scraped status"""
        return self.__class__.__name__

    def _sgStatsToDict(self, year_name, tournament_name, sg_stats):
        """Append one player's SG row (name + six SG figures, in the order
        PUTT/ARG/APP/OTT/T2G/TOT) to the collection buffer."""
        self._logger.info('Getting SG stats for {} during {} {}'.format(
            sg_stats[0], year_name, tournament_name))
        self._tournament_sg_col.append({
            'pgaYear': year_name,
            'tournamentName': tournament_name,
            'playerName': sg_stats[0],
            'sgPUTT': sg_stats[1],
            'sgARG': sg_stats[2],
            'sgAPP': sg_stats[3],
            'sgOTT': sg_stats[4],
            'sgT2G': sg_stats[5],
            'sgTOT': sg_stats[6]
        })

    def runScrape(self, years_to_scrape):
        """Iterate every tournament in the page dropdown and, for each year in
        years_to_scrape, collect per-player SG rows into the buffer.
        Returns True on success, False on any scraping failure.

        NOTE(review): find_element_by_id / find_elements_by_class_name were
        removed in Selenium 4 — confirm the pinned selenium version, or
        migrate to driver.find_element(By.ID, ...).
        """
        self._logger.info('Go to SG Scrape url {}\n'.format(self._sg_url))
        self.web_driver.goToURL(self._sg_url)
        driver = self.web_driver.getDriver()
        try:
            tournament_selector = Select(driver.find_element_by_id('dropdown'))
            num_options = len(tournament_selector.options)
            for idx in range(num_options):
                tournament_selector.select_by_index(idx)
                tournament_name = tournament_selector.first_selected_option.text
                # Wait until the page subtitle reflects the chosen tournament.
                _ = self.web_driver.webDriverWait(
                    driver,
                    wait_for_text_to_match((By.CLASS_NAME, 'subtitle'),
                                           r'\d+ {}'.format(tournament_name)),
                    'Error waiting for tournament to load\n{}')
                self.year_options = driver.find_elements_by_class_name(
                    'yearoptions')
                for year in reversed(self.year_options):
                    year_name = year.text
                    if year_name not in years_to_scrape:
                        continue
                    self._logger.info('\nRunning SG Scrape for {} {}'.format(
                        year_name, tournament_name))
                    year.click()
                    sg_table = driver.find_element_by_class_name('table')
                    data_rows = sg_table.find_elements_by_class_name('datarow')
                    for row in data_rows:
                        sg_stats = row.text.split('\n')
                        # '--' in the first SG column means no SG data for
                        # this event/year; skip the remaining rows.
                        if sg_stats[3] == '--':
                            self._logger.info('No SG stats for {} {}'.format(
                                year_name, tournament_name))
                            break
                        self._sgStatsToDict(
                            year_name, tournament_name,
                            [sg_stats[i] for i in (1, 3, 4, 5, 6, 7, 8)])
            return True
        except Exception as e:
            self._logger.error('Failed running SG scrape due to {}'.format(e),
                               exc_info=True)
            return False

    def getSGCollection(self):
        # Accessor for the accumulated SG rows.
        return self._tournament_sg_col