Esempio n. 1
0
 def __init__(self, tournament_db):
     """Prepare an uploader for SG collection objects.

     Keeps the given database handle, opens the SG MongoDB log and starts
     both upload counters at zero.
     """
     # counters start at zero
     self._sg_stats_overall = 0
     self._sg_stats_upload = 0
     log_path = 'tournaments/SG/logs/sg_mongodb.log'
     self._logger = MyLogger('MongoDB SG', log_path, logging.INFO).getLogger()
     self._tournament_db = tournament_db
Esempio n. 2
0
 def __init__(self, mongo_obj, tournament_df, raw_sg_df, distances_df=None):
     """Store the supplied data frames and Mongo handle and open the log.

     distances_df is optional and is kept as None when not supplied.
     """
     # inputs handed in by the caller
     self._mongo_obj = mongo_obj
     self._tournament_df, self._raw_sg_df = tournament_df, raw_sg_df
     self._distances_df = distances_df
     # starts out empty
     self._sg_df_dict = dict()
     handler_log = MyLogger('sgHandler', 'Analysis/logs/sgHandler.log', logging.INFO)
     self._logger = handler_log.getLogger()
Esempio n. 3
0
class MongoInitialization:
    """Connects to MongoDB and creates the collections a caller needs."""

    # Collections created for each caller, in creation order, each paired
    # with the key spec of the unique index that is built on it.
    _COLLECTION_INDEXES = {
        'scraper': (
            ('tournament_detail', (('tournamentID', 1), ('pgaYear', -1))),
            ('player_metadata', (('playerID', 1),)),
            ('player_round', (('playerID', 1), ('tournamentID', 1),
                              ('pgaYear', -1), ('roundNumber', 1))),
            ('course_metadata', (('courseID', 1), ('tournamentID', 1),
                                 ('pgaYear', -1))),
            ('tournament_scrape_status', (('tournamentName', 1),
                                          ('pgaYear', -1))),
        ),
        'sg': (
            ('sg_stats', (('tournamentName', 1), ('pgaYear', -1),
                          ('playerName', 1))),
        ),
        'df': (
            ('tournament_df', (('tournamentName', 1), ('courseID', 1),
                               ('pgaYear', -1), ('roundNum', 1))),
            ('raw_sg_df', (('tournamentName', 1), ('pgaYear', -1))),
        ),
    }

    def __init__(self, called_from):
        """Connect to MongoDB and create the collections for one caller.

        Args:
            called_from: 'scraper', 'sg' or 'df'; selects which collections
                are created when missing. Any other value only opens the
                connection and logs what it finds.
        """
        self.connection_str = '{}'.format(MY_MONGO_DB_KEY)
        self._logger = MyLogger('MongoDB', 'MongoDB/logs/mongodb.log',
                                logging.INFO).getLogger()
        self._logger.info('Connecting to MongoDB...\n')
        self._client = pymongo.MongoClient(self.connection_str)
        self._tournament_db = self._client.tournament_db
        self._logger.info('Client description {}\n'.format(self._client))
        self._logger.info('Tournament DB description {}\n'.format(
            self._tournament_db))
        col_names = self._listCollectionNames()
        self._logger.info(
            'TournamentDB has the following collections {}\n'.format(
                col_names))
        for collection_name, index_keys in self._COLLECTION_INDEXES.get(
                called_from, ()):
            # hand over a fresh list so the shared table is never touched
            self._createCollection(collection_name, list(index_keys))

    def _listCollectionNames(self):
        """Return the collection names of the tournament database.

        Database.collection_names() was removed in PyMongo 4, so
        list_collection_names() is used whenever the driver provides it and
        the legacy call is only a fallback for old drivers.
        """
        db = self._tournament_db
        # Check the class, not the instance: a pymongo Database answers any
        # unknown attribute with a Collection, so hasattr(db, ...) is
        # always true and cannot tell the two driver generations apart.
        if hasattr(type(db), 'list_collection_names'):
            return db.list_collection_names()
        return db.collection_names()

    def _createCollection(self, collection_name, index_dict):
        """Create a collection with a unique index unless it already exists.

        Args:
            collection_name: name of the collection to create.
            index_dict: index key spec as a list of (field, direction)
                pairs (despite the name, not a dict).
        """
        if collection_name not in self._listCollectionNames():
            new_col = self._tournament_db[collection_name]
            idx = new_col.create_index(index_dict, unique=True)
            self._logger.info('Created {} Collection with index {}\n'.format(
                collection_name, idx))

    def __repr__(self):
        """Describe the client and the tournament database."""
        return 'MongoDB Client is {}\nTournament DB is {}\n'.format(
            self._client, self._tournament_db)

    def getTournamentDB(self):
        """Return the tournament database handle."""
        return self._tournament_db

    def getLogger(self):
        """Return the logger created for this connection."""
        return self._logger
Esempio n. 4
0
 def __init__(self, tournament_db, tournament_name):
     """Prepare an uploader for one tournament's data frames.

     Keeps the database handle and tournament name, opens that
     tournament's MongoDB log and marks both data frames as not uploaded.
     """
     self._name = tournament_name
     self._tournament_db = tournament_db
     # nothing has been uploaded yet
     self._raw_sg_df_upload = False
     self._tournament_df_upload = False
     logger_name = 'MongoDB Tournament DF {}'.format(self._name)
     log_path = 'tournaments/DFs/{}/logs/tournament_mongodb.log'.format(self._name)
     self._logger = MyLogger(logger_name, log_path, logging.INFO).getLogger()
Esempio n. 5
0
    def __init__(self):
        """Set up the SG scraper: target URL, result list, log and driver."""
        self.year_options = None

        # filled with scraped data later on
        self._tournament_sg_col = []
        self._sg_url = 'https://datagolf.com/historic-event-data'

        # log file, opened in 'w' mode
        self._file_handler = 'tournaments/SG/logs/sg_scape.log'
        scraper_log = MyLogger(self.__class__.__name__, self._file_handler, logging.INFO, 'w')
        self._logger = scraper_log.getLogger()

        # the web driver is given the scraper's logger
        self.web_driver = WebDriver(self._logger)
Esempio n. 6
0
 def __init__(self, tournament_db, tournament_year, tournament_name):
     """Prepare an uploader for one tournament's scraped collections.

     Keeps the database handle, year and name, opens the tournament's
     MongoDB log and resets every upload flag and counter.
     """
     self._tournament_db = tournament_db
     self._year, self._name = tournament_year, tournament_name

     logger_name = 'MongoDB {} {}'.format(self._year, self._name)
     log_path = 'tournaments/{}_{}/logs/tournament_mongodb.log'.format(self._year, self._name)
     self._logger = MyLogger(logger_name, log_path, logging.INFO).getLogger()

     # boolean upload flags
     self._tournament_detail_upload = False
     self._tournament_scrape_status_upload = False
     self._sg_stats_upload = False

     # paired "upload" / "overall" counters
     self._player_metadata_upload = self._player_metadata_overall = 0
     self._player_round_upload = self._player_round_overall = 0
     self._course_metadata_upload = self._course_metadata_overall = 0
Esempio n. 7
0
class MongoUploadSG:
    """Upserts SG stat documents into the ``sg_stats`` collection."""

    def __init__(self, tournament_db):
        """Keep the database handle, open the log and zero the counters."""
        self._sg_stats_overall = 0
        self._sg_stats_upload = 0
        self._tournament_db = tournament_db
        log_path = 'tournaments/SG/logs/sg_mongodb.log'
        self._logger = MyLogger('MongoDB SG', log_path, logging.INFO).getLogger()

    def _getUploadStatus(self):
        """Return the "uploaded of possible" summary line."""
        template = 'SG Stats Uploaded: {} of {} possible\n'
        return template.format(self._sg_stats_upload, self._sg_stats_overall)

    def __repr__(self):
        """Show the current upload status."""
        return 'MongoDB SG Upload Status: {}'.format(self._getUploadStatus())

    def uploadSGStats(self, sg_stats_list):
        """Upsert every document of sg_stats_list, keyed by player, tournament and year.

        Each document counts towards the overall total; it counts as
        uploaded when replace_one returns a result.
        """
        for sg_stats in sg_stats_list:
            self._sg_stats_overall += 1
            lookup_key = {
                'playerName': sg_stats['playerName'],
                'tournamentName': sg_stats['tournamentName'],
                'pgaYear': sg_stats['pgaYear'],
            }
            result = self._tournament_db.sg_stats.replace_one(lookup_key, sg_stats, upsert=True)
            if result is None:
                continue

            if result.upserted_id is None:
                # no upserted id: an existing document was replaced
                self._logger.info(
                    'Updated existing sg stats with key {}\n'.format(lookup_key))
            else:
                self._logger.info(
                    'Inserted SG stats into collection with id {}\n'.format(result.upserted_id))
            self._sg_stats_upload += 1
Esempio n. 8
0
    def __init__(self, pga_tournament, pga_year, driver=None):
        """Set up a scraper for one tournament and year.

        Builds the leaderboard URL, the empty result containers, the wire
        request templates and the logger. A web driver is created unless
        one is passed in; either way its log locations are updated.
        """
        self._pga_tournament = pga_tournament
        self._pga_year = pga_year
        self._tournament_url = ''.join([
            'https://www.pgatour.com/competition/', pga_year, '/', pga_tournament,
            '/leaderboard.html',
        ])
        self.successfully_scraped = 0
        self.tournament_id = None

        # containers filled in while scraping
        self._course_ids = set()
        self._tournament_info_dict = dict()
        self._player_meta_dict = dict()
        self._course_general_dict = dict()
        self._course_meta_dict = dict()
        self._player_round_dict = dict()
        self._unsuccessful_player_round_scrape = dict()
        self._course_requests = dict()
        self._row_dict = dict()

        # URL templates; the upper-case placeholders are replaced per request
        lbdata_root = 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/'
        self.template_wire_html_dict = {
            'tournament_detail': lbdata_root + 'leaderboard.json',
            'course_general': 'https://statdata.pgatour.com/r/TOURNAMENT_ID/course.json',
            'course_detail': lbdata_root + 'courseC_ID',
            'round_detail': lbdata_root + 'drawer/rROUND_NUM-mMAIN_PLAYER_ID',
        }

        # per-tournament directory and log file
        self.dir = ''.join(['tournaments/', self._pga_year, '_', self._pga_tournament, '/'])
        self._file_handler = self.dir + 'logs/tournament_scape.log'

        # " <year> <tournament>" is shared by the logger name and the driver
        log_suffix = ' ' + self._pga_year + ' ' + self._pga_tournament
        self._logger = MyLogger(self.__class__.__name__ + log_suffix,
                                self._file_handler, logging.INFO, 'a').getLogger()

        # reuse the caller's driver when given, otherwise start a new one
        self.web_driver = WebDriver(self._logger) if driver is None else driver
        self.web_driver.updateLogLocations(log_suffix, self._file_handler)
Esempio n. 9
0
    def __init__(self, mongo_obj, tournament_name_scrape, tournament_name_sg, force_create_sg=False,
                 force_create_tournament=False):
        """Load the raw SG and tournament data frames, building them when needed.

        Each frame is first read back through MongoDownload. It is rebuilt
        when the stored frame is empty or the matching force flag is set.
        """
        self._logger = MyLogger('dfHandler', 'Analysis/logs/dfHandler.log', logging.INFO).getLogger()
        self._tournament_name = tournament_name_scrape
        self._mongo_obj = mongo_obj
        downloader = MongoDownload(self._mongo_obj)
        self._mongo_upload_df = MongoUploadDF(self._mongo_obj.getTournamentDB(), self._tournament_name)

        # raw SG frame
        stored_sg = downloader.getRawSG_DF(tournament_name_scrape)
        self._raw_sg_df = pd.DataFrame(stored_sg)
        if self._raw_sg_df.empty or force_create_sg:
            self._logger.info('Creating New Raw SG DF')
            self._raw_sg_df = pd.DataFrame()
            sg_stats = downloader.getSGStatsForTournament(tournament_name_scrape, tournament_name_sg)
            player_names = downloader.getPlayerNames()
            self._createRawSG_DF(sg_stats, player_names)

        # tournament frame
        stored_tournament = downloader.getTournamentDF(tournament_name_scrape)
        self._tournament_df = pd.DataFrame(stored_tournament)
        if self._tournament_df.empty or force_create_tournament:
            self._logger.info('Creating New Tournament DF')
            self._tournament_df = pd.DataFrame()
            tournament_info = downloader.consolidateTournamentInfo(tournament_name_scrape)
            player_names = downloader.getPlayerNames()
            self._createTournamentDF(tournament_info, player_names)
Esempio n. 10
0
 def __init__(self, called_from):
     """Connect to MongoDB and create the collections for one caller.

     called_from picks the collection set: 'scraper', 'sg' or 'df'. Any
     other value only opens the connection and logs what it finds.
     """
     self.connection_str = '{}'.format(MY_MONGO_DB_KEY)
     self._logger = MyLogger('MongoDB', 'MongoDB/logs/mongodb.log',
                             logging.INFO).getLogger()
     self._logger.info('Connecting to MongoDB...\n')
     self._client = pymongo.MongoClient(self.connection_str)
     self._tournament_db = self._client.tournament_db
     self._logger.info('Client description {}\n'.format(self._client))
     self._logger.info('Tournament DB description {}\n'.format(
         self._tournament_db))
     # Database.collection_names() was removed in PyMongo 4, so prefer
     # list_collection_names(). The check is made on the class because a
     # pymongo Database answers any unknown attribute with a Collection,
     # which makes hasattr() on the instance always true.
     if hasattr(type(self._tournament_db), 'list_collection_names'):
         col_names = self._tournament_db.list_collection_names()
     else:
         col_names = self._tournament_db.collection_names()
     self._logger.info(
         'TournamentDB has the following collections {}\n'.format(
             col_names))
     if called_from == 'scraper':
         self._createCollection('tournament_detail', [('tournamentID', 1),
                                                      ('pgaYear', -1)])
         self._createCollection('player_metadata', [('playerID', 1)])
         self._createCollection('player_round', [('playerID', 1),
                                                 ('tournamentID', 1),
                                                 ('pgaYear', -1),
                                                 ('roundNumber', 1)])
         self._createCollection('course_metadata', [('courseID', 1),
                                                    ('tournamentID', 1),
                                                    ('pgaYear', -1)])
         self._createCollection('tournament_scrape_status',
                                [('tournamentName', 1), ('pgaYear', -1)])
     elif called_from == 'sg':
         self._createCollection('sg_stats', [('tournamentName', 1),
                                             ('pgaYear', -1),
                                             ('playerName', 1)])
     elif called_from == 'df':
         self._createCollection('tournament_df', [('tournamentName', 1),
                                                  ('courseID', 1),
                                                  ('pgaYear', -1),
                                                  ('roundNum', 1)])
         self._createCollection('raw_sg_df', [('tournamentName', 1),
                                              ('pgaYear', -1)])
Esempio n. 11
0
class TournamentScraper:
    """Given a tournament and year, this scrapes pgatour.com tournament result
     page to create json files containing data on tournament info and player course_hole by course_hole shots"""

    def __init__(self, pga_tournament, pga_year, driver=None):
        """Set up a scraper for one tournament and year.

        Builds the leaderboard URL, the empty result containers, the wire
        request templates and the logger. A web driver is created unless
        one is passed in; either way its log locations are updated.
        """
        self._pga_tournament = pga_tournament
        self._pga_year = pga_year
        self._tournament_url = ''.join([
            'https://www.pgatour.com/competition/', pga_year, '/', pga_tournament,
            '/leaderboard.html',
        ])
        self.successfully_scraped = 0
        self.tournament_id = None

        # containers filled in while scraping
        self._course_ids = set()
        self._tournament_info_dict = dict()
        self._player_meta_dict = dict()
        self._course_general_dict = dict()
        self._course_meta_dict = dict()
        self._player_round_dict = dict()
        self._unsuccessful_player_round_scrape = dict()
        self._course_requests = dict()
        self._row_dict = dict()

        # URL templates; the upper-case placeholders are replaced per request
        lbdata_root = 'https://lbdata.pgatour.com/PGA_YEAR/r/TOURNAMENT_ID/'
        self.template_wire_html_dict = {
            'tournament_detail': lbdata_root + 'leaderboard.json',
            'course_general': 'https://statdata.pgatour.com/r/TOURNAMENT_ID/course.json',
            'course_detail': lbdata_root + 'courseC_ID',
            'round_detail': lbdata_root + 'drawer/rROUND_NUM-mMAIN_PLAYER_ID',
        }

        # per-tournament directory and log file
        self.dir = ''.join(['tournaments/', self._pga_year, '_', self._pga_tournament, '/'])
        self._file_handler = self.dir + 'logs/tournament_scape.log'

        # " <year> <tournament>" is shared by the logger name and the driver
        log_suffix = ' ' + self._pga_year + ' ' + self._pga_tournament
        self._logger = MyLogger(self.__class__.__name__ + log_suffix,
                                self._file_handler, logging.INFO, 'a').getLogger()

        # reuse the caller's driver when given, otherwise start a new one
        self.web_driver = WebDriver(self._logger) if driver is None else driver
        self.web_driver.updateLogLocations(log_suffix, self._file_handler)

    def __repr__(self):
        """Print Scraper Class with year, tournament and scraped status"""
        return (self.__class__.__name__ + ' ' + self._pga_year + ' ' + self._pga_tournament
                + '\nScrape Status: Scraped {:.2f}% of potential data'.format(self.successfully_scraped))

    def _scrapeTournamentJSON(self, tournament_detail_json):
        """Fill the tournament info and player name dictionaries from the tournament JSON.

        Cut lines and general tournament fields go into
        ``self._tournament_info_dict``; dates and location are read from the
        page through the web driver. Each row's first and last name are
        stored in ``self._player_meta_dict`` under its player ID.
        """

        def lookup(key):
            # every field is resolved against the same JSON document
            return findKeyInJSON(tournament_detail_json, key)

        # a year mismatch is only logged; scraping carries on
        if self._pga_year != lookup('year'):
            self._logger.warning('Error: Non-matching PGA years. User Input {}; JSON {}'
                                 .format(self._pga_year, lookup('year')))

        # cut lines, numbered from 1
        self._tournament_info_dict['cuts'] = [
            {
                'cutNumber': number,
                'cutCount': cut['cut_count'],
                'cutScore': cut['cut_line_score'],
                'cutPaidCount': cut['paid_players_making_cut'],
            }
            for number, cut in enumerate(lookup('cutLines'), start=1)
        ]

        # remaining tournament fields: (output key, JSON key)
        details = {
            'tournamentID': self.tournament_id,
            'tournamentName': self._pga_tournament,
        }
        json_fields = (
            ('multiCourse', 'multiCourse'),
            ('totalRounds', 'totalRounds'),
            ('format', 'format'),
            ('pgaYear', 'year'),
            ('status', 'roundState'),
            ('playoff', 'playoffPresent'),
        )
        for out_key, json_key in json_fields:
            details[out_key] = lookup(json_key)
        details['dates'] = self.web_driver.findElementByXPath('.//span[@class = "dates"]')
        details['location'] = self.web_driver.findElementByXPath('.//span[@class = "name"]')
        self._tournament_info_dict.update(details)

        # player names keyed by player ID
        for row in lookup('rows'):
            entry = {}
            self._player_meta_dict[row['playerId']] = entry
            names = row['playerNames']
            entry['firstName'] = names['firstName']
            entry['lastName'] = names['lastName']

    def _scrapeCourseGeneral(self, course_general_json):
        """Record description, name and yardage for every course in the general course JSON.

        Entries are stored in ``self._course_general_dict`` under the
        course's 'number' value.
        """
        # (output key, JSON key) pairs, looked up in this order
        field_map = (('description', 'body'), ('name', 'name'), ('totalYards', 'yards'))

        for course in course_general_json['courses']:
            number = findKeyInJSON(course, 'number')
            self._course_general_dict[number] = {
                out_key: findKeyInJSON(course, json_key) for out_key, json_key in field_map
            }

    def _scrapePlayerDetail(self, main_player_id, round_num, round_detail_json):
        """Store shot data and round metadata from one player round JSON.

        Does nothing when the round is already stored for main_player_id.
        Otherwise every player found in the JSON gets a play-by-play and
        metadata entry for round_num, and a course detail request is queued
        the first time a course ID is seen. Finally the round is removed
        from the unsuccessful-scrape bookkeeping.
        """
        if round_num in self._player_round_dict.get(main_player_id, ()):
            self._logger.info(
                'Previously downloaded JSON for round {} from player ID {}'.format(round_num, main_player_id))
            return

        self._logger.info('Downloading JSON from round {} for player ID {}'.format(round_num, main_player_id))

        # queue a course detail request the first time a course shows up
        course_id = findKeyInJSON(round_detail_json, 'courseId')
        if course_id not in self._course_ids:
            wire = self.template_wire_html_dict['course_detail']
            substitutions = (
                ('PGA_YEAR', self._pga_year),
                ('TOURNAMENT_ID', self.tournament_id),
                ('C_ID', course_id),
            )
            for placeholder, value in substitutions:
                wire = wire.replace(placeholder, value)
            self._course_requests[course_id] = wire
            self._course_ids.add(course_id)

        # shots grouped as player ID -> hole ID -> shots
        shots_by_player = {}
        for hole in findKeyInJSON(round_detail_json, 'playersHoles'):
            hole_id = hole['holeId']
            for player in hole['players']:
                shots_by_player.setdefault(player['playerId'], {})[hole_id] = player['shots']

        # the requested player is expected to be part of the payload
        if main_player_id not in shots_by_player:
            self._logger.warning('Main Player ID is {}, player IDs in JSON File {}'.format(
                main_player_id, shots_by_player.keys()))

        # one round entry per player in the payload
        for player_id, hole_shots in shots_by_player.items():
            round_entry = self._player_round_dict.setdefault(player_id, {}).setdefault(round_num, {})
            round_entry['play-by-play'] = hole_shots
            round_entry['metadata'] = {
                'completedRound': findKeyInJSON(round_detail_json, 'roundComplete'),
                'groupId': findKeyInJSON(round_detail_json, 'groupId'),
                'startingHoleId': findKeyInJSON(round_detail_json, 'startingHoleId'),
                'courseId': findKeyInJSON(round_detail_json, 'courseId'),
                'playedWith': [partner for partner in shots_by_player if partner != player_id]
            }

        retry_key = ' '.join([main_player_id, round_num])
        self._unsuccessful_player_round_scrape.pop(retry_key, None)

    def _scrapeCourseDetail(self, c_id, course_detail_json):
        """Store par and per-hole round data from one course detail JSON.

        c_id is the course ID the request was queued under; a warning is
        logged when the JSON reports a different ID or one that was not
        already known. The result is stored in ``self._course_meta_dict``,
        extended with the general course data when present.
        """
        self._logger.info('Downloading JSON for course {}'.format(c_id))

        course_id = findKeyInJSON(course_detail_json, 'courseId')

        # the JSON should describe the course that was requested
        if course_id != c_id:
            self._logger.warning(
                'Course ID {} from course detail JSON mismatches the player round course ID {}'.format(course_id, c_id))

        # warn when the course was not among the IDs recorded so far
        known_ids = self._course_ids
        if known_ids and course_id not in known_ids:
            self._logger.warning(
                'Course ID {} came through the wire but did not exist in the general course JSON'.format(course_id))
        known_ids.add(course_id)

        # hole ID -> list of per-round details
        holes = {}
        for hole in findKeyInJSON(course_detail_json, 'holes'):
            per_round = [
                {
                    'round_Id': played['roundId'],
                    'distance': played['distance'],
                    'par': played['par'],
                    'stimp': played.get('stimp')
                }
                for played in hole['rounds']
            ]
            holes[hole['holeId']] = {'rounds': per_round}

        # course metadata, then any general course data on top
        course_meta = {}
        for field in ('courseCode', 'parIn', 'parOut', 'parTotal'):
            course_meta[field] = findKeyInJSON(course_detail_json, field)
        course_meta['holes'] = holes
        course_meta.update(self._course_general_dict.get(course_id, {}))
        self._course_meta_dict[course_id] = course_meta

    def _getTournamentJSON(self, req_str):
        """Get tournament details from the JSON request string, rerun scrape if this isn't working"""
        tournament_detail_json = self.web_driver.wireRequestToJSON(req_str)
        if tournament_detail_json:
            self._scrapeTournamentJSON(tournament_detail_json)
            return True
        else:
            return False

    def _getCourseGeneralJSON(self, req_str):
        """Get course general details from the JSON request string"""
        course_general_json = self.web_driver.wireRequestToJSON(req_str)
        if course_general_json:
            self._scrapeCourseGeneral(course_general_json)

    def _getPlayerLevelJSON(self, req_str, main_player_id, round_num):
        """Get player level details from the JSON request string"""
        round_detail_json = self.web_driver.wireRequestToJSON(req_str)
        if round_detail_json:
            self._scrapePlayerDetail(main_player_id, round_num, round_detail_json)
            return True
        else:
            return False

    def _getCourseDetailJSON(self):
        """Get course details from the JSON request string"""
        for c_id, req_str in self._course_requests.items():
            course_detail_json = self.web_driver.wireRequestToJSON(req_str)
            if course_detail_json:
                self._scrapeCourseDetail(c_id, course_detail_json)

    def _getTournamentID(self):
        """Read the numeric tournament ID from the page's deeplink meta tag.

        The first run of digits in the tag's ``content`` attribute is stored
        in ``self.tournament_id``.

        Returns:
            True when an ID was found and stored, False otherwise. Failures
            are logged and ``self.tournament_id`` is left unchanged.
        """
        meta_tag = self.web_driver.webDriverWait(self.web_driver.getDriver(),
                                                 EC.presence_of_element_located(
                                                     (By.XPATH,
                                                      "//meta[@name='branch:deeplink:tournament_id']")),
                                                 'Error getting tournament_id\n{}')
        if meta_tag is None:
            self._logger.error('Could not get a tournament ID out of {}\n'.format(meta_tag))
            return False

        # A missing attribute may come back as None; treat it as empty text
        # so the search below cannot raise TypeError.
        content = meta_tag.get_attribute('content') or ''

        # re.search returns None when there are no digits, where indexing
        # the findall() result would raise IndexError before the failure
        # branch could run.
        id_match = re.search(r'\d+', content)
        if id_match is None:
            self._logger.error('Could not get a tournament ID out of string {}\n'.format(content))
            return False

        self.tournament_id = id_match.group(0)
        self._logger.info('Tournament ID is {}'.format(self.tournament_id))
        return True

    def _scrapeThroughPlayerRow(self, row):
        """Open one leaderboard row and collect the wire requests for its rounds.

        Clicks the player's name column to open the player drawer, then walks
        the round buttons of the drawer's round selector. For every round not
        already present in ``self._player_round_dict`` a request description
        is built from the ``round_detail`` URL template.

        Args:
            row: the player's row element; presumably a Selenium WebElement
                (TODO confirm against the caller).

        Returns:
            A list of dicts with the keys 'PlayerID', 'RoundNum' and 'Wire'.
            The list is returned empty as soon as one of the element waits
            yields None.
        """

        player_reqs = []
        # get player's shot information chart open on url
        # reading this property is done for its side effect; the value is discarded
        _ = row.location_once_scrolled_into_view
        # first run of digits in the row's class attribute is the player ID;
        # raises IndexError when the attribute contains no digits
        main_player_id = re.findall(r'\d+', row.get_attribute('class'))[0]
        player_name_col_button = self.web_driver.webDriverWait(row,
                                                               EC.element_to_be_clickable(
                                                                   (By.CLASS_NAME, 'player-name-col')),
                                                               'Error getting player column to click\n{}')
        if player_name_col_button is None:
            return player_reqs
        _ = player_name_col_button.location_once_scrolled_into_view
        player_name_col_button.click()

        # get the player drawer that opens
        player_drawer = self.web_driver.webDriverWait(row.parent,
                                                      EC.visibility_of_element_located(
                                                          (By.ID, 'playerDrawer{}'.format(main_player_id))),
                                                      'Error getting player drawer\n{}')
        if player_drawer is None:
            return player_reqs
        # get round by round data by clicking player round buttons
        round_selector = self.web_driver.webDriverWait(player_drawer,
                                                       EC.visibility_of_element_located(
                                                           (By.CLASS_NAME, 'round-selector')),
                                                       'Error getting round selector\n{}')
        if round_selector is None:
            return player_reqs

        # text of the round button that is active when the drawer opens;
        # that button is not clicked again in the loop below
        last_round = round_selector.find_element_by_class_name('round.active').text
        # wait for a round button to become clickable; the result is not used
        self.web_driver.webDriverWait(round_selector,
                                      EC.element_to_be_clickable(
                                          (By.CLASS_NAME, 'round')),
                                      'Error getting round button to click\n{}')
        rounds = round_selector.find_elements_by_class_name('round')

        # go round by round to scrape data
        for round_button in rounds:
            round_num = round_button.text
            # skip rounds already stored for this player
            if main_player_id in self._player_round_dict and round_num in self._player_round_dict[main_player_id]:
                self._logger.info(
                    'Previously scraped data for round {} from player ID {}'.format(round_num, main_player_id))
                continue

            self._logger.info('Getting JSON wire for round {} from player ID {}'.format(round_num, main_player_id))
            # fill the round_detail URL template for this player and round
            player_reqs.append(
                {'PlayerID': main_player_id,
                 'RoundNum': round_num,
                 'Wire':
                     self.template_wire_html_dict['round_detail']
                         .replace('PGA_YEAR', self._pga_year)
                         .replace('TOURNAMENT_ID', self.tournament_id)
                         .replace('ROUND_NUM', round_num)
                         .replace('MAIN_PLAYER_ID', main_player_id)})

            if round_num != last_round:
                # NOTE(review): implicitly_wait looks like it sets the driver's
                # element lookup timeout rather than pausing here - confirm
                # this is the intended effect
                self.web_driver.getDriver().implicitly_wait(.1)
                round_button.click()

        # this closes the player's shot information chart
        # player_name_col_button.click()
        return player_reqs

    def _checkScrapeResults(self):
        """After getting all JSON and converting to dictionaries, check to see how we did"""
        if len(self._player_round_dict) == len(self._player_meta_dict):
            self.successfully_scraped = 100
            self._logger.info('Successfully scraped data for all players in tournament {} {}'
                              .format(self._pga_year, self._pga_tournament))
        elif len(self._player_round_dict) == 0:
            self._logger.info(
                'Unsuccessfully scraped data for tournament {} {}'.format(self._pga_year, self._pga_tournament))
        elif len(self._player_round_dict) < len(self._player_meta_dict):
            self.successfully_scraped = (len(self._player_round_dict) / len(self._player_meta_dict)) * 100
            self._logger.info('Only scraped data for {:.2f}% of players in tournament {} {}'.
                              format(self.successfully_scraped,
                                     self._pga_year,
                                     self._pga_tournament))
            self._logger.info(
                'Player rows unsuccessfully scraped are:\n{}'.format(self._unsuccessful_player_round_scrape.keys()))

    def runScrape(self):
        """Run the full scrape for this tournament.

        Loads the tournament page, requests the tournament-level and course-level JSON, then
        walks the player rows in three interleaved passes (rows 0, 3, 6, ... then 1, 4, 7, ...
        then 2, 5, 8, ...) requesting the per-player, per-round JSON. Rows whose requests all
        succeeded are removed from ``self._row_dict``; the rows still left there afterwards are
        scraped one final time before the results are summarised.

        Returns:
            bool: ``False`` when the tournament ID, the player rows or the tournament detail
            JSON cannot be obtained, or when more than five rows in succession fail a player
            request; ``True`` otherwise (even if some player rounds could not be scraped).
        """
        self._logger.info(
            '\nRunning Scrape for {} {}\nURL is {}\n'.format(self._pga_year, self._pga_tournament,
                                                             self._tournament_url))
        self.web_driver.goToURL(self._tournament_url)
        if not self._getTournamentID():
            return False

        row_lines = self.web_driver.webDriverWait(self.web_driver.getDriver(),
                                                  EC.visibility_of_all_elements_located(
                                                      (By.CSS_SELECTOR, 'tr.line-row.line-row')),
                                                  'Error locating player elements on page\n{}')
        if row_lines is None:
            return False

        # request string for tournament detail
        tournament_req_str = self.template_wire_html_dict['tournament_detail'].replace(
            'PGA_YEAR', self._pga_year).replace('TOURNAMENT_ID', self.tournament_id)
        # scrape JSON of tournament detail
        if not self._getTournamentJSON(tournament_req_str):
            self._logger.error('Failed getting tournament details.')
            return False

        # request string for course general info
        course_gen_req_str = self.template_wire_html_dict['course_general'].replace(
            'TOURNAMENT_ID', self.tournament_id)
        # scrape JSON of course general
        self._getCourseGeneralJSON(course_gen_req_str)

        successive_failures = 0
        # split up player JSON requests because some data overlaps in the play by play JSON
        for i in range(3):
            remove_rows = []
            # run first time through and keep track of unsuccessful scrapes
            for row_num, row in enumerate(row_lines[i::3]):
                # map the position inside the strided slice back to the real row index
                row_num = i + (row_num * 3)
                if row_num not in self._row_dict:
                    self._logger.info('Iterating over row {}'.format(row_num))
                    self._row_dict[row_num] = self._scrapeThroughPlayerRow(row)
            for row_num, player_requests in self._row_dict.items():
                for request in player_requests:
                    req_str = request['Wire']
                    main_player_id = request['PlayerID']
                    round_num = request['RoundNum']
                    if not self._getPlayerLevelJSON(req_str, main_player_id, round_num):
                        self._unsuccessful_player_round_scrape[' '.join([main_player_id, round_num])] = req_str
                        self._logger.warning(
                            'Unsuccessfully retrieved JSON for player ID {} -- round '
                            'number {}. Will retry this row later.\n'.format(main_player_id, round_num))
                        successive_failures += 1
                        # give up on this row for now; it stays in _row_dict for the retry
                        break
                    else:
                        successive_failures = 0
                else:
                    # no break: every request of the row succeeded
                    remove_rows.append(row_num)

                # Something's wrong
                if successive_failures > 5:
                    # Logger.warn is a deprecated alias; use warning() like the rest of the method
                    self._logger.warning(
                        'Had 5 successive failures while getting player round JSON, exiting scrape')
                    return False
            # remove successful rows (done after the loop so the dict is not resized mid-iteration)
            for row_num in remove_rows:
                del self._row_dict[row_num]

        # can get course detail data once all players have been added with the courses they played
        self._getCourseDetailJSON()

        # run through a second time with all the rows that were unsuccessful at first
        for row_num in self._row_dict.keys():
            self._logger.info('Iterating over row {}'.format(row_num))
            self._row_dict[row_num] = self._scrapeThroughPlayerRow(row_lines[row_num])
        for row_num, player_requests in self._row_dict.items():
            for request in player_requests:
                req_str = request['Wire']
                main_player_id = request['PlayerID']
                round_num = request['RoundNum']
                if not self._getPlayerLevelJSON(req_str, main_player_id, round_num):
                    self._logger.warning(
                        'Unsuccessfully retrieved JSON for player ID {} -- round '
                        'number {} Final attempt.\n'.format(main_player_id, round_num))

        self._checkScrapeResults()
        return True

    def __convertPlayerRoundToMongoDBCollection(self):
        """Flatten ``self._player_round_dict`` into one document per player and round.

        Each document carries the player/round/tournament/year keys, the round's
        ``'metadata'`` entries and a ``'holes'`` list built from the round's
        ``'play-by-play'`` mapping (one ``{'holeNumber', 'shots'}`` entry per hole).
        """
        documents = []
        for player_id, rounds in self._player_round_dict.items():
            for round_key, round_values in rounds.items():
                document = dict(playerID=player_id,
                                roundNumber=round_key,
                                tournamentID=self.tournament_id,
                                pgaYear=self._pga_year)
                document.update(round_values['metadata'])
                document['holes'] = [
                    {'holeNumber': hole_key, 'shots': list(hole_shots)}
                    for hole_key, hole_shots in round_values['play-by-play'].items()
                ]
                documents.append(document)
        return documents

    def __convertPlayerMetaToMongoDBCollection(self):
        """Turn ``self._player_meta_dict`` into a list of documents, one per player.

        Every document starts with the ``'playerID'`` key followed by that player's
        metadata entries.
        """
        documents = []
        for player_id in self._player_meta_dict:
            document = dict(playerID=player_id)
            document.update(self._player_meta_dict[player_id])
            documents.append(document)
        return documents

    def __convertCourseMetaToMongoDBCollection(self):
        """Turn ``self._course_meta_dict`` into a list of documents, one per course.

        Each document holds the course/year/tournament keys plus the course details,
        with the ``'holes'`` mapping rewritten as a list of per-hole dictionaries that
        each start with their ``'holeNumber'``.
        """
        documents = []
        for course_id in self._course_meta_dict:
            document = dict(courseID=course_id,
                            pgaYear=self._pga_year,
                            tournamentID=self.tournament_id)
            document.update(self._course_meta_dict[course_id])
            holes = []
            for hole_number, hole_info in document['holes'].items():
                hole_document = dict(holeNumber=hole_number)
                hole_document.update(hole_info)
                holes.append(hole_document)
            # replace the mapping on the copy only; the source dictionary is left untouched
            document['holes'] = holes
            documents.append(document)
        return documents

    def convertDictsToMongoDBCollection(self):
        """Convert every scrape dictionary of this object into its MongoDB form.

        Returns a four-element list: player round documents, player metadata documents,
        course metadata documents and the tournament info dictionary, in that order.
        """
        player_rounds = self.__convertPlayerRoundToMongoDBCollection()
        player_meta = self.__convertPlayerMetaToMongoDBCollection()
        course_meta = self.__convertCourseMetaToMongoDBCollection()
        return [player_rounds, player_meta, course_meta, self._tournament_info_dict]

    def uploadDictsToJSON(self):
        """Write the four scrape dictionaries to JSON files under ``self.dir`` (debugging aid)."""
        targets = (('player_round.json', '_player_round_dict'),
                   ('player_meta.json', '_player_meta_dict'),
                   ('tournament_info.json', '_tournament_info_dict'),
                   ('course_meta.json', '_course_meta_dict'))
        for file_name, attribute in targets:
            with open(self.dir + file_name, 'w') as handle:
                json.dump(getattr(self, attribute), handle)

    def downloadDictsFromJSON(self):
        """Load the four scrape dictionaries back from the JSON files under ``self.dir`` (debugging aid)."""
        sources = (('player_round.json', '_player_round_dict'),
                   ('player_meta.json', '_player_meta_dict'),
                   ('tournament_info.json', '_tournament_info_dict'),
                   ('course_meta.json', '_course_meta_dict'))
        for file_name, attribute in sources:
            with open(self.dir + file_name, 'r') as handle:
                setattr(self, attribute, json.load(handle))
import itertools
import logging

import pandas as pd

from Logging.MyLogger import MyLogger
from MongoDB.MongoDownload import MongoDownload
from MongoDB.MongoInitialization import MongoInitialization
from TournamentRun import TournamentRun

# tournaments_path = 'tournaments/FailedTournamentList.csv'
tournaments_path = 'tournaments/TournamentList.csv'

if __name__ == '__main__':
    max_drivers = 2
    main_logger = MyLogger('Main', 'Main/logs/main.log', logging.INFO).getLogger()
    mongo_obj = MongoInitialization('scraper')
    tournament_df = pd.read_csv(tournaments_path, delimiter=',')
    tournament_df.columns = tournament_df.columns.str.strip()
    tournament_df['Name'] = tournament_df['Name'].str.strip()
    mongo_download = MongoDownload(mongo_obj)
    tournaments_scraped = mongo_download.getTournamentsScraped()
    filter_tournaments = tournament_df[~tournament_df[['Name', 'Year']].apply(tuple, 1).isin(tournaments_scraped)]
    tournaments = filter_tournaments.apply(lambda row: TournamentRun(row[0], row[1], mongo_obj, main_logger),
                                           axis=1).tolist()
    iter_tournaments = iter(tournaments)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_drivers) as executor:
        # Only schedule max_drivers amount of futures to start
        futures = {
            executor.submit(tournament.runTournament, None, True): tournament
Esempio n. 13
0
class MongoUploadDF:
    """Upsert tournament and raw strokes-gained data-frame documents into MongoDB.

    Upload problems are logged rather than raised; the two ``*_upload`` flags record
    whether the most recent attempts completed without an exception and are reported
    by ``repr()``.
    """

    def __init__(self, tournament_db, tournament_name):
        """For uploading tournament DF to MongoDB

        Args:
            tournament_db: database object exposing the ``tournament_df`` and
                ``raw_sg_df`` collections.
            tournament_name: tournament name, used for the logger name and log path.
        """
        self._tournament_db = tournament_db
        self._name = tournament_name
        self._logger = MyLogger(
            'MongoDB Tournament DF {}'.format(self._name),
            'tournaments/DFs/{}/logs/tournament_mongodb.log'.format(
                self._name), logging.INFO).getLogger()
        self._tournament_df_upload = False
        self._raw_sg_df_upload = False

    def __repr__(self):
        return 'MongoDB DF Upload Status: {}'.format(self._getUploadStatus())

    def _upsertDF(self, collection, query, upload_dict, description):
        """Upsert ``upload_dict`` into ``collection`` under ``query`` and log the outcome.

        Uses ``update_one`` (``Collection.update`` is deprecated since PyMongo 3.0 and
        removed in 4.0) and reads the outcome from ``UpdateResult.upserted_id``, the same
        way ``MongoUploadTournament`` does for its ``replace_one`` calls.
        """
        result = collection.update_one(query, {'$set': upload_dict}, upsert=True)
        if result is None:
            return
        if result.upserted_id is not None:
            self._logger.info('Inserted {} into collection with id {}\n'.format(
                description, result.upserted_id))
        else:
            self._logger.info('Updated existing {} with key {}\n'.format(
                description, query))

    def uploadTournamentDF(self, upload_dict):
        """Upsert one tournament DF document.

        Args:
            upload_dict: document containing at least ``'tournamentName'``, ``'pgaYear'``,
                ``'courseID'`` and ``'roundNum'``; those four values form the upsert key.

        Any exception (including a missing key) is logged and leaves
        ``_tournament_df_upload`` unchanged; on success the flag is set to ``True``.
        """
        try:
            tournament_name = upload_dict['tournamentName']
            pga_year = upload_dict['pgaYear']
            course_id = upload_dict['courseID']
            round_num = upload_dict['roundNum']
            self._logger.info(
                'Attempting to upload {} {}, course {}, round #{}'.format(
                    pga_year, tournament_name, course_id, round_num))
            query = {
                'tournamentName': tournament_name,
                'courseID': course_id,
                'pgaYear': pga_year,
                'roundNum': round_num
            }
            self._upsertDF(self._tournament_db.tournament_df, query,
                           upload_dict, 'Tournament DF')
        except Exception as e:
            # best effort by design: report the problem and carry on with the next upload
            self._logger.error('Problem uploading DF {}'.format(e),
                               exc_info=True)
        else:
            self._tournament_df_upload = True

    def uploadRawSG_DF(self, upload_dict):
        """Upsert one raw strokes-gained DF document.

        Args:
            upload_dict: document containing at least ``'tournamentName'`` and
                ``'pgaYear'``; those two values form the upsert key.

        Any exception (including a missing key) is logged and leaves
        ``_raw_sg_df_upload`` unchanged; on success the flag is set to ``True``.
        """
        try:
            tournament_name = upload_dict['tournamentName']
            pga_year = upload_dict['pgaYear']
            self._logger.info('Attempting to upload {} {}'.format(
                pga_year, tournament_name))
            query = {'tournamentName': tournament_name, 'pgaYear': pga_year}
            self._upsertDF(self._tournament_db.raw_sg_df, query,
                           upload_dict, 'Raw SG DF')
        except Exception as e:
            # best effort by design: report the problem and carry on with the next upload
            self._logger.error('Problem uploading DF {}'.format(e),
                               exc_info=True)
        else:
            self._raw_sg_df_upload = True

    def _getUploadStatus(self):
        """Return a two-line summary of the upload flags."""
        return 'Tournament DF Upload: {}\nRaw SG DF Upload: {}\n'.format(
            self._tournament_df_upload, self._raw_sg_df_upload)
Esempio n. 14
0
class MongoUploadTournament:
    """Upload the collections produced by one tournament scrape and track what was written."""

    def __init__(self, tournament_db, tournament_year, tournament_name):
        """For uploading tournament scrape collection objects to MongoDB"""
        self._tournament_db = tournament_db
        self._year = tournament_year
        self._name = tournament_name
        logger_name = 'MongoDB {} {}'.format(self._year, self._name)
        log_path = 'tournaments/{}_{}/logs/tournament_mongodb.log'.format(
            self._year, self._name)
        self._logger = MyLogger(logger_name, log_path, logging.INFO).getLogger()
        # single-document uploads are tracked as flags
        self._tournament_detail_upload = False
        self._tournament_scrape_status_upload = False
        self._sg_stats_upload = False
        # multi-document uploads are tracked as "uploaded" versus "seen" counters
        self._player_metadata_upload = 0
        self._player_metadata_overall = 0
        self._player_round_upload = 0
        self._player_round_overall = 0
        self._course_metadata_upload = 0
        self._course_metadata_overall = 0

    def __repr__(self):
        status = self._getUploadStatus()
        return 'MongoDB Tournament Upload Status: {}'.format(status)

    def uploadTournamentDetails(self, tournament_details):
        """Replace (or insert) the tournament detail document keyed by tournament ID and year."""
        key = {
            'tournamentID': tournament_details['tournamentID'],
            'pgaYear': tournament_details['pgaYear']
        }
        result = self._tournament_db.tournament_detail.replace_one(
            key, tournament_details, upsert=True)
        if result is None:
            return
        if result.upserted_id is None:
            self._logger.info(
                'Updated existing tournament details with key {}\n'.format(key))
        else:
            self._logger.info(
                'Inserted tournament details into collection with id {}\n'.
                format(result.upserted_id))
        self._tournament_detail_upload = True

    def uploadPlayerMetadata(self, player_metadata):
        """Insert metadata for every player whose ``playerID`` is not stored yet."""
        for player in player_metadata:
            self._player_metadata_overall += 1
            stored = self._tournament_db.player_metadata.find_one(
                {"playerID": player['playerID']})
            if stored is not None:
                # already known player: nothing to write
                continue
            result = self._tournament_db.player_metadata.insert_one(player)
            if result is None:
                continue
            self._logger.info(
                'Inserted player metadata into collection with id {}\n'
                .format(result.inserted_id))
            self._player_metadata_upload += 1

    def uploadPlayerRounds(self, player_rounds):
        """Replace (or insert) each player round keyed by player, tournament, year and round."""
        for player in player_rounds:
            self._player_round_overall += 1
            key = {
                'playerID': player['playerID'],
                'tournamentID': player['tournamentID'],
                'pgaYear': player['pgaYear'],
                'roundNumber': player['roundNumber']
            }
            result = self._tournament_db.player_round.replace_one(
                key, player, upsert=True)
            if result is None:
                continue
            if result.upserted_id is None:
                self._logger.info(
                    'Updated existing player rounds with key {}\n'.format(key))
            else:
                self._logger.info(
                    'Inserted player rounds into collection with id {}\n'.
                    format(result.upserted_id))
            self._player_round_upload += 1

    def uploadCourseMetadata(self, course_metadata):
        """Replace (or insert) each course document keyed by course, tournament and year."""
        for course in course_metadata:
            self._course_metadata_overall += 1
            key = {
                'courseID': course['courseID'],
                'tournamentID': course['tournamentID'],
                'pgaYear': course['pgaYear']
            }
            result = self._tournament_db.course_metadata.replace_one(
                key, course, upsert=True)
            if result is None:
                continue
            if result.upserted_id is None:
                self._logger.info(
                    'Updated existing course metadata with key {}\n'.format(key))
            else:
                self._logger.info(
                    'Inserted course metadata into collection with id {}\n'
                    .format(result.upserted_id))
            self._course_metadata_upload += 1

    def uploadTournamentScrapeStatus(self, scrape_status):
        """Replace (or insert) the scrape status document keyed by tournament ID and year."""
        key = {
            'tournamentID': scrape_status['tournamentID'],
            'pgaYear': scrape_status['pgaYear']
        }
        result = self._tournament_db.tournament_scrape_status.replace_one(
            key, scrape_status, upsert=True)
        if result is None:
            return
        if result.upserted_id is None:
            self._logger.info(
                'Updated existing tournament scrape status with key {}\n'.
                format(key))
        else:
            self._logger.info(
                'Inserted tournament scrape status into collection with id {}\n'
                .format(result.upserted_id))
        self._tournament_scrape_status_upload = True

    def _getUploadStatus(self):
        """Return a multi-line, human-readable summary of the upload flags and counters."""
        summary = [
            '{} {}\n'.format(self._year, self._name),
            'Tournament Details Uploaded: {}\n'.format(self._tournament_detail_upload),
            'Player Metadata Uploaded: {} new players of {} total players\n'.format(
                self._player_metadata_upload, self._player_metadata_overall),
            'Player Rounds Uploaded: {} of {} possible\n'.format(
                self._player_round_upload, self._player_round_overall),
            'Course Metadata Uploaded: {} of {} possible\n'.format(
                self._course_metadata_upload, self._course_metadata_overall),
            'Tournament Scrape Status Uploaded: {}\n'.format(
                self._tournament_scrape_status_upload),
        ]
        return ''.join(summary)
Esempio n. 15
0
class dfHandler:
    """Build, hold and upload the shot-level tournament DF and the raw strokes-gained DF.

    On construction both frames are downloaded from MongoDB; a frame that comes back empty
    (or whose ``force_create_*`` flag is set) is rebuilt from the scraped collections.
    """
    # NOTE(review): the code below multiplies these thresholds by 36 or 12 before comparing
    # them with shot distances, so they look like yards (hole / green distances) and feet
    # (long putt) against distances stored in inches -- confirm against the scraped data.
    max_hole_dist = 700
    max_green_dist = 50
    arg_green_dist = 30
    long_putt_dist = 12
    # share of the start distance that must remain for an approach shot to be flagged 'isAdvanced'
    adv_pct = .5
    pd.set_option('display.max_columns', None)

    @staticmethod
    def getNameAbbr(row):
        """Return ``'<first initial>. <last name>'`` for a row with ``firstName``/``lastName``."""
        return '. '.join([row.firstName[0], row.lastName])

    @staticmethod
    def getShotType(row):
        """Classify a shot row as 'Penalty', 'TEE', 'ARG', 'LNG_PUTT', 'SHT_PUTT', 'APP' or 'Unknown'.

        The decision uses ``shotDistance``, ``fromSurface``, ``par`` and ``startDistance``;
        an unrecognised ``fromSurface`` code is printed and mapped to 'Unknown'.
        """
        if row.shotDistance == 0:
            val = 'Penalty'
        elif row.fromSurface == 'OTB' and row.par in [4, 5]:
            val = 'TEE'
        elif row.fromSurface in ['OGR', 'OCO']:
            if row.startDistance > dfHandler.long_putt_dist * 12:
                if row.fromSurface == 'OCO':
                    val = 'ARG'
                else:
                    val = 'LNG_PUTT'
            else:
                val = 'SHT_PUTT'
        elif row.fromSurface in ['OFW', 'ORO', 'OST', 'OIR', 'ONA', 'OTH', 'OTB', 'OWL', 'OBR', 'OWA'] \
                and row.startDistance > (36 * dfHandler.arg_green_dist):
            val = 'APP'
        elif row.fromSurface in ['OFW', 'ORO', 'OST', 'OIR', 'ONA', 'OTH', 'OGS', 'OWL', 'OBR', 'OWA']:
            val = 'ARG'
        else:
            print('Unidentified from val {}'.format(row.fromSurface))
            val = 'Unknown'
        return val

    @staticmethod
    def getEndLocation(row):
        """Return ``(direction, surface)`` describing where the shot in ``row`` finished.

        ``direction`` is 'Left', 'Right', 'Penalty' or ''; ``surface`` is one of 'Green',
        'Hole', 'Fairway', 'Rough', 'Bunker', 'Trouble', 'Water' or 'Unknown' (an
        unrecognised ``to`` code is printed).
        """
        if row.to in ['ELI', 'ELF', 'ELR', 'EG5', 'EG6', 'EG7']:
            direction = 'Left'
        elif row.to in ['ERI', 'ERF', 'ERR', 'EG2', 'EG1', 'EG3']:
            direction = 'Right'
        elif row.toSurface == 'Penalty':
            direction = 'Penalty'
        else:
            direction = ''

        if row.to == 'OGR':
            val = 'Green'
        elif row.to == 'hole':
            val = 'Hole'
        elif row.to in ['ELF', 'ERF', 'ERI', 'ELI', 'OFW', 'OIR', 'OCO']:
            val = 'Fairway'
        elif row.to in ['ERR', 'ELR', 'ORO', 'OCA', 'OWL', 'OBR']:
            val = 'Rough'
        elif row.to in ['OST', 'EG2', 'EG5', 'EG6', 'EG1', 'EG4', 'EG3', 'EG7', 'OGS', 'EG8']:
            val = 'Bunker'
        elif row.to in ['ONA', 'OTH', 'OUK', 'OTB']:
            val = 'Trouble'
        elif row.to == 'OWA':
            val = 'Water'
        else:
            print('Unidentified to val {}'.format(row.to))
            val = 'Unknown'
        return direction, val

    @staticmethod
    def getDateTimes(dates_str):
        """Parse ``'<weekday> <mon> <day> - <weekday> <mon> <day>, <year>'`` into two datetimes.

        Returns:
            tuple: ``(first_day, last_day)`` as ``datetime`` objects.
        """
        dates, year = dates_str.strip().split(',')
        first_day, last_day = dates.strip().split('-')
        return datetime.strptime('{} {}'.format(first_day.strip(), year), '%A %b %d %Y'), datetime.strptime(
            '{} {}'.format(last_day.strip(), year), '%A %b %d %Y')

    @staticmethod
    def getQuantiles(df, grouping='shotType', cut_on='distanceLeft'):
        """Add one quantile-bin column per group of ``grouping``, cut on ``cut_on``.

        For every group the number of quantiles starts at 20 and is lowered until the group
        has more values than bins and all quantile edges are distinct. The bins are written to
        ``df['distanceLeftQuantileBin<group name>']`` (the column prefix does not follow
        ``cut_on``) and ``df`` is returned.
        """
        shot_types = df.groupby(by=grouping)
        for name, group in shot_types:
            quantile = 20
            for _ in range(20):
                if (group[cut_on].count() <= quantile) or \
                        (len(np.unique(
                            np.quantile(group[cut_on], np.linspace(0, 1, quantile, endpoint=False)))) < quantile):
                    quantile -= 1
                else:
                    break

            pct_labels = []
            for x in np.linspace(0, 100, quantile, endpoint=False):
                pct_labels.append('({:.2f}% to {:.2f}%]'.format(x, x + 100 / quantile))
            pct_labels.reverse()
            df['distanceLeftQuantileBin{}'.format(name)] = pd.qcut(group[cut_on],
                                                                   q=quantile,
                                                                   precision=0,
                                                                   labels=pct_labels)
        return df

    @staticmethod
    def getBinValues(cut_on, end_bin, interval, yds_or_feet):
        """Bin ``cut_on`` into labelled, equal-width intervals between 0 and ``end_bin * 36``.

        Args:
            cut_on: values to bin.
            end_bin: upper end of the binned range before the ``* 36`` scaling.
            interval: width of one label step.
            yds_or_feet: unit text appended to every label; ``'ft'`` triples the number of bins.

        Returns:
            The categorical result of ``pd.cut`` with labels like ``'(0 to 10] yds'``.
        """
        if yds_or_feet == 'ft':
            multiplier = 3
        else:
            multiplier = 1

        labels = []
        for x in range(0, end_bin * multiplier, interval):
            labels.append('({} to {}] {}'.format(x, x + interval, yds_or_feet))
        return pd.cut(x=cut_on,
                      bins=np.linspace(0, end_bin * 36,
                                       int((end_bin * multiplier) / interval) + 1),
                      precision=0,
                      labels=labels,
                      include_lowest=True,
                      right=True)

    @staticmethod
    def createHoleLevelDict(tournament_year_dict):
        """Re-key the consolidated tournament info by year, course, hole number and round.

        Every leaf holds the round information of that hole (without ``'round_Id'``), a
        ``'roundDate'`` (first tournament day plus the round's position in the hole's
        ``'rounds'`` list) and ``'playerShots'``, a mapping of player ID to that player's shots.
        """
        year_course_hole_round = {}
        for pga_year in tournament_year_dict.keys():
            dates_str = tournament_year_dict[pga_year]['dates']
            first_dt, last_dt = dfHandler.getDateTimes(dates_str)

            course_dict = {}
            for course in tournament_year_dict[pga_year]['courses']:
                hole_based_dict = {}
                course_id = course['courseID']
                for course_hole in course['holes']:
                    hole_based_dict[course_hole['holeNumber']] = {}
                    for i, round_info in enumerate(course_hole['rounds']):
                        hole_based_dict[course_hole['holeNumber']][round_info['round_Id']] \
                            = {k: round_info[k] for k in round_info if k != 'round_Id'}
                        hole_based_dict[course_hole['holeNumber']][round_info['round_Id']].update(
                            {'roundDate': first_dt + timedelta(days=i),
                             'playerShots': {}})
                for player_round in tournament_year_dict[pga_year]['playerRounds']:
                    # only attach rounds that were played on this course
                    if course_id != player_round['courseId']:
                        continue
                    for player_hole in player_round['holes']:
                        hole_based_dict[player_hole['holeNumber']][player_round['roundNumber']][
                            'playerShots'][player_round['playerID']] = player_hole['shots']
                course_dict[course_id] = hole_based_dict
            year_course_hole_round[pga_year] = course_dict
        return year_course_hole_round

    def __init__(self, mongo_obj, tournament_name_scrape, tournament_name_sg, force_create_sg=False,
                 force_create_tournament=False):
        """Download both data frames, rebuilding whichever is empty or forced.

        Args:
            mongo_obj: MongoDB wrapper passed to ``MongoDownload``; must provide ``getTournamentDB()``.
            tournament_name_scrape: tournament name used for the scraped collections.
            tournament_name_sg: tournament name used when looking up strokes-gained stats.
            force_create_sg: rebuild the raw SG DF even if one was downloaded.
            force_create_tournament: rebuild the tournament DF even if one was downloaded.
        """
        self._logger = MyLogger('dfHandler', 'Analysis/logs/dfHandler.log', logging.INFO).getLogger()
        self._tournament_name = tournament_name_scrape
        self._mongo_obj = mongo_obj
        mongo_download = MongoDownload(self._mongo_obj)
        self._mongo_upload_df = MongoUploadDF(self._mongo_obj.getTournamentDB(), self._tournament_name)
        self._raw_sg_df = pd.DataFrame(mongo_download.getRawSG_DF(tournament_name_scrape))
        if self._raw_sg_df.empty or force_create_sg:
            self._logger.info('Creating New Raw SG DF')
            self._raw_sg_df = pd.DataFrame()
            self._createRawSG_DF(mongo_download.getSGStatsForTournament(tournament_name_scrape, tournament_name_sg),
                                 mongo_download.getPlayerNames())

        self._tournament_df = pd.DataFrame(mongo_download.getTournamentDF(tournament_name_scrape))
        if self._tournament_df.empty or force_create_tournament:
            self._logger.info('Creating New Tournament DF')
            self._tournament_df = pd.DataFrame()
            self._createTournamentDF(mongo_download.consolidateTournamentInfo(tournament_name_scrape),
                                     mongo_download.getPlayerNames())

    def __repr__(self):
        success = True
        if self._tournament_df.empty:
            success = False
        return 'Tournament {} DF successfully created {}\n'.format(self._tournament_name, success)

    def _dfLogic(self, hole_df, year, course, hole_num, round_num):
        """Expand one hole/round frame into one row per shot and derive the shot-level columns.

        The statements depend on each other's intermediate state (shifts, group
        transforms, in-place drops), so their order must be kept.
        """
        hole_df = hole_df.rename(columns={'distance': 'holeDistance'})
        hole_df['holeDistance'] = hole_df.holeDistance.astype(int) * 36
        hole_df['par'] = hole_df.par.astype(int)
        hole_df['stimp'] = hole_df.stimp.astype(np.float16)
        hole_df['roundDate'] = pd.to_datetime(hole_df.roundDate)
        hole_df['pgaYear'] = year
        hole_df['courseID'] = course
        hole_df['holeNum'] = hole_num
        hole_df['roundNum'] = round_num

        # one row per shot: drop players without shots, explode the shot lists, flatten the shot dicts
        hole_df = hole_df[hole_df.playerShots.map(lambda l: len(l)) > 0]
        hole_df = hole_df.explode('playerShots')
        temp_df = pd.json_normalize(hole_df.playerShots)
        hole_df = pd.concat([hole_df.reset_index().drop(columns='playerShots'), temp_df], axis=1)
        del temp_df
        hole_df = hole_df.rename(columns={'distance': 'shotDistance',
                                          'from': 'fromSurface',
                                          'left': 'distanceLeft',
                                          'index': 'playerID'})
        # a shot starts at the hole distance when played from 'OTB', otherwise where the previous row ended
        hole_df['startDistance'] = np.nan
        hole_df.loc[hole_df.fromSurface == 'OTB', 'startDistance'] = hole_df.holeDistance
        hole_df.drop(columns='holeDistance', inplace=True)
        hole_df['startDistance'] = hole_df.startDistance.fillna(value=hole_df.distanceLeft.shift(1))
        player_group = hole_df.groupby(by='playerID', group_keys=False)
        # keep only rows whose shot_id differs from the previous row of the same player
        hole_df = hole_df[player_group.apply(lambda x: x.shot_id != x.shot_id.shift(1))]
        player_group = hole_df.groupby(by='playerID')
        hole_df['playerScore'] = player_group.shot_id.transform('max')
        hole_df['holeAvg'] = player_group.shot_id.max().mean()
        hole_df['shotsRemaining'] = player_group.cumcount(ascending=False)
        hole_df['shotType'] = hole_df.apply(dfHandler.getShotType, axis=1)
        hole_df['isAdvanced'] = (hole_df['shotType'] == 'APP') & \
                                (hole_df.distanceLeft > (self.adv_pct * hole_df.startDistance))
        # temporary value: getEndLocation reads the next row's shot type to detect penalties
        hole_df['toSurface'] = hole_df.shotType.shift(-1)
        hole_df[['toLocation', 'toSurface']] = hole_df.apply(dfHandler.getEndLocation, axis=1,
                                                             result_type='expand')
        hole_df.drop(hole_df[hole_df.shotType == 'Penalty'].index, inplace=True)
        hole_df.loc[hole_df.toLocation == 'Penalty', 'distanceLeft'] = \
            hole_df.startDistance.shift(-1).fillna(0)
        hole_df['isReTee'] = hole_df.apply(
            lambda x: x['startDistance'] == x['distanceLeft'] and x['shotType'] == 'TEE', axis=1)
        hole_df = self._getDistanceBins(hole_df)

        return hole_df

    def _getDistanceBins(self, hole_df):
        """Add the labelled start-distance / distance-left bin columns per shot type."""
        hole_df.loc[hole_df['shotType'] == 'TEE', 'startDistance10ydBin'] = dfHandler.getBinValues(
            hole_df[hole_df['shotType'] == 'TEE'].startDistance, self.max_hole_dist, 10, 'yds')
        hole_df.loc[(hole_df['shotType'] == 'TEE') | (hole_df['shotType'] == 'APP'), 'distanceLeft5ydBin'] = \
            dfHandler.getBinValues(hole_df[(hole_df['shotType'] == 'TEE') | (hole_df['shotType'] == 'APP')].
                                   distanceLeft, self.max_hole_dist, 5, 'yds')
        hole_df.loc[hole_df['shotType'] == 'APP', 'distanceLeft1ydBin'] = \
            dfHandler.getBinValues(hole_df[hole_df['shotType'] == 'APP'].
                                   distanceLeft, self.max_green_dist, 1, 'yd')
        hole_df.loc[hole_df['shotType'] != 'TEE', 'distanceLeft1ftBin'] = dfHandler.getBinValues(
            hole_df[hole_df['shotType'] != 'TEE'].distanceLeft, self.max_green_dist, 1, 'ft')
        return hole_df

    def _createTournamentDF(self, tournament_year_dict, player_names):
        """Build ``self._tournament_df`` from the consolidated tournament info.

        Args:
            tournament_year_dict: per-year tournament info as consumed by ``createHoleLevelDict``.
            player_names: records with at least ``'playerID'``, merged onto the shot rows.

        When no hole/round has any player shots a warning is logged and the frame is
        left as it was.
        """
        year_course_hole_round = dfHandler.createHoleLevelDict(tournament_year_dict)
        # Collect the per-hole frames and concatenate once: DataFrame.append was removed in
        # pandas 2.0 and re-copied the accumulated frame on every call.
        frames = [] if self._tournament_df.empty else [self._tournament_df]
        for year, courses in year_course_hole_round.items():
            for course, holes in courses.items():
                for hole_num, rounds in holes.items():
                    for round_num, round_info in rounds.items():
                        if not round_info['playerShots']:
                            continue
                        self._logger.info(
                            'Creating hole level DF for tournament {}, year {}, course {}, hole {}, round {}\n'
                            .format(self._tournament_name, year, course, hole_num, round_num))
                        hole_df = pd.DataFrame.from_dict(round_info)
                        frames.append(self._dfLogic(hole_df, year, course, hole_num, round_num))
        if not frames:
            self._logger.warning(
                'No player shots found for tournament {}, tournament DF left empty'.format(
                    self._tournament_name))
            return
        self._tournament_df = pd.concat(frames)
        player_name_df = pd.DataFrame(player_names)
        self._tournament_df = pd.merge(self._tournament_df, player_name_df, on='playerID', how='left')
        # the original discarded the result of reset_index(); keep a clean positional index
        self._tournament_df = self._tournament_df.reset_index(drop=True)

    def _createRawSG_DF(self, sg_dict, player_names):
        """Build ``self._raw_sg_df`` from the per-year strokes-gained stats.

        Args:
            sg_dict: mapping of year to a dict whose ``'sgStats'`` entry is a list of records.
            player_names: records with ``'playerID'``, ``'firstName'`` and ``'lastName'``;
                matched to the stats through the abbreviated ``'playerName'``.

        When there are no stats at all a warning is logged and the frame is left as it was.
        """
        # DataFrame.append was removed in pandas 2.0: build one frame per year and concat once
        frames = [] if self._raw_sg_df.empty else [self._raw_sg_df]
        frames.extend(pd.DataFrame(sg_dict[year]['sgStats']) for year in sg_dict)
        if not frames:
            self._logger.warning(
                'No SG stats found for tournament {}, raw SG DF left empty'.format(
                    self._tournament_name))
            return
        player_name_df = pd.DataFrame(player_names)
        # noinspection PyTypeChecker
        player_name_df['playerName'] = player_name_df.apply(dfHandler.getNameAbbr, axis=1)
        self._raw_sg_df = pd.concat(frames, ignore_index=True)
        numeric_cols = ['sgPUTT', 'sgARG', 'sgAPP', 'sgOTT', 'sgT2G', 'sgTOT']
        self._raw_sg_df[numeric_cols] = self._raw_sg_df[numeric_cols].apply(pd.to_numeric, axis=1)
        self._raw_sg_df = pd.merge(self._raw_sg_df, player_name_df, on='playerName', how='left')
        self._raw_sg_df.drop(columns=['playerName', 'tournamentName'], inplace=True)

    def getTournamentDF(self):
        """Return the shot-level tournament DF."""
        return self._tournament_df

    def getRawSG_DF(self):
        """Return the raw strokes-gained DF."""
        return self._raw_sg_df

    def uploadTournamentDF(self):
        """Upload the tournament DF as one document per course, year and round (without 'shottext')."""
        for course, course_tournament_df in self._tournament_df.groupby('courseID'):
            for year, year_tournament_df in course_tournament_df.groupby('pgaYear'):
                for round_num, round_tournament_df in year_tournament_df.groupby('roundNum'):
                    df_dict = round_tournament_df.drop(columns='shottext').to_dict('records')
                    upload_dict = {'tournamentName': self._tournament_name, 'courseID': course,
                                   'pgaYear': year, 'roundNum': round_num, 'df': df_dict}
                    self._mongo_upload_df.uploadTournamentDF(upload_dict)

    def uploadRawSG_DF(self):
        """Upload the raw strokes-gained DF as one document per year."""
        for year, year_tournament_df in self._raw_sg_df.groupby('pgaYear'):
            df_dict = year_tournament_df.to_dict('records')
            upload_dict = {'tournamentName': self._tournament_name,
                           'pgaYear': year, 'df': df_dict}
            self._mongo_upload_df.uploadRawSG_DF(upload_dict)
Esempio n. 16
0
import logging
import pandas as pd

from Analysis.dfHandler import dfHandler
from Analysis.sgHandler import sgHandler
from Logging.MyLogger import MyLogger
from MongoDB.MongoInitialization import MongoInitialization

# Script entry point: build the tournament and raw SG frames for one
# tournament, then compute the grouped SG statistics from them.
if __name__ == '__main__':
    # Logger writing to Analysis/logs/hole_df.log; it is not referenced again
    # in this script.
    analysis_logger = MyLogger('Analysis', 'Analysis/logs/hole_df.log',
                               logging.INFO).getLogger()
    # 'df' is the called_from tag handed to MongoInitialization.
    mongo_init = MongoInitialization('df')
    # NOTE(review): the meaning of the two trailing False flags is defined by
    # dfHandler.__init__, which is not visible here - confirm before changing.
    df_handler = dfHandler(mongo_init, 'waste-management-phoenix-open',
                           'Waste Management Phoenix Open', False, False)
    tournament_df = df_handler.getTournamentDF()
    sg_df = df_handler.getRawSG_DF()
    # Uploads are currently disabled.
    # df_handler.uploadTournamentDF()
    # df_handler.uploadRawSG_DF()

    sg_handler = sgHandler(mongo_init, tournament_df, sg_df)
    # True turns on the plots drawn for each grouping.
    sg_handler.applySGLogicToGroups(True)
    # sg_handler.getSGTee(False)

    # Results keyed by grouping name; kept in a variable for inspection only.
    sg_df_dict = sg_handler.getSG_DF_Dict()

    # combine = pd.merge(sg_df_dict['Tee']['RawSGMatch'], sg_df, how='left', on=['playerID', 'pgaYear'])
Esempio n. 17
0
class sgHandler:
    """Derive strokes-gained (SG) statistics from a tournament shot DataFrame.

    Results are collected in a dictionary (see ``getSG_DF_Dict``) keyed by
    grouping name, or by ``'Tee'`` / ``'Total'`` for the tee-shot and summed
    variants. The static helpers add derived columns to the DataFrame they are
    given (in place) and return that same DataFrame.
    """

    # (label, group-by columns) pairs. The label prefixes derived column names
    # and is the key under which each grouping's result is stored.
    grouping_list = [('Course', ['courseID']), ('Year', ['pgaYear']),
                     ('Round', ['roundNum']), ('Hole', ['holeNum']),
                     ('YearRound', ['pgaYear', 'roundNum']),
                     ('YearHole', ['pgaYear', 'holeNum']),
                     ('YearRoundHole', ['pgaYear', 'roundNum', 'holeNum'])]
    pd.set_option('display.max_columns', None)

    @staticmethod
    def lowessExpectedShotsByDistance(distance_shots):
        """Return LOWESS-smoothed shots for a Series of ``'distance/shots'`` strings.

        Args:
            distance_shots: Series whose values look like ``'<distance>/<shots>'``.

        Returns:
            A Series of fitted values, in input order, carrying the input's
            index so it stays aligned when used inside ``groupby.transform``
            (a fresh RangeIndex would be re-aligned to NaN by newer pandas).
        """
        new_df = distance_shots.str.split('/', expand=True).astype(float)
        endog = new_df[[1]].values.ravel()
        exog = new_df[[0]].values.ravel()
        fitted = lowess(endog=endog, exog=exog, return_sorted=False)
        return pd.Series(fitted, index=distance_shots.index)

    @staticmethod
    def lmExpectedShotsByDistance(distance_shots):
        """Return linear-regression predictions of shots from distance.

        Args:
            distance_shots: Series whose values look like ``'<distance>/<shots>'``.

        Returns:
            A flat array with one prediction per input row, in input order.
        """
        # Convert explicitly so the regression never depends on implicit
        # string-to-number coercion.
        new_df = distance_shots.str.split('/', expand=True).astype(float)
        lm = LinearRegression()
        lm.fit(new_df[[0]], new_df[[1]])
        return lm.predict(new_df[[0]]).flatten()

    @staticmethod
    def lmExpectedRemainingShotsGroup(df, group, name):
        """Add ``lmExpectedShotsRemaining<name>``: a per-group linear fit of
        the ``'distance/shots'`` column. Mutates and returns ``df``."""
        df['lmExpectedShotsRemaining{}'.format(name)] = df.groupby(group)['distance/shots']. \
            transform(sgHandler.lmExpectedShotsByDistance)
        return df

    @staticmethod
    def lowessExpectedRemainingShotsColumn(df, group, name):
        """Add ``lowessExpectedShotsRemaining<name>``: a per-group LOWESS fit
        of the ``'distance/shots'`` column. Mutates and returns ``df``."""
        df['lowessExpectedShotsRemaining{}'.format(name)] = df.groupby(group)['distance/shots']. \
            transform(sgHandler.lowessExpectedShotsByDistance)
        return df

    @staticmethod
    def fiveYdBinExpectedRemainingShotsColumn(df, group, name):
        """Add ``5ydBinAvgExpectedShotsRemaining<name>``: the mean of
        ``shotsRemaining`` within ``group`` plus ``distanceLeft5ydBin``.

        ``group`` must be a list (it is concatenated with the bin column).
        Mutates and returns ``df``.
        """
        bin_group = group + ['distanceLeft5ydBin']
        df['5ydBinAvgExpectedShotsRemaining{}'.format(name)] = df.groupby(
            bin_group)['shotsRemaining'].transform('mean')
        return df

    @staticmethod
    def oneFtBinExpectedRemainingShotsColumn(df, group, name):
        """Add ``1ftBinAvgExpectedShotsRemaining<name>``: the mean of
        ``shotsRemaining`` within ``group`` plus ``distanceLeft1ftBin``.

        ``group`` must be a list (it is concatenated with the bin column).
        Mutates and returns ``df``.
        """
        bin_group = group + ['distanceLeft1ftBin']
        df['1ftBinAvgExpectedShotsRemaining{}'.format(name)] = df.groupby(
            bin_group)['shotsRemaining'].transform('mean')
        return df

    @staticmethod
    def getGroupAveragesAndSGOverAvg(df, name, group, column_to_avg, sg_type,
                                     column_to_subtract):
        """Add a group average column and the SG column derived from it.

        Args:
            df: Frame to extend (mutated in place).
            name: Label prefixed to the average column name.
            group: Column(s) to group by.
            column_to_avg: Column whose per-group mean is computed.
            sg_type: Label inserted into the SG column name.
            column_to_subtract: Column subtracted from the group mean.

        Returns:
            ``df`` with ``<name><column_to_avg>`` (the group mean) and
            ``SG<sg_type>Over_<column_to_avg>`` (group mean minus
            ``column_to_subtract``) added.
        """
        avg_col = '{}{}'.format(name, column_to_avg)
        # Select the column before transforming: averaging every column first
        # is wasted work and fails on non-numeric columns in pandas >= 2.
        df[avg_col] = df.groupby(group)[column_to_avg].transform('mean')
        df['SG{}Over_{}'.format(sg_type, column_to_avg)] = df[avg_col] - df[column_to_subtract]
        return df

    @staticmethod
    def getGroupSTDofSGOverAvg(df, name, group_by_cols, column_to_avg,
                               sg_type):
        """Add the per-group standard deviation of an SG column and each row's
        absolute distance from zero expressed in those standard deviations.

        Expects ``SG<sg_type>Over_<column_to_avg>`` to exist already (see
        ``getGroupAveragesAndSGOverAvg``). Mutates and returns ``df``.
        """
        df['{}STDofSG{}'.format(name, sg_type)] = \
            df.groupby(group_by_cols)['SG{}Over_{}'.format(sg_type, column_to_avg)].transform('std')
        df['NumSTDFromSG{}Over_{}'.format(sg_type, column_to_avg)] = \
            abs(df['SG{}Over_{}'.format(sg_type, column_to_avg)] / df['{}STDofSG{}'.format(name, sg_type)])
        return df

    # @staticmethod
    # def createSGTeeColumns(df, name):
    #     df['SGTeeOverLM{}'.format(name)] = df['lmExpectedShotsRemaining{}'.format(name)] - df['shotsRemaining']
    #     df['SGTeeOverLowess{}'.format(name)] = df['lowessExpectedShotsRemaining{}'.format(name)] - df[
    #     'shotsRemaining']
    #     df['SGTeeOverBinAvg{}'.format(name)] = df['5ydBinAvgExpectedShotsRemaining{}'.format(name)] - \
    #                                            df['shotsRemaining']
    #     return df

    # @staticmethod
    # def visualizeDistanceLeft(df, title):
    #     _ = sns.lmplot(data=df, x='distanceLeft', y='shotsRemaining', hue='toSurface')
    #     plt.title(title + ' LM')
    #     plt.show()
    #     _ = sns.lmplot(data=df, x='distanceLeft', y='shotsRemaining', hue='toSurface', lowess=True)
    #     plt.title(title + ' Lowess')
    #     plt.show()
    #     distance_grouped = df.groupby(['distanceLeft5ydBin', 'toSurface']).mean().reset_index()
    #     _ = sns.scatterplot(data=distance_grouped, x='distanceLeft', y='shotsRemaining', hue='toSurface')
    #     plt.title(title + ' 5ydBin')
    #     plt.show()

    # @staticmethod
    # def visualizeStartDistance(df, group, title):
    #     _ = sns.lmplot(data=df, x='startDistance', y='shotsTaken', hue=group)
    #     plt.title(title + ' LM')
    #     plt.show()
    #     _ = sns.lmplot(data=df, x='startDistance', y='shotsTaken', hue=group, lowess=True)
    #     plt.title(title + ' Lowess')
    #     plt.show()

    @staticmethod
    def getStartingExpectedShots(tee_shots_df, visualize):
        """Add the expected number of shots from the starting distance.

        Adds ``shotsTaken`` (``shotsRemaining + 1``), the helper column
        ``distance/shots`` and ``lmExpectedShotsStarting`` (linear fit of
        shots taken on start distance over all rows). Optionally shows a
        seaborn LOWESS plot. Mutates and returns ``tee_shots_df``.
        """
        tee_shots_df['shotsTaken'] = tee_shots_df['shotsRemaining'] + 1
        tee_shots_df['distance/shots'] = tee_shots_df.apply(
            lambda x: str(x['startDistance']) + '/' + str(x['shotsTaken']),
            axis=1)
        # Call the fit directly: Series.transform first tries the function
        # element by element and, in recent pandas, rejects results that are
        # not an index-aligned Series (this helper returns a plain array).
        tee_shots_df['lmExpectedShotsStarting'] = sgHandler.lmExpectedShotsByDistance(
            tee_shots_df['distance/shots'])
        if visualize:
            _ = sns.lmplot(data=tee_shots_df,
                           x='startDistance',
                           y='shotsTaken',
                           lowess=True)
            plt.title('Expected Shots From Start Distance Lowess Model')
            plt.show()
        return tee_shots_df

    @staticmethod
    def getRemainingExpectedShots(tee_shots_df, visualize=False):
        """Add the expected number of shots remaining after the tee shot.

        Overwrites the helper column ``distance/shots`` and adds
        ``lmExpectedShotsRemainingBySurface``: a linear fit per ``toSurface``
        computed on rows where ``isReTee`` is false. Re-tee rows are left as
        NaN by the index-aligned assignment. Optionally shows a seaborn LOWESS
        plot. Mutates and returns ``tee_shots_df``.
        """
        tee_shots_df['distance/shots'] = tee_shots_df.apply(
            lambda x: str(x['distanceLeft']) + '/' + str(x['shotsRemaining']),
            axis=1)

        no_retee_df = tee_shots_df[~tee_shots_df['isReTee']].copy()
        tee_shots_df['lmExpectedShotsRemainingBySurface'] = no_retee_df.groupby('toSurface')['distance/shots']. \
            transform(sgHandler.lmExpectedShotsByDistance)
        if visualize:
            _ = sns.lmplot(data=tee_shots_df,
                           x='distanceLeft',
                           y='shotsRemaining',
                           hue='toSurface',
                           lowess=True)
            plt.title(
                'Expected Shots For Distance Left Grouped By Surface Lowess Model'
            )
            plt.show()
        return tee_shots_df

    # @staticmethod
    # def getSGMeasure(df, sg_measure, starting_col, shots_remain_col, add_stroke):
    #     df['SG{}Over{}'.format(sg_measure, starting_col)] = df[starting_col] - df[shots_remain_col] - add_stroke
    #     return df

    # @staticmethod
    # def getSGReTee(df, sg_measure, starting_col):
    #     df['SG{}Over{}'.format(sg_measure, starting_col)] = -1
    #     return df

    def __init__(self, mongo_obj, tournament_df, raw_sg_df, distances_df=None):
        """Store the input frames and create the logger.

        Args:
            mongo_obj: Stored as-is; not used by the methods in this class.
            tournament_df: Shot-level tournament DataFrame.
            raw_sg_df: Raw strokes-gained DataFrame (stored as-is).
            distances_df: Optional extra DataFrame (stored as-is).
        """
        self._sg_df_dict = {}
        self._logger = MyLogger('sgHandler', 'Analysis/logs/sgHandler.log',
                                logging.INFO).getLogger()
        self._tournament_df = tournament_df
        self._raw_sg_df = raw_sg_df
        self._mongo_obj = mongo_obj
        self._distances_df = distances_df

    def __repr__(self):
        return 'SG DF Dictionary has keys {}\n'.format(self._sg_df_dict.keys())

    def applySGLogicToGroups(self, visualize=False):
        """Compute the overall SG frame for every entry of ``grouping_list``.

        Each result is stored under ``self._sg_df_dict[<label>]['Total']``.

        Args:
            visualize: When true, histograms are shown for each grouping.
        """
        for name, group in sgHandler.grouping_list:
            self._logger.info('Creating SG Stats for group {}'.format(name))
            # Computed exactly once per grouping; the previous extra call for
            # labels containing 'Hole' produced the same result twice.
            self._sg_df_dict[name] = {
                'Total': self.getSGOverall(name, group, visualize)
            }

    def getSGOverall(self, name, group_by_cols, visualize):
        """Build the overall SG frame for one grouping.

        Takes the rows with ``shot_id == 1`` (one per player-hole), averages
        ``holeAvg`` within ``group_by_cols`` and derives SG as that average
        minus ``playerScore``, plus the per-group standard deviation columns.

        Args:
            name: Grouping label used in derived column names.
            group_by_cols: Columns to group by.
            visualize: When true, show histograms of the SG columns.

        Returns:
            A new DataFrame; ``self._tournament_df`` is not modified.
        """
        self._logger.info('Getting SG Overall {} Stats'.format(name))
        relevant_cols = [
            'pgaYear', 'courseID', 'holeNum', 'roundNum', 'holeAvg',
            'playerScore'
        ]

        sg_tot_df = self._tournament_df.loc[self._tournament_df['shot_id'] ==
                                            1, relevant_cols].copy()
        # sg_tot_df['SGTotOverHoleAvg'] = sg_tot_df['holeAvg'] - sg_tot_df['playerScore']
        self._logger.info(
            'Getting SG Tot For Grouping by {}'.format(group_by_cols))
        sg_tot_df = sgHandler.getGroupAveragesAndSGOverAvg(
            sg_tot_df, name, group_by_cols, 'holeAvg', 'Tot', 'playerScore')
        sg_tot_df = sgHandler.getGroupSTDofSGOverAvg(sg_tot_df, name,
                                                     group_by_cols, 'holeAvg',
                                                     'Tot')
        if visualize:
            # One histogram per derived SG column, coloured by hole.
            for sg_column in ('SGTotOver_holeAvg',
                              'NumSTDFromSGTotOver_holeAvg'):
                _ = sns.histplot(data=sg_tot_df,
                                 x=sg_column,
                                 kde=True,
                                 hue='holeNum',
                                 binwidth=.25,
                                 kde_kws={'bw_adjust': 4})
                plt.show()

        return sg_tot_df

    def sumSGTotalDFs(self, sg_tot_df):
        """Store per-round sums and per-round averages of the SG columns.

        ``sg_tot_df`` must contain ``playerID``, ``pgaYear`` and ``roundNum``.
        Results go to ``self._sg_df_dict['Total']`` under ``'SumByRound'``
        (sum per player, year and round) and ``'RawSGMatch'`` (SG column sums
        per player and year divided by the number of distinct rounds).
        """
        # Nothing else creates the 'Total' entry, so make sure it exists
        # instead of failing with KeyError.
        total_results = self._sg_df_dict.setdefault('Total', {})
        total_results['SumByRound'] = sg_tot_df.groupby(['playerID', 'pgaYear', 'roundNum']).sum(). \
            reset_index()
        sg_cols = [col for col in sg_tot_df if 'SG' in col]
        total_results['RawSGMatch'] = sg_tot_df.groupby(['playerID', 'pgaYear']). \
            apply(lambda x: x[sg_cols].sum() / x['roundNum'].nunique()).reset_index()

    def getSGTee(self, visualize=False):
        """Compute tee-shot SG and store it under ``self._sg_df_dict['Tee']``.

        SG is the expected shots from the start distance minus the expected
        shots remaining after the tee shot, minus one for the shot itself.
        Rows with no value (e.g. re-tees, which have no remaining-shots
        expectation) are set to -2.

        Stores ``'RoundBased'`` (row level), ``'SumByRound'`` and
        ``'RawSGMatch'`` frames.
        """
        relevant_cols = [
            'playerID', 'firstName', 'lastName', 'pgaYear', 'courseID',
            'holeNum', 'roundNum', 'par', 'startDistance',
            'startDistance10ydBin', 'distanceLeft', 'distanceLeft5ydBin',
            'distanceLeft1ydBin', 'distanceLeft1ftBin', 'toSurface',
            'shotsRemaining', 'isReTee'
        ]
        tee_shots_df = self._tournament_df[(
            self._tournament_df['shotType'] == 'TEE')][relevant_cols].copy()
        tee_shots_df = sgHandler.getStartingExpectedShots(
            tee_shots_df, visualize)
        tee_shots_df = sgHandler.getRemainingExpectedShots(
            tee_shots_df, visualize)
        tee_shots_df.drop(columns='distance/shots', inplace=True)
        # getStartingExpectedShots writes 'lmExpectedShotsStarting'; the
        # previously referenced '...StartingGrouped' column is never created.
        # NOTE: the output column keeps its historical 'ByLowess' name even
        # though both inputs are linear-model fits.
        tee_shots_df['SGTeeByLowess'] = tee_shots_df['lmExpectedShotsStarting'] - \
            tee_shots_df['lmExpectedShotsRemainingBySurface'] - 1
        # Assign the result back: fillna(inplace=True) on a column selection
        # does not update the frame under pandas copy-on-write.
        tee_shots_df['SGTeeByLowess'] = tee_shots_df['SGTeeByLowess'].fillna(-2)
        if visualize:
            _ = sns.histplot(data=tee_shots_df,
                             x='SGTeeByLowess',
                             kde=True,
                             hue='holeNum',
                             binwidth=.25,
                             kde_kws={'bw_adjust': 4})
            plt.show()
        self._sg_df_dict['Tee'] = {}
        tee_shots_df['NumSTDFromSGTeeByLowess'] = abs(
            tee_shots_df['SGTeeByLowess'] / tee_shots_df.groupby(
                ['pgaYear', 'roundNum'])['SGTeeByLowess'].transform('std'))
        tee_shots_df['AvgSGTeeByLowess'] = tee_shots_df.groupby(['pgaYear', 'roundNum'])['SGTeeByLowess']. \
            transform('mean')
        self._sg_df_dict['Tee']['RoundBased'] = tee_shots_df
        self._sg_df_dict['Tee']['SumByRound'] = tee_shots_df.groupby(['playerID', 'pgaYear', 'roundNum']).sum(). \
            reset_index()
        sg_cols = [col for col in tee_shots_df if 'SG' in col]
        self._sg_df_dict['Tee']['RawSGMatch'] = tee_shots_df.groupby(['playerID', 'pgaYear']). \
            apply(lambda x: x[sg_cols].sum() / x['roundNum'].nunique()).reset_index()

    def getSG_DF_Dict(self):
        """Return the dictionary of computed SG frames (not a copy)."""
        return self._sg_df_dict
Esempio n. 18
0
class SGScraper:
    """Scrapes the datagolf.com historic event data page and collects, for each
     tournament and requested year, one dictionary of strokes-gained (SG) stats per player"""
    def __init__(self):
        """Set up the target URL, result storage, logger and web driver."""
        # page that lists historic SG data per tournament and year
        self._sg_url = 'https://datagolf.com/historic-event-data'

        # one dictionary per scraped player row is appended here
        self._tournament_sg_col = []

        # year option elements of the currently loaded tournament page
        self.year_options = None

        # path of the log file handed to the logger below
        self._file_handler = 'tournaments/SG/logs/sg_scape.log'

        # logger named after the class; 'w' is passed through to MyLogger
        self._logger = MyLogger(type(self).__name__, self._file_handler,
                                logging.INFO, 'w').getLogger()

        # project web driver wrapper, sharing this scraper's logger
        self.web_driver = WebDriver(self._logger)

    def __repr__(self):
        """Print Scraper Class with scraped status"""
        return self.__class__.__name__

    def _sgStatsToDict(self, year_name, tournament_name, sg_stats):
        self._logger.info('Getting SG stats for {} during {} {}'.format(
            sg_stats[0], year_name, tournament_name))
        self._tournament_sg_col.append({
            'pgaYear': year_name,
            'tournamentName': tournament_name,
            'playerName': sg_stats[0],
            'sgPUTT': sg_stats[1],
            'sgARG': sg_stats[2],
            'sgAPP': sg_stats[3],
            'sgOTT': sg_stats[4],
            'sgT2G': sg_stats[5],
            'sgTOT': sg_stats[6]
        })

    def runScrape(self, years_to_scrape):
        """Scrape SG stats for every tournament in the dropdown.

        Loads the SG page, selects each tournament in turn and, for every
        year option whose text is in ``years_to_scrape``, reads the data rows
        of the stats table into the collection returned by
        ``getSGCollection``.

        Args:
            years_to_scrape: Container of year labels, compared against the
                text of each year option element.

        Returns:
            True when the whole scrape completed, False if any exception was
            raised (the error is logged; rows collected so far are kept).
        """
        self._logger.info('Go to SG Scrape url {}\n'.format(self._sg_url))
        self.web_driver.goToURL(self._sg_url)
        driver = self.web_driver.getDriver()

        try:
            # find_element(By.…) replaces the find_element_by_* helpers, which
            # were removed in Selenium 4.3; this form also works on Selenium 3.
            tournament_selector = Select(driver.find_element(By.ID, 'dropdown'))
            num_options = len(tournament_selector.options)
            for idx in range(num_options):
                tournament_selector.select_by_index(idx)
                tournament_name = tournament_selector.first_selected_option.text
                # wait until the subtitle shows '<digits> <tournament name>'
                _ = self.web_driver.webDriverWait(
                    driver,
                    wait_for_text_to_match((By.CLASS_NAME, 'subtitle'),
                                           r'\d+ {}'.format(tournament_name)),
                    'Error waiting for tournament to load\n{}')
                self.year_options = driver.find_elements(
                    By.CLASS_NAME, 'yearoptions')
                for year in reversed(self.year_options):
                    year_name = year.text
                    if year_name not in years_to_scrape:
                        continue

                    self._logger.info('\nRunning SG Scrape for {} {}'.format(
                        year_name, tournament_name))
                    year.click()
                    sg_table = driver.find_element(By.CLASS_NAME, 'table')
                    data_rows = sg_table.find_elements(By.CLASS_NAME,
                                                       'datarow')
                    for row in data_rows:
                        sg_stats = row.text.split('\n')
                        # field 3 is the value stored as sgPUTT; '--' means
                        # the table holds no SG data for this year
                        if sg_stats[3] == '--':
                            self._logger.info('No SG stats for {} {}'.format(
                                year_name, tournament_name))
                            break
                        # fields 1 and 3-8: player name followed by the six
                        # SG values, in the order _sgStatsToDict expects
                        self._sgStatsToDict(
                            year_name, tournament_name,
                            [sg_stats[i] for i in (1, 3, 4, 5, 6, 7, 8)])
            return True
        except Exception as e:
            self._logger.error('Failed running SG scrape due to {}'.format(e),
                               exc_info=True)
            return False

    def getSGCollection(self):
        """Return the list of per-player SG stat dictionaries scraped so far (not a copy)."""
        return self._tournament_sg_col