Ejemplo n.º 1
0
    def __init__(self, **kwargs):
        '''
        Stores optional keyword settings on the instance

        Args:
            **kwargs: optional settings
                game_keys (list): stored as self.game_keys; a built-in list
                    of five keys is used when omitted
                team_names: stored as self.team_names; a new NBATeamNames()
                    is created when omitted
        '''

        # attach a do-nothing handler to this module's logger
        null_handler = logging.NullHandler()
        logging.getLogger(__name__).addHandler(null_handler)

        default_keys = ['game_total', 'away_team', 'away_total', 'home_team', 'home_total']
        self.game_keys = kwargs.get('game_keys', default_keys)

        # the default lookup object is only constructed when none was supplied
        self.team_names = kwargs['team_names'] if 'team_names' in kwargs else NBATeamNames()
Ejemplo n.º 2
0
    def __init__(self, nbadotcom_games=None, omit=None):
        '''
        Args:
            nbadotcom_games (dict): key-value pair of gamecode and nbacom_game_id;
                a new empty dict is used when omitted
            omit (list): fields to omit from nbastuffer files; a default list
                is used when this is None or empty
        '''

        # Library-style logging setup: a NullHandler on this module's logger.
        # The former logging.addHandler(...) call was removed - the logging
        # module has no such function, so it raised AttributeError every time.
        logging.getLogger(__name__).addHandler(logging.NullHandler())

        self.names = NBATeamNames()

        # fresh dict per instance; a mutable default would be shared by all instances
        self.nbadotcom_games = {} if nbadotcom_games is None else nbadotcom_games

        if omit:
            self.omit = omit
        else:
            self.omit = ['teams', 'f', 'moneyline', 'moneyline_', 'movements', 'opening_odds', 'to to']
Ejemplo n.º 3
0
class NBATeams_test(unittest.TestCase):
    '''
    Checks the NBATeamNames lookup methods, using Chicago as the sample team
    '''

    def setUp(self):
        # lookup object under test
        self.teams = NBATeamNames()

    def test_city_to_code(self):
        code = self.teams.city_to_code('Chicago')
        self.assertEqual(code, 'CHI')

    def test_long_to_code(self):
        code = self.teams.long_to_code('Chicago Bulls')
        self.assertEqual(code, 'CHI')

    def test_id_to_code(self):
        code = self.teams.id_to_code('1610612741')
        self.assertEqual(code, 'CHI')

    def test_code_to_id(self):
        team_id = self.teams.code_to_id('CHI')
        self.assertEqual(team_id, '1610612741')

    def test_code_to_long(self):
        long_name = self.teams.code_to_long('CHI')
        self.assertEqual(long_name, 'Chicago Bulls')
Ejemplo n.º 4
0
    def __init__(self, **kwargs):
        '''
        Stores optional keyword settings on the instance

        Args:
            **kwargs: optional settings
                game_keys (list): stored as self.game_keys; a built-in list
                    of five keys is used when omitted
                team_names: stored as self.team_names; a new NBATeamNames()
                    is created when omitted
        '''

        # logger named after this module
        self.logger = logging.getLogger(__name__)

        default_keys = ['game_total', 'away_team', 'away_total', 'home_team', 'home_total']
        self.game_keys = kwargs.get('game_keys', default_keys)

        # the default lookup object is only constructed when none was supplied
        self.team_names = kwargs['team_names'] if 'team_names' in kwargs else NBATeamNames()
Ejemplo n.º 5
0
class PinnacleNBAParser(object):
    '''
    Takes xml from scraper, returns list of game dictionaries

    Usage:
        p = PinnacleNBAParser()

        with open('/home/sansbacon/1.xml', 'r') as infile:
            xml = infile.read()

            - OR -

        s = PinnacleNBAScraper()
        xml = s.odds()
        games = p.odds(xml)
        p.to_csv(games, date)
        p.to_tsv(games, date)
        p.to_fw(games, date)

    '''

    def __init__(self, **kwargs):

        logging.getLogger(__name__).addHandler(logging.NullHandler())

        if 'game_keys' in kwargs:
            self.game_keys = kwargs['game_keys']
        else:
            self.game_keys = ['game_total', 'away_team', 'away_total', 'home_team', 'home_total']

        if 'team_names' in kwargs:
            self.team_names = kwargs['team_names']
        else:
            self.team_names = NBATeamNames()

    def _convert_from_utc(self, datestr):
        '''
        Parses a timestamp string as UTC and converts it to the zone given by tz.tzlocal()

        Based on
        http://stackoverflow.com/questions/4770297/python-convert-utc-datetime-string-to-local-datetime

        Args:
            datestr (str): timestamp in '%Y-%m-%d %H:%M' format

        Returns:
            datetime: timezone-aware datetime in the tz.tzlocal() zone
        '''
        utc_zone = tz.tzutc()
        local_zone = tz.tzlocal()
        naive = datetime.strptime(datestr, '%Y-%m-%d %H:%M')
        return naive.replace(tzinfo=utc_zone).astimezone(local_zone)

    def _game_code(self, game, date_local):
        return '{0}/{1}{2}'.format(datetime.strftime(date_local, '%Y%m%d'), game['away_team'], game['home_team'])

    def _implied_total(self, game_total, spread):
        '''
        Takes game total and spread and returns implied total based on those values
        '''
        return (game_total/float(2)) - (spread/float(2))

    def _multikeysort(self, items, columns):
        '''
        from https://wiki.python.org/moin/SortingListsOfDictionaries
        '''
        comparers = [ ((itemgetter(col[1:].strip()), -1) if col.startswith('-') else (itemgetter(col.strip()), 1)) for col in columns]

        def comparer(left, right):
            for fn, mult in comparers:
                result = cmp(fn(left), fn(right))
                if result:
                    return mult * result
            else:
                return 0

        return sorted(items, cmp=comparer)

    def _parse_event(self, event):
        '''
        Handles the heavy lifting parsing each event node
        '''

        game = {}

        # game information
        date_gmt = event.find('event_datetimegmt').get_text()
        if date_gmt:
            game['date_gmt'] = date_gmt
            date_local = self._convert_from_utc(date_gmt)
            game['date_local'] = datetime.strftime(date_local, '%Y-%m-%d')
            game['display_date'] = datetime.strftime(date_local, '%m-%d %H:%M')
            game['start'] = datetime.strftime(date_local, '%H:%M')

        gamenumber = event.find('gamenumber').get_text()

        if gamenumber:
            game['gamenumber'] = gamenumber

        # teams, use long_to_short to get 3 letter team codes
        away, home = event.findAll('participant')
        if away and home:
            game['away_team'] = away.find('participant_name').get_text()
            short_name = self.team_names.long_to_short(game['away_team'])

            if short_name:
                game['away_team'] = short_name

            game['home_team'] = home.find('participant_name').get_text()
            game['home_team_number'] = home.find('rotnum').get_text()
            short_name = self.team_names.long_to_short(game['home_team'])

            if short_name:
                game['home_team'] = short_name

            game['nbacom_gamecode'] = self._game_code(game, date_local)

        # o/u and point spread are found in the period node
        # contains spread and total nodes
        period = event.find('period')

        if period:
            logging.debug('found period for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
            spread = period.find('spread')

            if spread:
                logging.debug('found spread for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
                game['spread_away'] = spread.find('spread_visiting').get_text()
                game['spread_home'] = spread.find('spread_home').get_text()
                logging.debug('spread away, home: {0}, {1}' \
                              .format(game.get('spread_away', '<missing>'),game.get('spread_home', '<missing>')))

                # totals, including calculated implied totals
                total = period.find('total')

                if total:
                    logging.debug('found total for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
                    game_total = total.find('total_points').get_text()

                    if game_total:
                        logging.debug('found game_total for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
                        game['game_total'] = game_total
                        game['home_total'] = self._implied_total(float(game_total), float(game['spread_home']))
                        game['away_total'] = self._implied_total(float(game_total), float(game['spread_away']))
                        logging.debug('away, home: {0}, {1}'.format \
                           (game.get('away_total', '<missing>'), game.get('home_total', '<missing>')))

        else:
            logging.debug('did not find period')

        return game

    def _sort_games(self, games, games_no_totals):
        '''
        Splits games into those with totals and no totals
        Sorts list of dictionaries descending on game_total, then adds games with no totals to list
        '''

        games = self._multikeysort(games, ['-game_total'])
        #logging.debug('games with totals: {0}'.format(pprint.pformat(games)))

        if games_no_totals:
            #logging.debug('games no totals: {0}'.format(pprint.pformat(games_no_totals)))
            games += games_no_totals

        #logging.debug('all games: {0}'.format(pprint.pformat(games)))
        return games

    def odds(self, xml, game_date_cst=None):
        '''
        Takes xml in, returns sorted and filtered list of game dictionaries

        Args:
            xml (str): odds feed markup containing event nodes
            game_date_cst (str): optional date in YYYY-MM-DD format; when given,
                only games whose 'date_local' value equals it are returned.
                Defaults to None (no date filter).

        Returns:
            list: games with totals first (as ordered by _sort_games), then
                games without totals
        '''

        # TODO:
            # need to filter based on <league>NBA</league> and <league>Live NBA</league>

        games = []
        games_no_totals = []

        # get the list of events
        soup = BeautifulSoup(xml)
        events = soup.findAll('event')
        logging.debug('there are {0} events'.format(len(events)))

        # last item is unwanted - All NFL Games Will Have Second Half Wagering
        for event in events[:-1]:
            game = self._parse_event(event)
            logging.debug(pprint.pformat(game))

            # filter if specify game date
            if game_date_cst and game.get('date_local', None) != game_date_cst:
                continue

            # split on whether a total was found
            if game.get('game_total', None):
                games.append(game)
            else:
                games_no_totals.append(game)

        # sort results and then return
        return self._sort_games(games, games_no_totals)

    def to_csv (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs csv
        '''

        lines = []
        headers = ['date'] + self.game_keys
        lines.append(', '.join(headers))

        for game in games:
            values = [datestr] + [str(game.get(gk, None)) for gk in self.game_keys]
            lines.append(', '.join(values))

        return '\n'.join(lines)

    def to_fw (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs fixed-width table
        '''

        lines = []
        headers = ['date'] + self.game_keys
        formatstr = ' '.join(['{:12s}'] * len(headers))       
        lines.append(formatstr.format(*headers))

        for game in games:
            values = [datestr] + [str(game.get(gk, None)) for gk in self.game_keys]
            formatstr = ' '.join(['{:12s}'] * len(values))       
            lines.append(formatstr.format(*values))

        return '\n'.join(lines)

    def to_tsv (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs csv
        '''

        lines = []
        headers = ['date'] + self.game_keys
        lines.append('\t'.join(headers))

        for game in games:
            values = [datestr] + [str(game.get(gk, None)) for gk in self.game_keys]
            lines.append('\t'.join(values))

        return '\n'.join(lines)
Ejemplo n.º 6
0
 def setUp(self):
     '''Creates the NBATeamNames instance stored on self.teams'''
     self.teams = NBATeamNames()
Ejemplo n.º 7
0
class NBAStufferParser(object):

    '''
    Parses xls or csv file of NBA game info from nbastuffer.com into game dictionaries

    Example:
        p = NBAStufferParser()
        fn='stuffer.xlsx'
        wb = xlrd.open_workbook(fn)
        sheet = wb.sheet_by_index(0)
        if sheet:
            games = p.xlsx_game_pairs(sheet, p.xlsx_headers(sheet))
    
    '''

    def __init__(self, nbadotcom_games=None, omit=None):
        '''
        Args:
            nbadotcom_games (dict): key-value pair of gamecode and nbacom_game_id;
                a new empty dict is used when omitted
            omit (list): fields to omit from nbastuffer files; a default list
                is used when this is None or empty
        '''

        # Library-style logging setup: a NullHandler on this module's logger.
        # The former logging.addHandler(...) call was removed - the logging
        # module has no such function, so it raised AttributeError every time.
        logging.getLogger(__name__).addHandler(logging.NullHandler())

        self.names = NBATeamNames()

        # fresh dict per instance; a mutable default would be shared by all instances
        self.nbadotcom_games = {} if nbadotcom_games is None else nbadotcom_games

        if omit:
            self.omit = omit
        else:
            self.omit = ['teams', 'f', 'moneyline', 'moneyline_', 'movements', 'opening_odds', 'to to']

    def _fix_headers(self, headers_):
        '''
        Standardize with field names used by nba.com

        Args:
            headers_ (list):

        Returns:
            fixed (list):
        '''

        fixed = []

        convert = {
            'date': 'gamedate',
            'team_abbreviation': 'team_code',
            'fg': 'fgm',
            'ft': 'ftm',
            'dr': 'dreb',
            'or': 'oreb',
            'tot': 'reb',
            'a': 'ast',
            'st': 'stl',
            'bl': 'blk',
            'to': 'tov',
            '3p': 'fg3m',
            '3pa': 'fg3a',
            'spread': 'opening_spread',
            'total': 'opening_total'
        }

        for header in headers_:
            converted = convert.get(header, None)

            if converted:
                fixed.append(converted)

            else:
                fixed.append(header)

        return fixed

    def _fix_new_orleans(self, gamecode, season):
        '''
        Inconsistent naming of New orleans pelicans / hornets / oklahoma city (katrina year)
        This produces the correct 3-letter code to match up to nba.com gamecodes

        Args:
            gamecode (str): 20151031/CHINOP
            season (str): 2015-16

        Returns:
            gamecode (str):
        '''

        if season in ['2007', '2008', '2009', '2010', '2011', '2012']:
            return gamecode.replace('NOP', 'NOH')

        elif season in ['2005', '2006']:
            return gamecode.replace('NOP', 'NOK')

        else:
            return gamecode

    def _fix_starters(self, *args):
        '''
        Removes strange unicode characters in some of the player (starter) fields

        Args:
            variable length list of strings

        Returns:
            fixed (list): strings with unicode characters removed
        '''

        fixed = []
        
        for team in args:
            for k in team:
                if 'starter' in k:
                    team[k] = team[k].replace('\xc2\xa0', '')

            fixed.append(team)
            
        return fixed
        
    def _gameid(self, gamecode, dataset='Regular'):
        '''
        Takes gamecode and returns game_id used by nba.com

        Args:
            gamecode(str): format 20151027/CLECHI.
            dataset (str): name of dataset (regular season or playoff)

        Returns:
            gameid (str): 8 or 10-digit gamecode

        '''
        game = self.nbadotcom_games.get(gamecode)
        gameid = None

        if game:
            gameid = game.get('game_id', None)
            logging.debug('_gameid returns {0}'.format(gameid))

        else:
            # right now, don't have playoffs in games database, no need to print all of those errors
            if 'Regular' in dataset:
                logging.warning('_gameid: could not find id for gamecode {0}'.format(gamecode))
    
            else:
                logging.debug('_gameid: could not find id for playoff gamecode {0}'.format(gamecode))
            
        return gameid

    def _gamecode(self, away, home):
        '''
        Returns gamecode based on game_pair from nbastuffer dataset
        Game_pair is a list with two elements: first is dictionary of away team, second is dictionary of home team

        Args:
            away (dict): top row in game_pair
            home (dict): bottom row in game_pair

        Returns:
             gamecode (str): in format 20141031/CHICLE

        '''

        gamecode = None
        gamedate = away.get('gamedate', None)
        
        away_team = away.get('team_code', None)
        home_team = home.get('team_code', None)

        if gamedate and away_team and home_team:
            match = re.search(r'(\d+)\/(\d+)\/(\d+)', gamedate)
            
            if match:
                if len(match.group(3)) > 2:
                    gamecode = '{0}{1}{2}/{3}{4}'.format(match.group(3), match.group(1), match.group(2), away_team, home_team)
                    #logging.debug('gamecode: {0}'.format(gamecode))
                else:
                    gamecode = '20{0}{1}{2}/{3}{4}'.format(match.group(3), match.group(1), match.group(2), away_team, home_team)
                    #logging.debug('gamecode: {0}'.format(gamecode))

            else:
                gamecode = '{0}/{1}{2}'.format(gamedate, away_team, home_team)

        # new orleans pelicans / hornets / oklahoma city (katrina year) fix
        if gamecode and 'NOP' in gamecode:
            season = away.get('dataset', 'XXXX')[0:4]
            gamecode = self._fix_new_orleans(gamecode, season)

        logging.debug('_gamecode returns {0}'.format(gamecode))
        return gamecode

    def _get_closing(self, team1, team2, rowidx=0):
        '''
        Takes 2 team dictionaries, extracts cell with closing odds / closing (could be line or spread) or sets to None

        Args:
            team1 (dict):
            team2 (dict):
            rowidx (int):

        Returns:
            team1_odds (str):
            team2_odds (str):

        Examples:
            Various formats:
                Sometimes total like 198
                Sometimes + odds like 7
                Sometimes - odds like -7
                Sometimes a hybrid like -5.5 -05
                Sometimes includes PK (which is Pick'Em, 0 spread)
                Sometimes looks like -1.5-05
        '''
        
        team1_odds = team1.get('closing_odds', None)
        team2_odds = team2.get('closing_odds', None)
        
        # odds are stored under 'closing_odds' and 'closing' depending on the year
        if team1_odds == None or team1_odds == '':
            team1_odds = team1.get('closing', None)

            # if can't get closing, rely on opening
            if team1_odds == None or team1_odds == '':
                team1_odds = team1.get('spread', None)
                team2_odds = team1.get('total', None)
                logging.error('have to rely on opening odds: {0}, {1}: {2}'.format(team2.get('gamedate', 'Gamedate N/A'), team1.get('teams', 'Team N/A'), team2.get('teams', 'Team N/A')))

        # odds are stored under 'closing_odds' and 'closing' depending on the year
        if team2_odds == None or team1_odds == '':
            team2_odds = team2.get('closing', None)

            if team2_odds == None or team1_odds == '':
                team2_odds = team2.get('total', None)
                team1_odds = team1.get('spread', None)
                logging.error('have to rely on opening odds: {0}, {1}: {2}'.format(team2.get('gamedate', 'Gamedate N/A'), team1.get('teams', 'Team N/A'), team2.get('teams', 'Team N/A')))

        # if can't obtain anything, then skip further processing on odds
        if team1_odds == None or team1_odds == '':
            logging.error('error _get_closing: line %d | team1_odds: %s  team2_odds %s' % (rowidx, team1_odds, team2_odds))

        elif team2_odds == None or team1_odds == '':
            logging.error('error _get_closing: line %d | team1_odds: %s  team2_odds %s' % (rowidx, team1_odds, team2_odds))

        else:
            '''
            Takes 2 team dictionaries, extracts cell with closing odds / closing (could be line or spread) or sets to None
            Various formats:
                Sometimes a hybrid like -5.5 -05
                Sometimes includes PK (which is Pick'Em, 0 spread)
                Sometimes looks like -1.5-05
            '''

            # remove PK and set to zero
            if 'PK' in team1_odds:
                team1_odds = 0

            # otherwise strip out multiple odds if present
            else:
                match = re.search(r'([-]?\d+\.?\d?)\s?.*?', team1_odds)

                if match:
                    team1_odds = match.group(1)

            # remove PK and set to zero
            if 'PK' in team2_odds:
                team2_odds = 0

            # otherwise strip out multiple odds if present
            else:
                match = re.search(r'([-]?\d+\.?\d?)\s?.*?', team2_odds)

                if match:
                    team2_odds = match.group(1)

        logging.debug('team odds: {0}, {1}: {2}, {3}'.format(team1.get('teams', 'Team N/A'), team2.get('teams', 'Team N/A'), team1_odds, team2_odds))
               
        return team1_odds, team2_odds

    def _implied_total(self, game_total, spread):
        '''
        Takes game total and spread and returns implied total based on those values

        Args:
            game_total (float): something like 201.5
            spread (float): something like -1.5

        Returns:
            implied_total (float): something like 100.25
        '''

        try:
            return (float(game_total)/float(2)) - (float(spread)/float(2))

        except TypeError as e:
            logging.error('implied total error: {0}'.format(e.message))
            return None
                
    def _is_total_or_spread(self, val1, val2):
        '''
        Tests if it is a game total or a point spread; former is always larger

        Args:
            val1 (float):
            val2 (float):

        Returns:
            val_type (str): 'total' or 'spread'
        '''

        try:
            if float(val1) > float(val2):
                return 'total'
            else:
                return 'spread'

        except:
            logging.error('{0} or {1} is not a number'.format(val1, val2))
            return None

    def _point_spread(self, odds):
        '''
        Takes point spread, can be negative or positive, assumes that spread is for team1

        Args:
            odds (float):

        Returns:
            team1_spread (float):
            team2_spread (float):
        '''

        try:
            return float(odds), 0 - float(odds)

        except Exception as e:
            logging.exception(e.message)
            return None, None

    def _rest(self, team):
        '''
        `days_last_game` tinyint not null,
        `back_to_back` bool not null,
        `back_to_back_to_back` bool not null,
        `three_in_four` bool not null,
        `four_in_five` bool not null,
        3+, B2B, B2B2B, 3IN4, 3IN4-B2B, 4IN5, 4IN5-B2B

        '''

        team['days_last_game'] = None
        rest_days = team.get('rest_days', None)
        
        if rest_days is not None:

            # B2B and B2B2B
            if 'B2B' in rest_days:
                team['back_to_back'] = True
                team['days_last_game'] = 0
            else:
                team['back_to_back'] = False

            if 'B2B2B' in rest_days:
                team['back_to_back_to_back'] = True
            else:
                team['back_to_back_to_back'] = False

            # 3IN4
            if '3IN4' in rest_days:
                team['three_in_four'] = True
                team['days_last_game'] = 1
            else:
                team['three_in_four'] = False

            # 4IN5
            if '4IN5' in rest_days:
                team['three_in_four'] = True
                team['four_in_five'] = True
                team['days_last_game'] = 0

            else:
                team['four_in_five'] = False

            if re.match(r'\d{1}', rest_days):
                team['days_last_game'] = rest_days[0]

        return team

    def _team_abbrev(self, team_name):
        '''
        Converts an nbastuffer team name to a team code by delegating to
        self.names.city_to_code

        NBAStuffer uses the city name only, not the team name, which is annoying b/c New Orleans / Charlotte multiple teams over time

        Args:
            team_name (str): city name, e.g. the value of a row's 'teams' field

        Returns:
            whatever self.names.city_to_code returns for team_name
            (presumably a 3-letter code such as 'CHI' - confirm against NBATeamNames)
        '''

        return self.names.city_to_code(team_name)

    def _total_and_spread(self, team1, team2, rowidx):
        '''
        Spreadsheet/csv has odds in an inconsistent format, so have to wrangle to make it uniform
        Returns game_ou, away_spread, home_spread
        '''

        # team1_odds, team2_odds are in -8 195 format (depending on whether total or spread
        # type will be "total" or "spread"
        team1_odds, team2_odds = self._get_closing(team1, team2, rowidx)
        logging.info('team1 odds: {}'.format(team1_odds))
        logging.info('team2 odds: {}'.format(team2_odds))
        team1_type = self._is_total_or_spread(team1_odds, team2_odds)

        game_ou = None
        away_spread = None
        home_spread = None

        if team1_odds is not None and team2_odds is not None:
            # if team1_odds is total, then team2_odds is spread
            # set the game_ou and then calculate spreads
            if team1_type == 'total':       
                game_ou = team1_odds
                home_spread, away_spread = self._point_spread(team2_odds)
                
            # if team1_odds is a spread,
            # calculate spreads for both teams and then set the game_ou
            elif team1_type == 'spread':
                away_spread, home_spread = self._point_spread(team1_odds)
                game_ou = team2_odds
                
            else:
                logging.error('row {0}: not spread or line - {1} {2}'.format(rowidx, team1_odds, team2_odds))

        else:
            logging.error('row {0}: not spread or line - {1} {2}'.format(rowidx, team1_odds, team2_odds))
       
        return game_ou, away_spread, home_spread

    def game_pairs(self, rows, headers):
        '''
        Goes through data rows two at a time (grouped by home/away team in same game)
        Returns list of (list of 2 dictionaries (home and away info) that represents one game)

        Args:
            rows (list): lines from csv file
            headers (list): headers for each row

        Returns:
            gp (list): each game pair is a list of 2 teams that played in game
        '''

        gp = []
        
        for rowidx in range(0,len(rows),2):
            # merge all of the cells in the row with the headers
            # proceed in pairs because 2 rows make for one game

            team1 = dict(list(zip(headers, rows[rowidx].split(','))))
            team2 = dict(list(zip(headers, rows[rowidx+1].split(','))))
            team1, team2 = self._fix_starters(team1, team2)

            if team1 and team2:
                # convert team city to 3-letter code
                # add codes to both teams in game_pair
                team1['team_code'] = self._team_abbrev(team1.get('teams', None))
                team2['team_code'] = self._team_abbrev(team2.get('teams', None))
                team1['opponent_team_code'] = team2['team_code']
                team2['opponent_team_code'] = team1['team_code']
                team1['away_team'] = team1['team_code']
                team1['home_team'] = team2['team_code']
                team2['away_team'] = team1['team_code']
                team2['home_team'] = team2['team_code']

                # team ids
                team1['team_id'] = self.names.code_to_id(team1['team_code'])
                team2['team_id'] = self.names.code_to_id(team2['team_code'])
                team1['opponent_team_id'] = self.names.code_to_id(team1['opponent_team_code'])
                team2['opponent_team_id'] = self.names.code_to_id(team2['opponent_team_code'])
                team1['away_team_id'] = self.names.code_to_id(team1['away_team'])
                team2['away_team_id'] = self.names.code_to_id(team2['away_team'])
                team1['home_team_id'] = self.names.code_to_id(team1['home_team'])
                team2['home_team_id'] = self.names.code_to_id(team2['home_team'])

                # opponent points
                team1['opponent_points'] = team2['pts']
                team2['opponent_points'] = team1['pts']
                
                # spread and totals will be 196, -8 or -8, 196
                # add game_ou, away_spread, home_spread to both teams in game_pair
                game_ou, away_spread, home_spread = self._total_and_spread(team1, team2, rowidx)
                team1['game_ou'] = game_ou
                team1['away_spread'] = away_spread
                team1['home_spread'] = home_spread

                away_implied_total = self._implied_total(game_ou, away_spread)
                home_implied_total = self._implied_total(game_ou, home_spread)

                team1['away_implied_total'] = away_implied_total
                team1['home_implied_total'] = home_implied_total
                team2['game_ou'] = game_ou
                team2['away_spread'] = away_spread
                team2['home_spread'] = home_spread
                team2['away_implied_total'] = away_implied_total
                team2['home_implied_total'] = home_implied_total

                # gamecode is in 20151030/DETCHI
                # gameid is nbadotcom identifier for games
                gamecode = self._gamecode(team1, team2)
                game_id = self._gameid(gamecode, team1.get('dataset', 'Regular'))
                team1['gamecode'] = gamecode
                team2['gamecode'] = gamecode
                team1['game_id'] = game_id
                team2['game_id'] = game_id

                # rest
                team1 = self._rest(team1)
                team2 = self._rest(team2)

                # fix closing
                if 'closing' in team1:
                    team1['closing_odds'] = team1['closing']
                    team1.pop('closing')

                if 'closing' in team2:
                    team2['closing_odds'] = team2['closing']
                    team2.pop('closing')

                # fix gamedate
                if 'gamedate' in team1:
                    team1['game_date'] = team1['gamedate']
                    team1.pop('gamedate')

                if 'gamedate' in team2:
                    team2['game_date'] = team2['gamedate']
                    team2.pop('gamedate')

                # regular season
                if 'Regular' in team1['dataset']:
                    team1['is_regular_season'] = True
                    team2['is_regular_season'] = True

                # overtime
                if team1.get('ot1'):
                    team1['has_ot'] = True
                    team2['has_ot'] = True

                # omit some fields - can pass parameter or use defaults
                for field in self.omit:
                    team1.pop(field.lower(), None)
                    team2.pop(field.lower(), None)
                    team1.pop(field.upper(), None)
                    team2.pop(field.upper(), None)

                gp.append([team1, team2])

            else:
                logging.error('could not get team1 or team2')

        return gp

    def headers(self, row):
        '''
        Takes first row of sheet or csv file and returns lowercase column header with no spaces
        '''

        return self._fix_headers([re.sub(r'\s+', '_', c).strip().lower() for c in row])
        
    def xlsx_game_pairs(self, sheet, headers):
        '''
        Goes through data rows two at a time (grouped by home/away team in same game)
        Returns list of (list of 2 dictionaries (home and away info) that represents one game)
        '''
        
        gp = []
        
        for rowidx in range(1,sheet.nrows,2):
            team1 = dict(list(zip(headers, rows[rowidx])))
            team2 = dict(list(zip(headers, rows[rowidx+1])))

            if team1 and team2:
                # convert team city to 3-letter code
                team1['team_code'] = self._team_abbrev(team1.get('teams', None))
                team2['team_code'] = self._team_abbrev(team2.get('teams', None))
                team1['away_team'] = team1['team_code']
                team1['home_team'] = team2['team_code']
                team2['away_team'] = team1['team_code']
                team2['home_team'] = team2['team_code']

                # spread and totals will be 196, -8 or -8, 196
                game_ou, away_spread, home_spread = self._total_and_spread(team1, team2, rowidx)
                team1['game_ou'] = game_ou
                team1['away_spread'] = away_spread
                team1['home_spread'] = home_spread
                team2['game_ou'] = game_ou
                team2['away_spread'] = away_spread
                team2['home_spread'] = home_spread

                # gamecode is in 20151030/DETCHI
                # gameid is nbadotcom identifier for games
                gamecode = self._gamecode(game_pair)
                game_id = self._gameid(gamecode)
                team1['gamecode'] = gamecode
                team2['gamecode'] = gamecode
                team1['game_id'] = game_id
                team2['game_id'] = game_id
                team2['gamecode'] = gamecode

                # rest
                team1 = self._rest(team1)
                team2 = self._rest(team2)

                gp.append([team1, team2])

            else:
                logging.error('%s | row %d: could not get team1 or team2 - %s' % (sheet.name, rowidx))

        return gp

    def xlsx_headers(self, sheet):
        '''
        Takes first row of sheet and returns lowercase column header with no spaces
        Still need to address issue of starting players, format in spreadsheet is off
        '''

        _headers = []

        for colidx in range(0, sheet.ncols):
            header = sheet.cell(0, colidx).value.strip()
            header = re.sub(r'\s+', '_', header)

            if header == '':
                header = 'starting_lineups'

            _headers.append(header.lower())

        return _headers
Ejemplo n.º 8
0
class PinnacleNBAParser(object):
    '''
    Takes xml from scraper, returns list of game dictionaries

    Usage:
        with open('/home/sansbacon/1.xml', 'r') as infile:
            xml = infile.read()

            - OR -

        s = PinnacleNBAScraper()
        xml = s.odds()

        p = PinnacleNBAParser()
        games = p.odds(xml)                  # every game in the feed
        games = p.odds(xml, '2015-10-30')    # only games on that local date
        p.to_csv(games, date)
        p.to_tsv(games, date)
        p.to_fw(games, date)

    '''

    def __init__(self, **kwargs):
        '''
        Keyword Args:
            game_keys (list): game-dictionary keys written out by to_csv, to_tsv and to_fw
            team_names: object providing long_to_short(name); an NBATeamNames()
                is created when this is not supplied
        '''

        self.logger = logging.getLogger(__name__)

        self.game_keys = kwargs.get(
            'game_keys',
            ['game_total', 'away_team', 'away_total', 'home_team', 'home_total'])

        # only build the default lookup when the caller did not supply one
        if 'team_names' in kwargs:
            self.team_names = kwargs['team_names']
        else:
            self.team_names = NBATeamNames()

    def _child_text(self, node, name):
        '''
        Returns the text of the first child node called name, or None when there is no such child
        '''
        child = node.find(name)
        return child.get_text() if child is not None else None

    def _convert_from_utc(self, datestr):
        '''
        Takes a 'YYYY-MM-DD HH:MM' string in UTC, returns an aware datetime in the local timezone
        http://stackoverflow.com/questions/4770297/python-convert-utc-datetime-string-to-local-datetime
        '''
        utc = datetime.strptime(datestr, '%Y-%m-%d %H:%M').replace(tzinfo=tz.tzutc())
        return utc.astimezone(tz.tzlocal())

    def _game_code(self, game, date_local):
        '''
        Returns gamecode in 20151030/DETCHI format (local date / away team + home team)
        '''
        return '{0}/{1}{2}'.format(date_local.strftime('%Y%m%d'), game['away_team'], game['home_team'])

    def _implied_total(self, game_total, spread):
        '''
        Takes game total and spread and returns implied total based on those values
        '''
        return (game_total/float(2)) - (spread/float(2))

    def _multikeysort(self, items, columns):
        '''
        Sorts a list of dictionaries on several keys; prefix a key with '-' to sort it descending
        Adapted from https://wiki.python.org/moin/SortingListsOfDictionaries

        Args:
            items (list): dictionaries to sort
            columns (list): key names in priority order, e.g. ['-game_total', 'away_team']

        Returns:
            list: new sorted list; items that compare equal keep their original order
        '''
        # the builtin cmp() and sorted(cmp=...) do not exist on Python 3
        from functools import cmp_to_key

        comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else (itemgetter(col.strip()), 1))
                     for col in columns]

        def comparer(left, right):
            for fn, mult in comparers:
                lval, rval = fn(left), fn(right)
                result = (lval > rval) - (lval < rval)
                if result:
                    return mult * result
            return 0

        return sorted(items, key=cmp_to_key(comparer))

    def _team_code(self, participant):
        '''
        Returns the short team name for a participant node, or its long name when no short name is found
        '''
        long_name = participant.find('participant_name').get_text()
        return self.team_names.long_to_short(long_name) or long_name

    def _parse_period(self, period, game):
        '''
        Adds spread, game total and implied team totals from the period node to game (in place)
        Totals are only read when the period also has a spread node
        '''
        spread = period.find('spread')
        if spread is None:
            return

        self.logger.debug('found spread for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
        game['spread_away'] = spread.find('spread_visiting').get_text()
        game['spread_home'] = spread.find('spread_home').get_text()
        self.logger.debug('spread away, home: {0}, {1}' \
                          .format(game.get('spread_away', '<missing>'),game.get('spread_home', '<missing>')))

        # totals, including calculated implied totals
        total = period.find('total')
        if total is None:
            return

        self.logger.debug('found total for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
        game_total = total.find('total_points').get_text()
        if not game_total:
            return

        self.logger.debug('found game_total for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
        game['game_total'] = game_total
        game['home_total'] = self._implied_total(float(game_total), float(game['spread_home']))
        game['away_total'] = self._implied_total(float(game_total), float(game['spread_away']))
        self.logger.debug('away, home: {0}, {1}'.format \
           (game.get('away_total', '<missing>'), game.get('home_total', '<missing>')))

    def _parse_event(self, event):
        '''
        Handles the heavy lifting parsing each event node

        Returns:
            dict: whichever of these keys could be read from the event - date_gmt, date_local,
                display_date, start, gamenumber, away_team, home_team, home_team_number,
                nbacom_gamecode, spread_away, spread_home, game_total, home_total, away_total
        '''

        game = {}
        date_local = None

        # game information
        date_gmt = self._child_text(event, 'event_datetimegmt')
        if date_gmt:
            game['date_gmt'] = date_gmt
            date_local = self._convert_from_utc(date_gmt)
            game['date_local'] = date_local.strftime('%Y-%m-%d')
            game['display_date'] = date_local.strftime('%m-%d %H:%M')
            game['start'] = date_local.strftime('%H:%M')

        gamenumber = self._child_text(event, 'gamenumber')
        if gamenumber:
            game['gamenumber'] = gamenumber

        # teams, use long_to_short to get 3 letter team codes
        # anything other than an away/home pair cannot be unpacked, so it is skipped
        participants = event.findAll('participant')
        if len(participants) == 2:
            away, home = participants
            game['away_team'] = self._team_code(away)
            game['home_team'] = self._team_code(home)
            game['home_team_number'] = home.find('rotnum').get_text()

            # gamecode needs the local date, which is unknown when the event has no datetime
            if date_local is not None:
                game['nbacom_gamecode'] = self._game_code(game, date_local)

        # o/u and point spread are found in the period node
        # contains spread and total nodes
        period = event.find('period')
        if period is not None:
            self.logger.debug('found period for {0}'.format(game.get('nbacom_gamecode', '<missing>')))
            self._parse_period(period, game)
        else:
            self.logger.debug('did not find period')

        return game

    def _sort_games(self, games, games_no_totals):
        '''
        Sorts list of dictionaries descending on game_total, then adds games with no totals to end of list
        '''

        # NOTE(review): game_total is stored as the text read from the xml, so this
        # compares strings rather than numbers - confirm that is acceptable
        ordered = self._multikeysort(games, ['-game_total'])

        if games_no_totals:
            ordered += games_no_totals

        return ordered

    def odds(self, xml, game_date_cst=None):
        '''
        Takes xml in, returns sorted and filtered list of game dictionaries

        Args:
            xml (str): odds feed
            game_date_cst (str): YYYY-MM-DD; when given, only games whose local date matches are returned

        Returns:
            list: games with totals sorted descending on game_total, followed by games with no totals
        '''

        # TODO:
            # need to filter based on <league>NBA</league> and <league>Live NBA</league>

        games = []
        games_no_totals = []

        # get the list of events
        soup = BeautifulSoup(xml)
        events = soup.findAll('event')
        self.logger.debug('there are {0} events'.format(len(events)))

        # last item is unwanted - All NFL Games Will Have Second Half Wagering
        for event in events[:-1]:
            game = self._parse_event(event)
            self.logger.debug(pprint.pformat(game))

            # filter if specify game date
            if game_date_cst and game.get('date_local', None) != game_date_cst:
                continue

            if game.get('game_total', None):
                games.append(game)
            else:
                games_no_totals.append(game)

        # sort results and then return
        return self._sort_games(games, games_no_totals)

    def _table(self, games, datestr):
        '''
        Returns header row plus one row of string values per game; missing keys show as 'None'
        '''
        keys = list(self.game_keys)
        rows = [['date'] + keys]
        rows.extend([datestr] + [str(game.get(gk, None)) for gk in keys] for game in games)
        return rows

    def to_csv (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs csv (fields separated by comma and space)
        '''
        return '\n'.join(', '.join(row) for row in self._table(games, datestr))

    def to_fw (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs fixed-width table (columns padded to 12 characters)
        '''
        return '\n'.join(' '.join('{:12s}'.format(value) for value in row)
                         for row in self._table(games, datestr))

    def to_tsv (self, games, datestr):
        '''
        Takes list of game dictionaries, outputs tsv
        '''
        return '\n'.join('\t'.join(row) for row in self._table(games, datestr))