Example #1
def getAll(dt, files=None):
    dbobj = db.Db(config.config['db'])
    step_time = time.time()

    #logging.info("MASTER - starting ETL job - date: %s - database: %s" % (dt, db_credentials['db']))
    # Default set of files/tables to populate
    if not files:
        files = [
            'boxscore_nbacom', 'boxscore_cbssports', 'playbyplay_espn',
            'playbyplay_nbacom', 'shotchart_cbssports', 'shotchart_espn',
            'shotchart_nbacom'
        ]

    # Choose games
    games = chooseGames(dt, dbobj)

    # MAIN ETL PROCESS
    #print "+++ MASTER ETL - files: %s - database: %s" % (str(files), db_credentials['db'])
    # Get source
    gamedata = source.main.go(games, files)

    extract.main.go(gamedata)
    clean.main.go(gamedata, dbobj)
    load.main.go(gamedata, dbobj)

    #afterclean2.main.go(gamedata, dbobj)

    tomorrow = dt + datetime.timedelta(days=1)
    #findgames.go(tomorrow)

    time_elapsed = "Total time: %.2f sec" % (time.time() - step_time)
    print time_elapsed
    logging.info(time_elapsed)
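
A minimal sketch of how getAll might be invoked, assuming it is driven by a small nightly entry point; the __main__ guard and the choice of yesterday's date are hypothetical and not part of the code shown above.

if __name__ == '__main__':
    import datetime

    # Hypothetical driver: run the full ETL for yesterday's slate of games
    target_date = datetime.date.today() - datetime.timedelta(days=1)
    getAll(target_date)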
Example #2
def cleanOnly(dt):
    dbobj = db.Db(db.dbconn_nba_test)

    games = chooseGames(dt)
    gamedata = getExistingSourceDocs(games)

    clean.main.go(gamedata, dbobj)
Example #3
def scrapeDailyAuto(dt, files=None):
    step_time = time.time()

    dbobj = db.Db(config.config['db'])
    config_no_pw = config.config['db'].copy()
    del config_no_pw['passwd']

    if not files:
        files = [
            'boxscore_nbacom',
            'boxscore_cbssports',
            'playbyplay_espn',
            'playbyplay_nbacom',
            'shotchart_cbssports',
            'shotchart_espn',
            'shotchart_nbacom'
        ]

    # MAIN ETL PROCESS
    print "+++ MASTER ETL - files: %s - database: %s" % (str(files), config_no_pw)
    logging.info("MASTER - starting ETL job - date: %s - database: %s" % (dt, config_no_pw))

    scrape(dbobj, dt, files)

    tomorrow = dt + datetime.timedelta(days=1)

    time_elapsed = "Total time: %.2f sec" % (time.time() - step_time)
    logging.info(time_elapsed)
Example #4
def restartFromClean(dt):
    dbobj = db.Db(db.dbconn_nba)

    games = chooseGames(dt)
    gamedata = getExistingSourceDocs(games)

    clean.main.go(gamedata, dbobj)
    load.main.go(gamedata, dbobj)
Example #5
def main():

    dbobj = db.Db(db.dbconn_nba)

    game = dbobj.query_dict("SELECT * FROM game WHERE id = %s" % (2734))[0]
    filename = '%s_playbyplay_nbacom' % (game['abbrev'])
    obj = Clean(filename, game, dbobj)
    data = obj.getPlayByPlayData()
Example #6
def main():

    dbobj = db.Db(config.dbconn_prod_nba)

    game = dbobj.query_dict("SELECT * FROM game WHERE id = %s" % (4766))[0]
    filename = '%s_boxscore_statsnbacom' % (game['abbrev'])
    obj = Clean(filename, game, dbobj)
    obj.clean()
Example #7
def test():
    dbobj = db.Db(db.dbconn_nba)

    files = [f for f in os.listdir(LOGDIR_CLEAN) if 'game_stats' in f]

    obj = Load(dbobj)

    for f in files:
        obj.loadBoxScoreNbaComGameStats(f)
Example #8
    def setUp(self):
        self.game_id = 1
        self.pbp_nbacom = pbp_nbacom.Clean(
            'fake_filename', {
                'away_team_id': 1,
                'home_team_id': 2,
                'abbrev': 'game',
                'id': self.game_id,
                'date_played': '2013-01-01'
            }, db.Db(config.dbconn_prod_nba))
Example #9
def main():
    dbobj = db.Db(config.dbconn_prod_nba)

    gamedata = dbobj.query_dict("SELECT * FROM game WHERE id = 4766")[0]
    filename = '%s_boxscore_statsnbacom' % (gamedata['abbrev'])

    #obj = PlayerStatsNbaCom(filename, gamedata, dbobj)
    #obj.resolveNewPlayers()
    obj = Resolve(dbobj)
    obj.resolveStatsNbacom()
Example #10
def main():

    dbobj = db.Db(config.dbconn_prod_nba)
    files = [
        f for f in os.listdir('../../dump/extract')
        if 'shotchart_statsnbacom' in f
    ]
    f = '2013-11-26_LAL@WAS_shotchart_statsnbacom'
    gamedata = dbobj.query_dict("SELECT * FROM game WHERE id = 4356")[0]

    obj = Clean(f, gamedata, dbobj)
    obj.clean()
Example #11
def main():

    dbobj = db.Db(db.dbconn_prod)
    files = [
        f for f in os.listdir('../../dump/extract') if 'shotchart_nbacom' in f
    ]
    f = '2013-06-20_SA@MIA_shotchart_nbacom'
    gamedata = dbobj.query_dict("SELECT * FROM game WHERE id = 4043")[0]

    obj = CleanWnba(f, gamedata, dbobj)
    shots = obj.getShots()
    print obj.resolveShotCoordinates(shots)
Example #12
def aftercleanOnly(dt, files=None):
    dbobj = db.Db(config.config['db'])

    if not files:
        files = [
            'boxscore_nbacom', 'boxscore_cbssports', 'playbyplay_espn',
            'playbyplay_nbacom', 'shotchart_cbssports', 'shotchart_espn',
            'shotchart_nbacom'
        ]
    games = chooseGames(dt, dbobj)
    gamedata = source.main.go(games, files)
    #afterclean2.main.go(gamedata, dbobj)
    afterclean2.gm.main.postCleanGame(gamedata, dbobj)
Example #13
def main():

    files = [f for f in os.listdir(LOGDIR_EXTRACT) if '2011-12' in f and 'boxscore' in f]
    f = '2012-01-18_OKC@WAS_boxscore_nbacom'

    gamedata = db.nba_query_dict("SELECT * FROM game where date_played <= '2012-02-10'") 
    dbobj = db.Db(db.dbconn_nba)

    for game in gamedata:
        #print game['abbrev']
        filename = game['abbrev'] + '_boxscore_nbacom'

        obj = CleanBoxScore(filename, game, dbobj)
        result = obj.getGameInfo()
Example #14
def mainfunc():
    dbobj = db.Db(db.dbconn_nba_test)

    gamedata = dbobj.query_dict("SELECT * FROM game WHERE date_played <= '2012-02-01' ORDER BY date_played ASC")

    for g in gamedata:
        fnba = g['abbrev'] + '_boxscore_nbacom'
        fcbs = g['abbrev'] + '_shotchart_cbssports'
        if fnba in os.listdir(LOGDIR_EXTRACT):
            print g['abbrev']
            obj = player.PlayerNbaCom(LOGDIR_EXTRACT + fnba, g, dbobj)
            obj.resolveNewPlayers()
            obj = player.PlayerCbsSports(LOGDIR_EXTRACT + fcbs + '_players', g, dbobj)
            obj.resolveNewPlayers()
Example #15
def main():
    dbobj = db.Db(db.dbconn_prod)
    game = dbobj.query_dict("""
        SELECT g.*, home_team.city home_team_city, away_team.city away_team_city 
        FROM game g 
            INNER JOIN team home_team on home_team.id = g.home_team_id
            INNER JOIN team away_team on away_team.id = g.away_team_id
        WHERE g.id = 1
            AND g.should_fetch_data = 1
    """)[0]
    filename = game['abbrev'] + '_playbyplay_espn'

    obj = Extract(open(LOGDIR_SOURCE + filename, 'r').read(), filename, game)
    data = obj._getData()
    for row in data:
        print row
Example #16
def saveStatsNbaCom():
    dbobj = db.Db(config.dbconn_prod_nba)

    f = open('games.json','r')

    lines = f.readlines()
    for line in lines:
        data = json.loads(line.rstrip())
        if data:
            for row in data:
                sql = """
                    UPDATE game
                    SET national_tv = '%s',
                        statsnbacom_game_id = '%s',
                        gametime = '%s'
                    WHERE
                        nbacom_game_id = '%s'
                """ % (row['NATL_TV_BROADCASTER_ABBREVIATION'], row['GAME_ID'], row['GAME_STATUS_TEXT'], row['GAMECODE'])
                dbobj.query(sql)
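
The UPDATE above builds its SQL by interpolating scraped values straight into the query string. As a rough alternative, the sketch below goes through MySQLdb directly with placeholder parameters; the helper name and the connection keyword arguments are assumptions, since the actual signature of db.Db.query is not shown here.

import json

import MySQLdb


def saveStatsNbaComParameterized(conn_params):
    # Sketch only: same field mapping as above, but with MySQLdb placeholders
    # instead of string interpolation. conn_params is assumed to be a dict
    # with host/user/passwd/db keys.
    conn = MySQLdb.connect(**conn_params)
    cursor = conn.cursor()
    with open('games.json', 'r') as f:
        for line in f:
            data = json.loads(line.rstrip())
            if not data:
                continue
            for row in data:
                cursor.execute(
                    """
                    UPDATE game
                    SET national_tv = %s,
                        statsnbacom_game_id = %s,
                        gametime = %s
                    WHERE nbacom_game_id = %s
                    """,
                    (row['NATL_TV_BROADCASTER_ABBREVIATION'], row['GAME_ID'],
                     row['GAME_STATUS_TEXT'], row['GAMECODE']))
    conn.commit()
    cursor.close()
    conn.close()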
Example #17
def getCoreData(dt, files=None):
    dbobj = db.Db(config.config['db'])
    step_time = time.time()

    # Default set of files/tables to populate
    if not files:
        files = [
            'boxscore_nbacom', 'boxscore_cbssports', 'playbyplay_espn',
            'playbyplay_nbacom', 'shotchart_cbssports', 'shotchart_espn',
            'shotchart_nbacom'
        ]

    # Choose games
    games = chooseGames(dt, dbobj)

    # MAIN ETL PROCESS
    #print "+++ MASTER ETL - files: %s - database: %s" % (str(files), db_credentials['db'])
    # Get source
    gamedata = source.main.go(games, files)

    extract.main.go(gamedata)
    clean.main.go(gamedata, dbobj)
    load.main.go(gamedata, dbobj)
Example #18
def go(dt, files=None):

    conf = config.league['wnba']
    league = 'wnba'

    dbobj = db.Db(conf['db'])
    step_time = time.time()

    config_no_pw = conf['db'].copy()  # copy so the live connection config keeps its password
    del config_no_pw['passwd']
    logging.info("MASTER - starting ETL job - date: %s - database: %s" %
                 (dt, config_no_pw))
    # Default set of files/tables to populate
    if not files:
        files = [
            'boxscore_nbacom', 'playbyplay_espn', 'playbyplay_nbacom',
            'shotchart_espn', 'shotchart_wnbacom'
        ]

    # Choose games
    games = _chooseGames(dt, dbobj)

    # MAIN ETL PROCESS
    print "+++ MASTER ETL - files: %s - database: %s" % (str(files),
                                                         conf['db'])

    # Get source
    gamedata = source.main.go(games, files, league)

    extract.main.go(gamedata)
    clean.main.go(gamedata, dbobj)
    load.main.go(gamedata, dbobj)

    tomorrow = dt + datetime.timedelta(days=1)

    time_elapsed = "Total time: %.2f sec" % (time.time() - step_time)
    logging.info(time_elapsed)
Example #19
from bs4 import BeautifulSoup
import urllib2
import re
import datetime
from libscrape.config import db
from libscrape.config import config
import time
import json


# A script that scrapes ESPN.com's scoreboard to retrieve ESPN's game_id and store into MySQL db
dbobj = db.Db(config.dbconn_prod_nba)

def getEspnScoreboardDoc(dt): 
    url = 'http://espn.go.com/nba/scoreboard?date=%s' % dt.isoformat().replace('-','')
    response = urllib2.urlopen(url)
    return response.read()


def getEspnData(html, season):
    soup = BeautifulSoup(html)
    links = soup.findAll(href=re.compile("/nba/conversation.*"))
    links = list(set(links))
    
    game_ids = []
    for l in links:
        match = re.search("/nba/conversation\?gameId=(?P<game_id>[0-9]+)$",l['href'])
        
        if match:
            found = match.groupdict()
            game_ids.append(found['game_id'])
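
A short usage sketch tying the two helpers above together; whether getEspnData returns the collected game_ids (the snippet is cut off before its return) and the exact season label format are assumptions.

if __name__ == '__main__':
    # Hypothetical driver: pull one day's ESPN scoreboard and list the game ids found
    dt = datetime.date(2013, 1, 1)
    html = getEspnScoreboardDoc(dt)
    espn_game_ids = getEspnData(html, season='2012-13')  # return value assumed
    print espn_game_ids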
Example #20
def main():
    dbobj = db.Db(db.dbconn_nba)
    game = dbobj.query_dict("SELECT * FROM game WHERE id = 2366")[0]
    obj = Clean(game['abbrev'] + '_playbyplay_espn', game, dbobj)
    obj._clean()
Example #21
import sys
import datetime
import logging

from collections import defaultdict
from pulp import *

sys.path.append("../")
from libscrape.config import constants
from libscrape.config import db
from libscrape.config import config

LOGDIR_SOURCE = constants.LOGDIR_SOURCE
LOGDIR_EXTRACT = constants.LOGDIR_EXTRACT

logging.basicConfig(filename='etl.log',
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

dbobj = db.Db(config.config['db'])
logging.info("Starting create table with database: %s" % (dbobj))


def aggregate_possessions_from_dates(cursor,
                                     start_date=datetime.date(2010, 10, 26),
                                     end_date=datetime.date.today(),
                                     max_salary=45):

    # get the stats

    sql = """
  SELECT pgp.player_id, 
         p.full_name, 
         sum(pgp.points_scored) / sum(pgp.offensive_possessions), 
         sum(pgp.points_allowed) / sum(pgp.defensive_possessions),
Example #22
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import datetime
from libscrape.config import db
import time

# A script that scrapes ESPN.com's scoreboard to retrieve ESPN's game_id and store into MySQL db
dbobj = db.Db(db.dbconn_nba)


def getScoreboardDoc(dt):
    url = 'http://espn.go.com/nba/scoreboard?date=%s' % dt.isoformat().replace(
        '-', '')
    response = urllib2.urlopen(url)
    return response.read()


def getGameIdsAndTeams(html):
    soup = BeautifulSoup(html)
    links = soup.findAll(href=re.compile("/nba/conversation.*"))
    links = list(set(links))

    game_ids = []
    for l in links:
        match = re.search("/nba/conversation\?gameId=(?P<game_id>[0-9]+)$",
                          l['href'])

        if match:
            found = match.groupdict()
            game_ids.append(found['game_id'])
Example #23
def loadOnly(dt):
    dbobj = db.Db(db.dbconn_nba)
    games = chooseGames(dt)
    gamedata = getExistingSourceDocs(games)

    load.main.go(gamedata, dbobj)
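
Taken together, the stage-specific entry points shown above (getAll, cleanOnly, restartFromClean, loadOnly) lend themselves to a small dispatcher. The sketch below is hypothetical: it assumes those functions live in one module and that the date is passed as YYYY-MM-DD on the command line.

import datetime
import sys


def run(mode, dt):
    # Hypothetical dispatcher over the entry points shown above
    stages = {
        'all': getAll,
        'clean': cleanOnly,
        'restart': restartFromClean,
        'load': loadOnly,
    }
    stages[mode](dt)


if __name__ == '__main__':
    mode = sys.argv[1]
    dt = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d').date()
    run(mode, dt)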