def extract(gender, year, div, org, game, path):
    """Append a one-row summary of ``org``'s game found in ``path`` to ``output``.

    Reads ``Box Score - All (Parsed).csv`` and ``Info.csv`` from ``path``,
    picks the last box-score row whose column 3 equals ``org`` and appends
    ``[gender, year, div] + row[:2] + [org, opponent, "Home"/"Away"] +
    row[6:] + [location, attendance]`` to the module-level ``output`` list.

    Args:
        gender, year, div: Copied verbatim into the first three output cells.
        org: Team name to look up in column 3 of the box score.
        game: Not used by this function.
        path: Directory that holds the game's CSV files.

    Returns:
        None. A missing box-score file or any error while processing is
        reported on stderr and the game is skipped.
    """
    box_path = join(path, 'Box Score - All (Parsed).csv')
    if not exists(box_path):
        print("Missing: {}".format(path), file=sys.stderr)
        return

    try:
        box = loadTable(box_path)

        # Last data row (header skipped) that belongs to ``org``.
        row = [r for r in box[1:] if r[3] == org][-1]

        # The team named in the final row of the table is treated as the
        # home side; the opponent is then read from the opposite end.
        isHome = row[3] == box[-1][3]
        home = "Home" if isHome else "Away"
        opponent = box[1 if isHome else -1][3]

        # Info.csv: row 0 holds the labels, row 1 the matching values.
        # On duplicate labels the last value wins, as in a sequential scan.
        info = loadTable(join(path, 'Info.csv'))
        fields = dict(zip(info[0], info[1]))
        location = fields.get("Location:", "")
        attendance = fields.get("Attendance:", "")

        output.append(
            [gender, year, div]
            + row[:2]
            + [org, opponent, home]
            + row[6:]
            + [location, attendance]
        )
    except Exception:
        # Best-effort batch step: report the failing game and move on, but
        # let KeyboardInterrupt / SystemExit propagate (a bare ``except:``
        # would swallow them too).
        print("Exception: {}".format(path), file=sys.stderr)
def loadDateTime():
    """Split the first cell of the info table's second row into two parts.

    The cell is matched as a leading run of non-space characters followed,
    optionally, by spaces and the remainder of the text.

    Returns:
        A two-element list ``[first_token, remainder]``; each element is
        passed through ``withDefault(..., default="")`` (presumably turning
        a missing group into an empty string -- confirm against the helper).
    """
    pattern = re.compile(r"^([^ ]+)(?: +(.+))?$")
    rows = loadTable(infofile)
    cell = rows[1][0]
    match = pattern.match(cell)
    first = withDefault(match.group(1), default="")
    rest = withDefault(match.group(2), default="")
    return [first, rest]
def loadPlayers(team):
    """Return the roster of ``team`` as ``(name, flag)`` tuples.

    The per-period file ``"<period> - <team>.csv"`` located next to
    ``datafile`` is preferred; when it does not exist the full-game file
    ``"Box Score - <team>.csv"`` is read instead.

    Returns:
        One ``(row[0], row[2] != "")`` tuple for every table row except the
        header and the last two rows, followed by the sentinel entry
        ``("Team", None)``.
    """
    folder, _ = os.path.split(datafile)

    candidate = os.path.join(folder, period + " - {}.csv".format(team))
    if os.path.exists(candidate):
        filename = candidate
    else:
        filename = os.path.join(folder, "Box Score - {}.csv".format(team))

    rows = loadTable(filename)
    roster = []
    for entry in rows[1:-2]:
        roster.append((entry[0], entry[2] != ""))
    roster.append(("Team", None))
    return roster
def parse(datafile, period, infofile):
    """Build the annotated play-by-play table for one period.

    Args:
        datafile: Path of the period's play-by-play CSV. Its header row is
            expected to carry the two team names in columns 1 and 3.
        period: Period label; written into every row and used to locate the
            per-period box score files next to ``datafile``.
        infofile: Path of the game's info CSV (source of the date/time cell).

    Returns:
        A list of rows. The first row is the header
        ``Date, Time, Period, Time Left, Score, Team, Player, Status, Action,
        Shot Clock, Lineup, Lineup Time``; every following row describes one
        play-by-play event whose text matched the event pattern.

    Raises:
        ValueError: If the minutes of the first event's clock are outside
            1..20, so the period length cannot be inferred. (Previously this
            surfaced later as an ``AttributeError`` on ``None``.)
        IndexError: If no event row could be parsed (the table only holds
            the header).
    """
    # Seconds used for the "Shot Clock" column.
    shotClockSeconds = 30

    def computeSeconds(t):
        # "M:SS" -> total number of seconds.
        m, s = tuple(int(v) for v in t.split(":"))
        return m * 60 + s

    def computeTime(t):
        # Total number of seconds -> "M:SS".
        (x, y) = divmod(t, 60)
        return "{}:{:02d}".format(x, y)

    pbpTable = loadTable(datafile)

    # Event text: group 1 is a number or an upper-case name, group 2 the
    # optional "made"/"missed" status, group 3 the action words.
    rePBP = re.compile(
        r"^([0-9]+|[A-Z.`'-?\xbf]*(?:(?:, *| +)[A-Z.`'-?\xbf]*)*)" +
        r"(?:(?:^| +)(made|missed))?" +
        r"(?:(?:^| +)((?:[A-Z][a-z]*|[0-9]+)(?: +[A-Z][a-z]*|[0-9]+)*))$",
        re.U
    )

    teams = [pbpTable[0][1], pbpTable[0][3]]

    def loadDateTime():
        # First cell of the info table's second row, split into the leading
        # token and the remainder after the spaces.
        reDateTime = re.compile(r"^([^ ]+)(?: +(.+))?$")
        infoTable = loadTable(infofile)
        m = reDateTime.match(infoTable[1][0])
        return [withDefault(m.group(1), default=""), withDefault(m.group(2), default="")]

    def loadPlayers(team):
        # Prefer the per-period box score; fall back to the full-game one.
        folder = os.path.split(datafile)[0]
        filename = os.path.join(folder, period + " - {}.csv".format(team))
        if not os.path.exists(filename):
            filename = os.path.join(folder, "Box Score - {}.csv".format(team))
        boxScoreTable = loadTable(filename)
        # (name, flag) with flag = "column 2 is non-empty"; the trailing
        # ("Team", None) entry lets team-level events resolve to "Team".
        return [(r[0], r[2] != "") for r in boxScoreTable[1:-2]] + [("Team", None)]

    dateTime = loadDateTime()
    players = [loadPlayers(team) for team in teams]

    def buildTable():
        table = [["Date", "Time", "Period", "Time Left", "Score", "Team", "Player", "Status", "Action"]]
        for r in pbpTable[1:]:
            if len(r) < 4:
                break
            # The event text sits in column 1 for the first team and in
            # column 3 for the second one.
            flag = r[1] != ""
            side = 0 if flag else 1
            m = rePBP.match(r[1 if flag else 3])
            if m is None:
                continue
            table.append(
                dateTime
                + [period]
                + [
                    r[0],
                    r[2],
                    teams[side],
                    best(
                        [p[0] for p in players[side]],
                        withDefault(m.group(1), "").replace(",", ", ")
                    ),
                    withDefault(m.group(2), ""),
                    withDefault(m.group(3), ""),
                ]
            )
        return table

    table = buildTable()

    def getPeriodLength(table):
        # Infer the period length from the minutes shown on the first event.
        maxTime = table[1][3]
        n = int(maxTime.split(':')[0])
        if 10 < n <= 20:
            return "20:00"
        if 5 < n <= 10:
            return "10:00"
        if 0 < n <= 5:
            return "5:00"
        # Falling through used to return None, which always blew up further
        # down inside computeSeconds with an unhelpful AttributeError.
        raise ValueError(
            "Cannot infer period length from first event time {!r}".format(maxTime)
        )

    periodLength = getPeriodLength(table)

    def addTimer(table):
        table[0].append("Shot Clock")
        lastTime = periodLength
        for r in table[1:]:
            time = r[3]
            # r[-1] is still the "Action" column at this point.
            if r[-1] == "Turnover":
                lastTime = time
            if r[-1] in attacks or r[-1] in defends:
                diff = min(shotClockSeconds, computeSeconds(lastTime) - computeSeconds(time))
                r.append(str(shotClockSeconds - diff) if diff > 0 else "")
                lastTime = time
            else:
                r.append("")

    addTimer(table)

    def addLineup(table):
        # Start from the players flagged in the box score ...
        initPlayers = (
            set(p[0] for p in players[0] if p[1]),
            set(p[0] for p in players[1] if p[1])
        )
        # ... and undo the substitutions from the last event back to the
        # first one. r[-2] is the "Action" column ("Shot Clock" is r[-1]).
        for r in reversed(table[1:]):
            player = r[6]
            i = teams.index(r[5])
            if player == "Team":
                continue
            if r[-2] == "Enters Game":
                if player in initPlayers[i]:
                    initPlayers[i].remove(player)
            elif r[-2] == "Leaves Game":
                initPlayers[i].add(player)

        table[0].extend(["Lineup", "Lineup Time"])
        lineupTimes = [{}, {}]
        lastLineup, lastLineupTime = [None, None], [periodLength, periodLength]
        for r in table[1:]:
            player = r[6]
            i = teams.index(r[5])
            if player != "Team":
                if r[-2] == "Leaves Game":
                    if player in initPlayers[i]:
                        initPlayers[i].remove(player)
                else:
                    # Any other action by a player puts that player in the
                    # lineup.
                    initPlayers[i].add(player)
            lineup = "; ".join(sorted(initPlayers[i]))
            r.append(lineup)
            lineupTime = r[3]
            if lastLineup[i] is None:
                lastLineup[i] = lineup
            # Credit the time elapsed since this team's previous row to the
            # lineup that was recorded on that row.
            lineupTimes[i][lastLineup[i]] = (
                lineupTimes[i].setdefault(lastLineup[i], 0)
                + computeSeconds(lastLineupTime[i])
                - computeSeconds(lineupTime)
            )
            r.append(computeTime(lineupTimes[i].setdefault(lineup, 0)))
            lastLineup[i], lastLineupTime[i] = lineup, lineupTime

    addLineup(table)
    return table
def crawl(sport, year, division, org, game, url, neutral=False):
    """Download one game's tables and write the parsed, merged CSV files.

    The work is split into two stages, each guarded by a marker file inside
    the game directory ``<data>/<org>/<game with '/' replaced by '.'>``:

    * ``.done``   -- score, info, box score and play-by-play tables were
      fetched from the site and dumped as ``<name>.csv`` files.
    * ``.parsed`` -- the dumped files were parsed and merged into
      ``Box Score - All (Parsed).csv`` and ``Play by Play - All (Parsed).csv``.

    A ``.neutral`` marker is written when ``neutral`` is true.

    Args:
        sport, year, division: Substituted into the module-level ``data``
            path template; also used for logging.
        org: Sub-directory of ``data`` that holds this game's directory.
        game: Game label; ``/`` is replaced by ``.`` for the directory name.
        url: Game URL, resolved against the module-level ``domain``.
        neutral: When true, mark the game directory with ``.neutral``.

    Returns:
        None. Errors in either stage are printed and leave the stage's
        marker unset.
    """
    # NOTE(review): this overwrites the module-level template with the
    # formatted result, so once the placeholders are consumed a later call
    # with different sport/year/division keeps using the first call's path
    # unless ``data`` is reset elsewhere. Left as is because other code may
    # read the formatted global.
    global data
    data = data.format(sport, year, division)

    gamename = game.replace('/', '.')
    gamedir = os.path.join(data, org, gamename)

    def readFlag(flag):
        # Side effect: makes sure the game directory exists. makedirs also
        # creates a missing ``data/org`` parent instead of raising.
        if not os.path.exists(gamedir):
            os.makedirs(gamedir)
        return os.path.exists(os.path.join(gamedir, flag))

    def setFlag(flag):
        # An empty file is all that is needed as a marker.
        with open(os.path.join(gamedir, flag), 'w'):
            pass

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for every CSV of this game: filename.format("Score"), ...
    filename = os.path.join(gamedir, "{}.csv")

    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            gs = parseURL(gamelink)
            sleep(2)

            gamescore = None
            gameinfo = None
            periods = []
            teams = []
            nextPeriod = 0
            for table in gs.select("div.header_menu a"):
                href = table["href"]
                isBoxScore = href.startswith("/game/box_score")
                isPlayByPlay = href.startswith("/game/play_by_play")
                if href == "#" or not (isBoxScore or isPlayByPlay):
                    continue

                tablelink = urljoin(domain, href)
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))
                ts = parseURL(tablelink)

                # Score and info tables are taken from the first page only.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(gamescore, filename.format("Score"))
                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0])
                        + parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(gameinfo, filename.format("Info"))

                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if isBoxScore:
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        # Per-period box scores are named after the score
                        # table's column headers, in link order.
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif isPlayByPlay:
                    sfilename = filename.format("Play by Play - {}")
                    # The last score column is skipped; period i is read
                    # from table number 6 + 2 * i.
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(
                                ts.select("table:nth-of-type({})".format(6 + 2 * i))[0],
                                header=0
                            ),
                            sfilename.format(period)
                        )
                # Pause between page fetches.
                sleep(2)

            if gamescore is None and gameinfo is None:
                raise Exception("Not a game.")
            setFlag(".done")
            sleep(2)
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            gamescore = loadTable(filename.format("Score"))
            teams = [gamescore[1][0], gamescore[2][0]]

            sfilename = filename.format("Box Score - {}")
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )
                    # Keep the header row only for the first team.
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )
                    # Keep the header row only for the first period.
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()