def crawl(sport, year, division, org, game, url, neutral=False):
    """Download one game's tables to CSV files, then build the parsed summaries.

    The work is split into two phases, each guarded by an empty marker file in
    the game's directory so that a finished phase is skipped on later runs:

    * ``.done``   - the game page and every linked box-score / play-by-play
      page were fetched and dumped to ``<name>.csv`` files.
    * ``.parsed`` - the dumped CSV files were combined into
      ``Box Score - All (Parsed).csv`` and ``Play by Play - All (Parsed).csv``.

    A ``.neutral`` marker is additionally written when ``neutral`` is true.

    Args:
        sport, year, division: Substituted into the module-level ``data``
            path template; also used in the log line.
        org: Sub-directory of the data directory that holds this game.
        game: Game identifier; ``/`` is replaced by ``.`` to form the
            directory name.
        url: Game URL, resolved against the module-level ``domain``.
        neutral: When true, record a ``.neutral`` marker for the game.

    Returns:
        None. Any exception raised inside a phase is printed through
        ``print2`` and not re-raised; the phase's marker is then not written.
    """
    global data
    # NOTE(review): this overwrites the module-level ``data`` template with its
    # formatted value. Presumably later calls with a different
    # sport/year/division then find no placeholders left to fill - confirm
    # that crawl() is only ever used for a single combination per process.
    data = data.format(sport, year, division)
    gamename = game.replace('/', '.')
    gamedir = os.path.join(data, org, gamename)

    def readFlag(flag):
        """Return whether marker ``flag`` exists, creating the game directory."""
        # makedirs(exist_ok=True) also creates a missing parent directory and
        # does not fail when the directory appears between check and creation.
        os.makedirs(gamedir, exist_ok=True)
        return os.path.exists(os.path.join(gamedir, flag))

    def setFlag(flag):
        """Create (or truncate) the empty marker file ``flag``."""
        with open(os.path.join(gamedir, flag), 'w'):
            pass

    def nthTable(soup, n):
        """Return the first element matching ``table:nth-of-type(n)``."""
        return soup.select("table:nth-of-type({})".format(n))[0]

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for every CSV written for this game; "{}" is the table name.
    filename = os.path.join(gamedir, "{}.csv")

    # ---- Phase 1: download the raw tables --------------------------------
    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            gs = parseURL(gamelink)
            sleep(2)

            gamescore = None
            gameinfo = None
            periods = []
            teams = []
            nextPeriod = 0  # index into ``periods`` for per-period box scores

            for table in gs.select("div.header_menu a"):
                href = table["href"]
                # Only box-score and play-by-play pages are of interest; this
                # also skips placeholder "#" links.
                if not href.startswith(("/game/box_score", "/game/play_by_play")):
                    continue

                tablelink = urljoin(domain, href)
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))
                ts = parseURL(tablelink)

                # The score and info tables are taken from the first page
                # fetched and written once.
                if gamescore is None:
                    gamescore = parseTable(nthTable(ts, 1))
                    dumpTable(gamescore, filename.format("Score"))
                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(nthTable(ts, 3)) + parseTable(nthTable(ts, 4))
                    )
                    dumpTable(gameinfo, filename.format("Info"))

                # Rows 1 and 2 of the score table start with the team names;
                # row 0 (after its first cell) holds the period labels.
                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if href.startswith("/game/box_score"):
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        # Any other box-score link is assigned to the next
                        # period label in order of appearance.
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1
                    dumpTable(parseTable(nthTable(ts, 5), header=1), sfilename.format(teams[0]))
                    dumpTable(parseTable(nthTable(ts, 6), header=1), sfilename.format(teams[1]))
                else:
                    # Play-by-play page: one table per period at positions
                    # 6, 8, 10, ...; the last score column is skipped
                    # (presumably a total column - TODO confirm).
                    sfilename = filename.format("Play by Play - {}")
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(nthTable(ts, 6 + 2 * i), header=0),
                            sfilename.format(period)
                        )
                # Pause between page fetches.
                sleep(2)

            if gamescore is None and gameinfo is None:
                raise Exception("Not a game.")
            setFlag(".done")
            sleep(2)
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    # ---- Phase 2: combine the dumped CSV files ---------------------------
    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")
            gamescore = loadTable(filename.format("Score"))
            infofile = filename.format("Info")

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(sfilename.format(team), infofile, team, "All")
                    # Keep the first row only for the first team so the
                    # combined file does not repeat it.
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(sfilename.format(period), period, infofile)
                    # Same first-row handling as for the box scores above.
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
if player != "Team": if r[-2] == "Leaves Game": if player in initPlayers[i]: initPlayers[i].remove(player) else: initPlayers[i].add(player) lineup = "; ".join(sorted(initPlayers[i])) r.append(lineup) lineupTime = r[3] if lastLineup[i] == None: lastLineup[i] = lineup lineupTimes[i][lastLineup[i]] = lineupTimes[i].setdefault(lastLineup[i], 0) + computeSeconds(lastLineupTime[i]) - computeSeconds(lineupTime) r.append(computeTime(lineupTimes[i].setdefault(lineup, 0))) lastLineup[i], lastLineupTime[i] = lineup, lineupTime addLineup(table) return table if __name__ == "__main__": rawDumpTable( parse(sys.argv[1], sys.argv[2], sys.argv[3]), sys.stdout )
try: box = loadTable(join(path, 'Box Score - All (Parsed).csv')) row = [r for r in box[1:] if r[3] == org][-1] isHome = row[3] == box[-1][3] home = "Home" if isHome else "Away" opponent = box[1 if isHome else -1][3] info = loadTable(join(path, 'Info.csv')) location, attendance = "", "" for (c, v) in zip(info[0], info[1]): if c == "Location:": location = v elif c == "Attendance:": attendance = v row = [gender, year, div] + row[:2] + [org, opponent, home] + row[6:] + [location, attendance] output.append(row) except: print("Exception: {}".format(path), file=sys.stderr) for (gender, genderDir) in ls(sys.argv[1]): for (year, yearDir) in ls(genderDir): for (div, divDir) in ls(yearDir): for (org, orgDir) in ls(divDir): for (game, gameDir) in ls(orgDir): extract(gender, year, div, org, game, gameDir) rawDumpTable(output, sys.stdout)