def crawl(sport, year, division, org, game, url, neutral=False):
    global data
    data = data.format(sport, year, division)
    gamename = game.replace('/', '.')

    # Marker files inside the game directory record crawl progress, so a
    # re-run can skip work that already finished.
    def readFlag(flag):
        if not os.path.exists(os.path.join(data, org, gamename)):
            os.mkdir(os.path.join(data, org, gamename))
        return os.path.exists(os.path.join(data, org, gamename, flag))

    def setFlag(flag):
        with open(os.path.join(data, org, gamename, flag), 'w'):
            pass

    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    filename = os.path.join(data, org, gamename, "{}.csv")

    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(
                sport, year, division, org, game, dumpURL(gamelink)))
            gs = parseURL(gamelink)
            sleep(2)

            gamescore = None
            gameinfo = None
            periods = []
            teams = []
            nextPeriod = 0

            # Follow only the "Box Score" and "Play by Play" links in the
            # game's header menu.
            for table in gs.select("div.header_menu a"):
                if table["href"] == "#" or not (
                    table["href"].startswith("/game/box_score")
                    or table["href"].startswith("/game/play_by_play")
                ):
                    continue
                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))
                ts = parseURL(tablelink)

                # The score and info tables appear on every sub-page;
                # grab each of them only once.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(gamescore, filename.format("Score"))
                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0])
                        + parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(gameinfo, filename.format("Info"))

                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    # The link labeled "Box Score" is the full box score;
                    # the remaining box-score links are per-period.
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    # One play-by-play table per period; the last score
                    # column (the total) has no table, so it is skipped.
                    sfilename = filename.format("Play by Play - {}")
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(
                                ts.select("table:nth-of-type({})".format(6 + 2 * i))[0],
                                header=0
                            ),
                            sfilename.format(period)
                        )
                sleep(2)

            if gamescore is None and gameinfo is None:
                raise Exception("Not a game.")
            setFlag(".done")
            sleep(2)
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(
                sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            gamescore = loadTable(filename.format("Score"))

            # Merge the two per-team box scores into one parsed CSV,
            # keeping the header row only from the first team.
            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            # Likewise merge the per-period play-by-play tables.
            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
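# A hypothetical usage sketch (not part of the original script): because
# crawl() is idempotent via its ".done"/".parsed" marker files, it can be
# retried safely, e.g. as an rq job on the "game" queue created below. All
# argument values in this example are invented for illustration only.
#
#   gq.enqueue(crawl, "MBB", "2019", 1, "Duke",
#              "Duke vs. North Carolina (03.09.2019)",
#              "/game/index/4738417", neutral=True)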
)
gq = Queue(
    redisQueuePrefix + "game",
    connection=Redis(),
    default_timeout=redisQueueIndexTimeout
)

data = data.format(sport, year, division)

# Fetch the full team list for the selected sport, season, and division
# (conf_id=-1 appears to request all conferences).
s = parseURL(
    urljoin(domain, "/team/inst_team_list?conf_id=-1"),
    params={
        "sport_code": "MBB" if argv["Men"] else "WBB",
        "academic_year": year,
        "division": division
    }
)
sleep(2)

for org in s.select("table a"):
    orgname = org.text.strip()
    # Honor an optional single-organization filter.
    if filterOrg is not None and filterOrg != orgname:
        continue
    try: