Example #1
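A crawler for a single game page: it downloads the score, game-info, box-score, and play-by-play tables to CSV, then merges the per-team and per-period files, using empty flag files (.neutral, .done, .parsed) to make each stage resumable.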
import os
from time import sleep
from urllib.parse import urljoin

from termcolor import colored

# domain, data (a path template), and the helpers used below (log, print2,
# dumpURL, parseURL, parseTable, transposeTable, dumpTable, loadTable,
# rawDumpTable, parseBoxScore, parsePlayByPlay) are defined elsewhere in the
# module.


def crawl(sport, year, division, org, game, url, neutral=False):
    # Resolve the output directory locally so each call formats the template
    # with its own sport/year/division instead of mutating the global.
    datadir = data.format(sport, year, division)

    # The game key contains '/', which cannot appear in a file name.
    gamename = game.replace('/', '.')

    def readFlag(flag):
        # Ensure the game's directory exists before checking for the flag.
        os.makedirs(os.path.join(datadir, org, gamename), exist_ok=True)

        return os.path.exists(os.path.join(datadir, org, gamename, flag))

    def setFlag(flag):
        # An empty file whose presence marks a stage as complete.
        with open(os.path.join(datadir, org, gamename, flag), 'w'):
            pass

    # Mark neutral-site games with a flag file.
    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for every CSV written for this game.
    filename = os.path.join(datadir, org, gamename, "{}.csv")

    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))

            gs = parseURL(gamelink)

            sleep(2)

            gamescore = None
            gameinfo = None

            periods = []
            teams = []
            nextPeriod = 0
            for table in gs.select("div.header_menu a"):
                # Follow only the box-score and play-by-play tabs.
                if (
                        table["href"] == "#" or
                        not (
                            table["href"].startswith("/game/box_score") or
                            table["href"].startswith("/game/play_by_play")
                        )
                    ):
                    continue

                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))

                ts = parseURL(tablelink)

                if gamescore is None:
                    # The first table on every tab is the score by period;
                    # dump it only once.
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(
                        gamescore,
                        filename.format("Score")
                    )

                if gameinfo is None:
                    # Tables 3 and 4 hold the game metadata; transpose them
                    # into key/value rows.
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0]) +
                        parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(
                        gameinfo,
                        filename.format("Info")
                    )

                # Team names sit in the first column of the score table; the
                # header row after its first cell lists the period labels.
                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1

                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    sfilename = filename.format("Play by Play - {}")

                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(ts.select("table:nth-of-type({})".format(6 + 2 * i))[0], header=0),
                            sfilename.format(period)
                        )

                sleep(2)

            # Neither tab produced tables: the page was not a game page.
            if gamescore is None and gameinfo is None:
                raise Exception("Not a game.")

            setFlag(".done")

            sleep(2)
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            # The loaded CSV rows are plain strings, not parsed HTML nodes.
            gamescore = loadTable(filename.format("Score"))

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )

                    # Keep the header row from the first team only.
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )

                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
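The two guarded stages rely on a simple flag-file pattern that is easy to reuse. Below is a minimal standalone sketch of it; read_flag, set_flag, cache_dir, and do_work are hypothetical names, not part of the example above.

import os

def read_flag(cache_dir, flag):
    # Creating the directory here mirrors readFlag's side effect above.
    os.makedirs(cache_dir, exist_ok=True)
    return os.path.exists(os.path.join(cache_dir, flag))

def set_flag(cache_dir, flag):
    # An empty file records that the stage completed.
    open(os.path.join(cache_dir, flag), "w").close()

def do_work():
    print("crawling...")  # stand-in for the guarded stage

cache_dir = os.path.join("data", "some_org", "some_game")
if not read_flag(cache_dir, ".done"):
    do_work()
    set_flag(cache_dir, ".done")

Because the flag is only written after the stage succeeds, an interrupted run simply redoes the unfinished stage on the next invocation.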
Example #2
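An excerpt from the schedule crawler that feeds the game crawler: it queues the team's roster page for crawlTeam, then walks the schedule table and queues each played game for crawlGame.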
        for link in cs.select("#contentarea > a"):
            # Queue the team's roster page; only the first match is needed.
            if link.text.strip() == "Roster":
                tq.enqueue(
                    crawlTeam,
                    sport, year, division,
                    org.text.strip(),
                    "Roster",
                    urljoin(domain, link["href"]),
                    1,
                    at_front=atFront
                )

                break

        for game in parseTable(cs.select("table table:nth-of-type(1)")[0], header=2):
            # Skip spacer rows and games without a result yet ("-").
            if len(game) == 1 or game[-1].text.strip() == "-":
                continue

            team = org.text.strip()
            date = game[0].text.strip()
            # A leading "@" marks an away game; an "@" later in the opponent
            # cell appears to mark a neutral site.
            neutral = game[1].text.strip().find("@") > 0
            url = urljoin(domain, game[2].select("a")[0]["href"])

            gq.enqueue(
                crawlGame,
                sport, year, division, team, date, url, neutral=neutral,
                at_front=atFront
            )
    except Exception as e:
        print2(colored("Error: ", "red"), e)
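The tq.enqueue(func, *args, at_front=...) calls match python-rq's Queue.enqueue signature, though the excerpt never shows how tq and gq are built. A minimal sketch assuming rq with a local Redis follows; the queue names, the crawler module path, and the sample arguments are guesses, not taken from the example.

from redis import Redis
from rq import Queue

from crawler import crawlGame  # hypothetical module for the job function

connection = Redis()  # assumes a Redis server on localhost:6379
tq = Queue("teams", connection=connection)
gq = Queue("games", connection=connection)

# at_front=True puts the job at the head of the queue instead of the tail;
# other keyword arguments (here, neutral) are forwarded to the job function.
gq.enqueue(
    crawlGame,
    "basketball", 2020, 1, "Some Team", "01/01/2020",
    "https://example.org/game/12345",
    neutral=False,
    at_front=True,
)

A separate worker process (e.g. `rq worker teams games`) would then pick the jobs up and run the crawl functions.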