Code Example #1
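# Assumed imports for this excerpt (not shown in the original snippet):
import os
from time import sleep
from urllib.parse import urljoin

from termcolor import colored

# parseURL, parseTable, transposeTable, dumpTable, loadTable, rawDumpTable,
# parseBoxScore, parsePlayByPlay, dumpURL, log, print2, domain, and data are
# helpers/globals defined elsewhere in crawl.py.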
def crawl(sport, year, division, org, game, url, neutral=False):
    # `data` starts as a module-level path template; filling in sport, year,
    # and division yields this crawl's data directory.
    global data
    data = data.format(sport, year, division)

    # The game name contains '/' (from the date), which is not filesystem-safe.
    gamename = game.replace('/', '.')

    def readFlag(flag):
        # Ensure the game's directory exists, then report whether the given
        # marker file (e.g. ".done") is already present in it.
        if not os.path.exists(os.path.join(data, org, gamename)):
            os.mkdir(os.path.join(data, org, gamename))

        return os.path.exists(os.path.join(data, org, gamename, flag))

    def setFlag(flag):
        # Touch an empty marker file to record progress.
        with open(os.path.join(data, org, gamename, flag), 'w'):
            pass

    # Mark neutral-site games with a flag file.
    if neutral and not readFlag(".neutral"):
        setFlag(".neutral")

    # Template for this game's CSV files, e.g. "Score.csv", "Info.csv".
    filename = os.path.join(data, org, gamename, "{}.csv")

    # Stage 1: download the raw tables, unless this game is already done.
    if not readFlag(".done"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))

            gs = parseURL(gamelink)

            sleep(2)  # rate-limit requests to the server

            gamescore = None
            gameinfo = None

            # Filled in from the line score on the first subpage.
            periods = []
            teams = []
            nextPeriod = 0  # next period index for per-period box scores
            # The header menu links to box-score and play-by-play subpages;
            # skip every other link.
            for table in gs.select("div.header_menu a"):
                if (
                        table["href"] == "#" or
                        not (
                            table["href"].startswith("/game/box_score") or
                            table["href"].startswith("/game/play_by_play")
                        )
                    ):
                    continue

                tablelink = urljoin(domain, table["href"])
                print2("{} \033[4m{}\033[0m".format(table.text.strip(), tablelink))

                ts = parseURL(tablelink)

                # The first table on every subpage is the line score; grab it once.
                if gamescore is None:
                    gamescore = parseTable(ts.select("table:nth-of-type(1)")[0])
                    dumpTable(
                        gamescore,
                        filename.format("Score")
                    )

                # Tables 3 and 4 hold the game metadata; transpose them into rows.
                if gameinfo is None:
                    gameinfo = transposeTable(
                        parseTable(ts.select("table:nth-of-type(3)")[0]) +
                        parseTable(ts.select("table:nth-of-type(4)")[0])
                    )
                    dumpTable(
                        gameinfo,
                        filename.format("Info")
                    )

                # Team names come from the first column of the line score;
                # period labels come from its header row.
                teams = [gamescore[1][0].text.strip(), gamescore[2][0].text.strip()]
                periods = [v.text.strip() for v in gamescore[0][1:]]

                if table["href"].startswith("/game/box_score"):
                    # "Box Score" is the full-game page; the other box-score
                    # links are per-period pages, taken in order.
                    if table.text.strip() == "Box Score":
                        sfilename = filename.format("Box Score - {}")
                    else:
                        sfilename = filename.format(periods[nextPeriod] + " - {}")
                        nextPeriod += 1

                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(5)")[0], header=1),
                        sfilename.format(teams[0])
                    )
                    dumpTable(
                        parseTable(ts.select("table:nth-of-type(6)")[0], header=1),
                        sfilename.format(teams[1])
                    )
                elif table["href"].startswith("/game/play_by_play"):
                    sfilename = filename.format("Play by Play - {}")

                    # One table per period; the last entry of `periods` (the
                    # total column) is skipped.
                    for (i, period) in enumerate(periods[:-1]):
                        dumpTable(
                            parseTable(ts.select("table:nth-of-type({})".format(6 + 2 * i))[0], header=0),
                            sfilename.format(period)
                        )

                sleep(2)

            if gamescore is None and gameinfo is None:
                raise Exception("Not a game.")

            setFlag(".done")

            sleep(2)
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()

    # Stage 2: parse the downloaded CSVs into combined, cleaned tables.
    if not readFlag(".parsed"):
        try:
            gamelink = urljoin(domain, url)
            log("{} {} {} {} {} {}".format(sport, year, division, org, game, dumpURL(gamelink)))
            print2("Parsing...")

            # Reload the saved line score; cells are now plain strings.
            gamescore = loadTable(filename.format("Score"))

            sfilename = filename.format("Box Score - {}")
            teams = [gamescore[1][0], gamescore[2][0]]
            with open(filename.format("Box Score - All (Parsed)"), "w") as af:
                for team in teams:
                    boxScore = parseBoxScore(
                        sfilename.format(team),
                        filename.format("Info"),
                        team,
                        "All"
                    )

                    # Keep the header row only for the first team.
                    rawDumpTable(boxScore[(0 if team == teams[0] else 1):], af)

            sfilename = filename.format("Play by Play - {}")
            periods = gamescore[0][1:]
            with open(filename.format("Play by Play - All (Parsed)"), "w") as af:
                # The last entry of `periods` is the total column; skip it.
                for period in periods[:-1]:
                    playByPlay = parsePlayByPlay(
                        sfilename.format(period),
                        period,
                        filename.format("Info")
                    )

                    # Keep the header row only for the first period.
                    rawDumpTable(playByPlay[(0 if period == periods[0] else 1):], af)

            setFlag(".parsed")
        except Exception as e:
            print2(colored("Error: ", "red"), e)
        finally:
            print2()
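
A minimal sketch of how crawl() might be invoked; every value below is a
hypothetical placeholder, not taken from the project:

crawl(
    "MBB",                 # sport code (assumed value)
    2015,                  # academic year
    1,                     # division
    "Example University",  # organization
    "02/01/2015 Example University vs. Other College",  # game label
    "/game/index/12345",   # game URL path (assumed format)
    neutral=False
)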
Code Example #2
File: crawl.py Project: chuanconggao/NCAA-Crawler
)  # the excerpt begins mid-statement; this closes a call whose opening lines are not shown
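# Assumed imports for this excerpt (not shown in the snippet):
# from redis import Redis
# from rq import Queue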

# A Redis-backed job queue holding per-game crawl jobs (the Queue/Redis pair
# matches the rq library's API).
gq = Queue(
    redisQueuePrefix + "game",
    connection=Redis(),
    default_timeout=redisQueueIndexTimeout
)
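
# Assuming this is rq's Queue, a per-game job would be enqueued with rq's
# standard enqueue() call, e.g. (hypothetical argument values):
#
#     gq.enqueue(crawl, sport, year, division, orgname, game, url)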

# Expand the module-level data-directory template for this sport/year/division.
data = data.format(sport, year, division)

# Fetch the full team list; conf_id=-1 requests every conference.
s = parseURL(
    urljoin(
        domain,
        "/team/inst_team_list?conf_id=-1"
    ),
    params={
        "sport_code": "MBB" if argv["Men"] else "WBB",  # men's vs. women's basketball
        "academic_year": year,
        "division": division
    }
)

sleep(2)  # rate-limit requests to the server

for org in s.select("table a"):
    orgname = org.text.strip()

    # If a single-organization filter was given, skip all others.
    if filterOrg is not None and filterOrg != orgname:
        continue

    try: