Example 1
def parseWeek(year, week):
    """
    parses a specific week at http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1,
    which serves a semicolon-separated CSV of FanDuel player prices,
    and stores this info in the fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html.parser',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(
        week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
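
Examples like the one above lean on several repo helpers that are not shown here (makeLogger, closeLogger, open_or_follow_link, cleanKey, convertToNumber). As a rough sketch only, the two value-munging helpers could look like this, assuming the behaviour their call sites imply; the project's real implementations may differ:

import re

def convertToNumber(value):
    # Best-effort conversion of a scraped string to int or float.
    # Assumed behaviour: return the original value unchanged when it is not numeric.
    text = str(value).strip().replace(',', '')
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return float(text)
    except ValueError:
        return value

def cleanKey(key):
    # Normalize a scraped header cell into a safe MongoDB field name.
    # Assumed behaviour: lowercase, spaces to underscores, strip other punctuation.
    key = key.strip().lower().replace(' ', '_')
    return re.sub(r'[^0-9a-z_]', '', key)
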
Example 2
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count()/2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)
    
    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    
    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
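
Since run() builds a multiprocessing.Pool, the module should only start scraping from a guarded entry point so that worker processes importing the module do not spin up their own pools. A minimal invocation might look like this (the 0-second wait is just an illustrative value):

if __name__ == '__main__':
    run(0)
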
Example 3
def parseWeek(year, week):
    """
    parses a specific week at http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1,
    which serves a semicolon-separated CSV of FanDuel player prices,
    and stores this info in the fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)
    
    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
Example 4
def run(wait):
    """
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))

    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)

    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5, 1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(
            logger, browser, 'open',
            "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")

        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (
            player_tuple[0],
            player_tuple[1],
        ))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
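
makeLogger and closeLogger are project utilities for per-task file logging; closeLogger is called sometimes with the logger object and sometimes with the name it was created under, so presumably it accepts either. A hedged sketch of what a compatible pair could look like (not the repo's actual code):

import logging
import os

def makeLogger(name, log_dir):
    # Create a DEBUG-level logger writing to <log_dir>/<name>.log.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger(str(name))
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(os.path.join(log_dir, str(name) + '.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger

def closeLogger(name_or_logger):
    # Accept either the logger itself or the name it was created under.
    if isinstance(name_or_logger, logging.Logger):
        logger = name_or_logger
    else:
        logger = logging.getLogger(str(name_or_logger))
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)
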
Example 5
def parseTeam(team_url, team_name):
    """
    parses a team's page and returns a list of year urls
    there is some data on this page that would be useful to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('Starting %s', team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', team_url)
    table = browser.find(id='team_index').find('tbody')
    year_columns = table.find_all('th')

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug('Row %d of %d', index, len(year_columns))
        try:
            year_link = year_column.find('a')
            if year_link:
                year_url = 'http://www.pro-football-reference.com' + year_link[
                    'href']
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug('parseTeam time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)

    return year_url_tups
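
open_or_follow_link wraps RoboBrowser navigation with the project's retry/proxy handling and always returns a browser to keep using. A simplified stand-in that only does the navigation and error logging (an assumption about the helper's contract, not its real body):

def open_or_follow_link(logger, browser, action, target):
    # action is 'open' for a URL string or 'follow_link' for a parsed link element.
    try:
        if action == 'open':
            browser.open(target)
        else:
            browser.follow_link(target)
    except Exception:
        logger.exception('Failed to %s %s', action, target)
    return browser
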
Example 6
def parseTeam(team_url, team_name):
    """
    parses a team's page and returns a list of year urls
    there is some data on this page that would be useful to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("Starting %s", team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", team_url)
    table = browser.find(id="team_index").find("tbody")
    year_columns = table.find_all("th")

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug("Row %d of %d", index, len(year_columns))
        try:
            year_link = year_column.find("a")
            if year_link:
                year_url = "http://www.pro-football-reference.com" + year_link["href"]
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug("parseTeam time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)

    return year_url_tups
Example 7
def run(wait):
    """
    Collects every player listed on the A-Z index pages of pro-football-reference.com,
    then parses each player with a multiprocessing pool.
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
     
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    
    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5,1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")

        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count()/2.5))

    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],))


    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
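
The pool sizes above (int(get_proxy_count() / 2.5), and / 2 elsewhere) tie the number of worker processes to however many proxies the project has configured, and get_user_agent presumably returns a rotating user-agent string. Purely as placeholders for running these snippets without that infrastructure (both are assumptions, not the repo's helpers):

import random

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)',
]

def get_user_agent(logger):
    # Pick a user-agent string at random; the real helper likely has a larger list.
    ua = random.choice(_USER_AGENTS)
    logger.debug('Using user agent: %s', ua)
    return ua

def get_proxy_count():
    # With no proxy list available, return a constant so int(count / 2.5) >= 1.
    return 5
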
Example 8
def parsePlayer(playerUrl):
    """
    Starting point for parsing all pages for a given player.
    Collects all links for a player and calls the appropriate parser for each link.
    """

    startTime = datetime.now()

    playerId = re.search('.*?id=(.*)\?*', playerUrl).group(1)
    logger = makeLogger(playerId, r'./logs_nflPlayerStats/')

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)

    wait = random.uniform(2, 4)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)
    logger.debug('Opening %s', playerUrl)
    browser = open_or_follow_link(logger, browser, 'open', playerUrl)

    #gets the actual playerUrl, the original value gets redirected
    playerUrl_api = playerUrl
    playerUrl = browser.url

    try:
        #parsePlayer BIO
        playerBio = browser.find(class_="player-info")
        player_profile_id = parsePlayerBio(logger, playerBio, playerUrl,
                                           playerUrl_api)
        if not player_profile_id:
            logger.debug('New player profile not made, skipping rest of tabs')
            return

        #Gets the links for each category tab, i.e Profile, career stats, game logs ...
        tabNames = [
            tabName['href']
            for tabName in browser.find(id="player-profile-tabs").find_all('a')
        ]
        for tabName in tabNames:
            if tabName == 'profile':
                continue

            playerUrl = getPlayerTabUrl(playerUrl, tabName)

            wait = random.uniform(1.5, 3.5)
            logger.debug('Waiting %f', wait)
            time.sleep(wait)
            logger.debug('Opening %s', playerUrl)
            browser = open_or_follow_link(logger, browser, 'open', playerUrl)

            if tabName == 'careerstats':
                #parse careerstats
                careerStats = browser.find(id="player-stats-wrapper")
                careerStats = careerStats.find_all("table")
                parseCareerStats(logger, careerStats, player_profile_id)

            elif tabName == 'gamelogs':
                #Get the list of years
                gameLogYears = browser.find(id="criteria")
                gameLogYears = gameLogYears.find_all("option")
                yearsList = []
                for year in gameLogYears:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gameLogs since it's already loaded
                gameLogs = browser.find(id="player-stats-wrapper")
                gameLogs = gameLogs.find_all("table")
                parseGameLogs(logger, gameLogs, yearsList[0],
                              player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl,
                                                tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open',
                                                  playerUrl)
                    gameLogs = browser.find(id="player-stats-wrapper")
                    gameLogs = gameLogs.find_all("table")
                    parseGameLogs(logger, gameLogs, year, player_profile_id)

            elif tabName == 'gamesplits':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gamesplits since it's already loaded
                gameSplits = browser.find(id="player-stats-wrapper")
                parseSplits(logger, gameSplits, yearsList[0], 'game',
                            player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl,
                                                tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open',
                                                  playerUrl)
                    gameSplits = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, gameSplits, year, 'game',
                                player_profile_id)

            elif tabName == 'situationalstats':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of situational stats since it's already loaded
                situationalStats = browser.find(id="player-stats-wrapper")
                parseSplits(logger, situationalStats, yearsList[0],
                            'situational', player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl,
                                                tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open',
                                                  playerUrl)
                    situationalStats = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, situationalStats, year, 'situational',
                                player_profile_id)

            elif tabName == 'draft':
                draft = browser.find(id="player-stats-wrapper")
                parseDraft(logger, draft, player_profile_id)
            elif tabName == 'combine':
                combine = browser.find(id="player-stats-wrapper")
                parseCombine(logger, combine, player_profile_id)
    except:
        logger.exception('Failed parsing player')

    logger.debug('parsePlayer time elapsed: ' +
                 str(datetime.now() - startTime))

    closeLogger(playerId)
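
getPlayerTabUrl is not shown either; judging only from its call sites, it rewrites the player URL so it points at the requested tab (careerstats, gamelogs, gamesplits, ...). One plausible sketch, assuming nfl.com player URLs end in a tab segment; this is a guess at the layout, not the project's helper:

def getPlayerTabUrl(playerUrl, tabName):
    # Drop any query string and swap the final path segment for the tab name.
    base = playerUrl.split('?')[0].rstrip('/')
    parts = base.split('/')
    parts[-1] = tabName
    return '/'.join(parts)
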
Example 9
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season
    doesnt follow any links
    some years dont have any info, but still return a page.
    These are loged with Exception('No teams found %s' % url)
    All data is stored in team_stats
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }

        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5,3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
                
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s' % role.text)
            
            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")
            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text + '_' + season.text)
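
The header-flattening block above expands each top-row <th> by its colspan so that grouped columns get a prefixed key. A standalone illustration of the same idea with made-up column names:

# Hypothetical two-row header: the top row groups some columns via colspan.
top_row = [('', 1), ('Rushing', 2), ('Passing', 2)]   # (text, colspan)
bottom_row = ['Team', 'Att', 'Yds', 'Att', 'Yds']

top_keys = []
for text, colspan in top_row:
    top_keys.extend([text] * colspan)

keys = []
for group, column in zip(top_keys, bottom_row):
    # Prefix with the group label when one exists, mirroring parseSeason's key construction.
    keys.append(group + '_' + column if group else column)

print(keys)  # ['Team', 'Rushing_Att', 'Rushing_Yds', 'Passing_Att', 'Passing_Yds']
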
Example 10
def parseYear(team_name, year_url, year):
    """
    parses the games table for a specific team year on pro-football-reference.com
    (year_url points at a team's season page)
    stores one document per game, tagged with team_name and year,
    in nfl_data.team_stats_weekly
    """
    logger = makeLogger(
        cleanKey(team_name) + '_' + str(year), r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_team_stats_weekly = db['team_stats_weekly']

    #need to fix this to actually detect duplicate
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', year_url)
    table = browser.find(id='games')
    rows = table.find_all('tr')
    header = [
        cleanKey(each.attrs['data-stat']) for each in rows[0].find_all('th')
    ]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            week_number = convertToNumber(row.find('th').text)
            row_values = [
                convertToNumber(value.text) for value in row.find_all('td')
            ]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict['year'] = year
            row_dict['team_name'] = team_name
            row_dict['year_url'] = year_url

            if row_dict['game_date'].lower() == 'playoffs':
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug('team_stats_weekly.insert_many')

    col_team_stats_weekly.insert_many(row_dicts)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
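
One caveat on the final insert: pymongo's insert_many raises an error when handed an empty list, so a season where every row fails to parse would make the call above throw. A small guard (an addition, not in the original):

if row_dicts:
    col_team_stats_weekly.insert_many(row_dicts)
else:
    logger.debug('No rows parsed for %s %d, nothing to insert', team_name, year)
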
Example 11
def run(wait):
    """
    First collects a set of playerUrls to parse using parsePlayerNames.
    Then parses each player.
    Both tasks use multiprocessing.
    """

    logger = makeLogger('main', r'./logs_nflPlayerStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE'
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    statisticCategory = browser.find(id="statistic-category")
    statisticCategories = statisticCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for statisticCategory in statisticCategories:
        if statisticCategory.text == 'Category...':
            continue
        for season in seasons:
            if season.text == 'Season...':
                continue
            for seasonType in seasonTypes:
                if seasonType.text == 'Season Type...':
                    continue
                results.append(
                    pool.apply_async(parsePlayerNames, (
                        statisticCategory['value'],
                        season['value'],
                        seasonType['value'],
                    )))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    playerUrl_set = set()
    for result in results:
        try:
            result_set = result.get()
            if result_set:
                playerUrl_set = playerUrl_set.union(result_set)
        except:
            logger.exception('Error in parsePlayerNames worker')

    with open('../playerUrl_set.json', 'w') as playerUrl_json:
        playerUrl_json.write(json.dumps(list(playerUrl_set)))

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    logger.debug('Starting to parse %d players', len(playerUrl_set))
    for playerUrl in playerUrl_set:
        if col_player_profiles.find({'player_url': playerUrl}).count():
            logger.debug('Skipping ' + playerUrl)
            continue
        #parsePlayer(playerUrl)
        pool.apply_async(parsePlayer, (playerUrl, ))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 12
def parseYear(year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year))
    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text)
                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = convertToNumber(value)
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url['href']
                        col_failed_game_info.insert(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')

    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids
    
    logger.debug('mapping nfl_schedule.id to gameInfo_list')

    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(year)
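
These scripts use the older pymongo idioms cursor.count() and Collection.insert(), both of which were removed in pymongo 4.x. Against a modern driver the duplicate check and the failed-row insert would look roughly like this (an adaptation, not part of the original):

# pymongo >= 3.7 equivalents of the calls used above
if col_schedule.count_documents({'year': year}, limit=1):
    logger.debug('Already parsed %s', year)
    return None

col_failed_game_info.insert_one(failed_dict)
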
Example 13
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger('main', r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))

    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://www.pro-football-reference.com/teams/")
    table_body = browser.find(id='teams_active').find('tbody')
    rows = table_body.find_all('tr')

    team_url_tups = []

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            team_link = row.find('th').find('a')
            if team_link:
                team_url = 'http://www.pro-football-reference.com' + team_link[
                    'href']
                team_name = team_link.text
                team_url_tups.append((team_url, team_name))
        except:
            logger.exception(row)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    for team_url, team_name in team_url_tups:
        #print parseTeam(team_url, team_name)
        results.append(pool.apply_async(parseTeam, (
            team_url,
            team_name,
        )))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    year_url_tups = []
    for result in results:
        year_url_tup = result.get()
        if year_url_tup:
            year_url_tups += (year_url_tup)

    logger.debug('Done gathering %d year urls', len(year_url_tups))

    pool = Pool(processes=int(get_proxy_count() / 2))

    logger.debug('Shuffling year_urls')
    random.shuffle(year_url_tups)
    logger.debug('Starting to parse year_urls')
    for team_name, year_url, year in year_url_tups:
        #parseYear(team_name, year_url, year)
        pool.apply_async(parseYear, (
            team_name,
            year_url,
            year,
        ))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 14
def parsePlayer(player_name, player_url):
    """
    Parses a pro-football-reference player's bio and game log tables and stores them
    in pfr_player_bio, pfr_player_game_stats, and pfr_player_career_stats.
    """
    player_url = "http://www.pro-football-reference.com" + player_url

    logger = makeLogger(player_name, r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_pfr_player_bio = db['pfr_player_bio']
    col_pfr_player_game_stats = db['pfr_player_game_stats']
    col_pfr_player_career_stats = db['pfr_player_career_stats']

    if col_pfr_player_bio.find({'player_url': player_url}).count():
        logger.debug('Player already parsed ' + player_url)
        return

    wait = random.uniform(1.5,3)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)
    
    logger.debug('Opening player page %s', player_url)
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', player_url + '/gamelog')

    logger.debug('Parsing player meta')
    meta_div = browser.find(id="meta")
    meta_items = None
    for div in meta_div.find_all('div'):
        try:
            if div['itemtype'] == 'http://schema.org/Person':
                meta_items = div.find_all('p')
        except KeyError:
            pass

    player_bio = {
        'player_url': player_url,
        'player_name': player_name
    }
    for meta_item in meta_items:
        physical_stat_row = False
        item_spans = meta_item.find_all('span')
        for item_span in item_spans:
            try:
                if item_span['itemprop'] == 'height':
                    physical_stat_row = True
                    player_bio['height'] = item_span.text
                elif item_span['itemprop'] == 'weight':
                    physical_stat_row = True
                    player_bio['weight'] = item_span.text
            except KeyError:
                pass

        if physical_stat_row:
            continue

        key_values = re.findall('([^:]+):([^:]+)(?: |$)', meta_item.text)
        for key, value in key_values:
            player_bio[cleanKey(key.replace(u'\xa0', u' '))] = value.strip().replace(u'\xa0', u' ')

    try:
        logger.debug('Creating player bio')
        player_bio_id = col_pfr_player_bio.insert(player_bio)
    except:
        logger.exception('insert error')
        return

    try:
        regular_season_div = browser.find(id='all_stats')
        regular_season_table = regular_season_div.find(class_="table_outer_container").find(id="div_stats")
    except AttributeError:
        logger.debug('No game logs, exiting player')
        return

    career_total_dicts = []
    try:
        game_stat_dicts, career_total_dict = parseTable(logger, player_bio_id, regular_season_table, 'regular season')
        career_total_dicts.append(career_total_dict)
    except:
        logger.exception('parseTable error. Deleting user bio and exiting')
        col_pfr_player_bio.remove({'player_url': player_url})
        return

    

    playoff_table = browser.find(id="stats_playoffs")
    if not playoff_table:
        logger.debug('No playoff game logs')
    else:
        try:
            temp_game_dicts, career_total_dict = parseTable(logger, player_bio_id, playoff_table, 'playoffs')
            game_stat_dicts += temp_game_dicts
            career_total_dicts.append(career_total_dict)
        except:
            logger.exception('parseTable error. Deleting user bio and exiting')
            col_pfr_player_bio.remove({'player_url': player_url})
            return
    
    try:
        logger.debug('Bulk Creating game_stat_dicts')
        if game_stat_dicts:
            col_pfr_player_game_stats.insert_many(game_stat_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    try:
        logger.debug('Bulk Creating career_total_dicts')
        if career_total_dicts:
            col_pfr_player_career_stats.insert_many(career_total_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
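
The bio-parsing regex above splits lines like 'Position: QB' into key/value pairs before the non-breaking-space cleanup. A quick illustration on a hypothetical bio line:

import re

line = 'Position: QB Throws: Right'  # hypothetical bio line
pairs = re.findall('([^:]+):([^:]+)(?: |$)', line)
cleaned = dict((k.strip(), v.strip()) for k, v in pairs)
print(cleaned)  # {'Position': 'QB', 'Throws': 'Right'}
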
Example 15
def parsePlayerNames(statisticCategory, season, seasonType):
    """
    Collects a set of player names from the player stats by category tab.
    This parses just one of the options given in the form. Used with pool.apply_async.
    """
    
    startTime = datetime.now()
    
    logName = statisticCategory + '_' + season + '_' + seasonType
    logger = makeLogger(logName, r'./logs_nflPlayerStats/')

    logger.debug('Starting parsePlayerNames')

    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)

    playerUrl_set = set()
    loadNextPage = True
    pageNumber = 1
    while(loadNextPage):
        logger.debug('Page %d', pageNumber)
        url = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=' + statisticCategory + '&qualified=true&season=' + season + '&seasonType=' + seasonType + '&d-447263-p=' + str(pageNumber)
        browser = open_or_follow_link(logger, browser, 'open', url)
        pageNumber += 1
        linkNavigation = browser.find(class_='linkNavigation')
        if not linkNavigation or pageNumber > len(linkNavigation.find_all('a')):
            loadNextPage = False

        result = browser.find(id="result")

        tries = 0
        # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
        while not result:
            if tries > 3:
                raise Exception('No teams found %s' % url)
            elif tries > 0:
                time.sleep(random.uniform(5, 7))
            tries += 1
            logger.debug('No result-tries: %d', tries)
            browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

        tbodies = result.find_all("tbody")
        if len(tbodies) != 2:
            raise Exception("error parsing result")
        
        tableKey = tbodies[0]
        tableKey = tableKey.find_all("th")

        tableItems = tbodies[1]
        tableItems = tableItems.find_all("td")

        tableColumn = 0
        teamStatDict = {}
        for tableIndex, tableItem in enumerate(tableItems):
            try:
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    playerUrl_set.add('http://www.nfl.com' + tableItem.find('a')['href'])

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %d', tableIndex, len(tableItems))

    logger.debug('parsePlayerNames time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logName)
    
    return playerUrl_set
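
The "result not loaded yet" retry loop appears in both parsePlayerNames and parseSeason. If the scrapers were refactored, it could be pulled into a single helper along these lines (a sketch built on the RoboBrowser setup and the open_or_follow_link and get_user_agent helpers above, not existing code):

import random
import time

def open_until_result(logger, url, element_id, max_tries):
    # Re-open url with a fresh RoboBrowser until the element is present,
    # sleeping between attempts; raise if it never appears.
    for attempt in range(max_tries):
        if attempt:
            time.sleep(random.uniform(5, 7))
            logger.debug('No result-tries: %d', attempt)
        browser = RoboBrowser(history=False, parser='html5lib',
                              user_agent=get_user_agent(logger), timeout=10)
        browser = open_or_follow_link(logger, browser, 'open', url)
        result = browser.find(id=element_id)
        if result:
            return browser, result
    raise Exception('No result found %s' % url)
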
Example 16
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(
                        removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (
                    role,
                    category,
                    season,
                    seasonTypes,
                ))

    pool.close(
    )  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join(
    )  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 17
def parsePlayerNames(statisticCategory, season, seasonType):
    """
    Collects a set of player names from the player stats by category tab.
    This parses just one of the options given in the form. Used with pool.apply_async.
    """

    startTime = datetime.now()

    logName = statisticCategory + '_' + season + '_' + seasonType
    logger = makeLogger(logName, r'./logs_nflPlayerStats/')

    logger.debug('Starting parsePlayerNames')

    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)

    playerUrl_set = set()
    loadNextPage = True
    pageNumber = 1
    while (loadNextPage):
        logger.debug('Page %d', pageNumber)
        url = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=' + statisticCategory + '&qualified=true&season=' + season + '&seasonType=' + seasonType + '&d-447263-p=' + str(
            pageNumber)
        browser = open_or_follow_link(logger, browser, 'open', url)
        pageNumber += 1
        linkNavigation = browser.find(class_='linkNavigation')
        if not linkNavigation or pageNumber > len(
                linkNavigation.find_all('a')):
            loadNextPage = False

        result = browser.find(id="result")

        tries = 0
        # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
        while not result:
            if tries > 3:
                raise Exception('No teams found %s' % url)
            elif tries > 0:
                time.sleep(random.uniform(5, 7))
            tries += 1
            logger.debug('No result-tries: %d', tries)
            browser = RoboBrowser(history=False,
                                  parser='html5lib',
                                  user_agent=get_user_agent(logger),
                                  timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

        tbodies = result.find_all("tbody")
        if len(tbodies) != 2:
            raise Exception("error parsing result")

        tableKey = tbodies[0]
        tableKey = tableKey.find_all("th")

        tableItems = tbodies[1]
        tableItems = tableItems.find_all("td")

        tableColumn = 0
        teamStatDict = {}
        for tableIndex, tableItem in enumerate(tableItems):
            try:
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    playerUrl_set.add('http://www.nfl.com' +
                                      tableItem.find('a')['href'])

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %d', tableIndex,
                                 len(tableItems))

    logger.debug('parsePlayerNames time elapsed: ' +
                 str(datetime.now() - startTime))

    closeLogger(logName)

    return playerUrl_set
Example 18
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger("main", r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("start time: " + str(startTime))

    logger.debug("waiting %d seconds", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", "http://www.pro-football-reference.com/teams/")
    table_body = browser.find(id="teams_active").find("tbody")
    rows = table_body.find_all("tr")

    team_url_tups = []

    for index, row in enumerate(rows):
        logger.debug("Row %d of %d", index, len(rows))
        try:
            team_link = row.find("th").find("a")
            if team_link:
                team_url = "http://www.pro-football-reference.com" + team_link["href"]
                team_name = team_link.text
                team_url_tups.append((team_url, team_name))
        except:
            logger.exception(row)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    for team_url, team_name in team_url_tups:
        # print parseTeam(team_url, team_name)
        results.append(pool.apply_async(parseTeam, (team_url, team_name)))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    year_url_tups = []
    for result in results:
        year_url_tup = result.get()
        if year_url_tup:
            year_url_tups += year_url_tup

    logger.debug("Done gathering %d year urls", len(year_url_tups))

    pool = Pool(processes=int(get_proxy_count() / 2))

    logger.debug("Shuffling year_urls")
    random.shuffle(year_url_tups)
    logger.debug("Starting to parse year_urls")
    for team_name, year_url, year in year_url_tups:
        # parseYear(team_name, year_url, year)
        pool.apply_async(parseYear, (team_name, year_url, year))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug("run time: " + str(datetime.now() - startTime))

    closeLogger("main")
Example 19
def run(wait):
    """
    First collects a set of playerUrls to parse using parsePlayerNames.
    Then parses each player.
    Both tasks use multiprocessing.
    """

    logger = makeLogger('main', r'./logs_nflPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)
    
    pool = Pool(processes=int(get_proxy_count()/2.5))
    results = []

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE'
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    statisticCategory = browser.find(id="statistic-category")
    statisticCategories = statisticCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for statisticCategory in statisticCategories:
        if statisticCategory.text == 'Category...':
            continue
        for season in seasons:
            if season.text == 'Season...':
                continue
            for seasonType in seasonTypes:
                if seasonType.text == 'Season Type...':
                    continue
                results.append(pool.apply_async(parsePlayerNames, (statisticCategory['value'], season['value'], seasonType['value'],)))
    
    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    playerUrl_set = set()
    for result in results:
        try:
            result_set = result.get()
            if result_set:
                playerUrl_set = playerUrl_set.union(result_set)
        except:
            logger.exception('Error in parsePlayerNames worker')

    with open('../playerUrl_set.json', 'w') as playerUrl_json:
        playerUrl_json.write(json.dumps(list(playerUrl_set)))

    pool = Pool(processes=int(get_proxy_count()/2.5))

    logger.debug('Starting to parse %d players', len(playerUrl_set))
    for playerUrl in playerUrl_set:
        if col_player_profiles.find({'player_url': playerUrl}).count():
            logger.debug('Skipping ' + playerUrl)
            continue
        #parsePlayer(playerUrl)
        pool.apply_async(parsePlayer, (playerUrl,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

        
    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
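
# parsePlayerNames() is defined elsewhere in this project; run() above only
# relies on it returning a set of player profile URLs for one
# category/season/seasonType combination. A simplified, hypothetical sketch that
# reuses the same shared helpers (makeLogger, get_user_agent,
# open_or_follow_link, closeLogger) and module imports as the surrounding
# examples; the URL layout and the '/player/' filter are assumptions:
def parsePlayerNames(statisticCategory, season, seasonType):
    logger = makeLogger(statisticCategory + '_' + season + '_' + seasonType, r'./logs_nflPlayerStats/')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    url = ('http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=' + statisticCategory +
           '&qualified=true&season=' + season + '&seasonType=' + seasonType)
    browser = open_or_follow_link(logger, browser, 'open', url)
    playerUrls = set()
    result = browser.find(id='result')
    if result:
        for link in result.find_all('a'):
            href = link.get('href', '')
            if '/player/' in href:
                playerUrls.add('http://www.nfl.com' + href)
    closeLogger(logger)
    return playerUrls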
Esempio n. 20
0
def parsePlayer(playerUrl):
    """
    Starting point to parsing all pages for a given player.
    Collects all links for a player and calls the approiate parsers for each link.
    """

    startTime = datetime.now()

    playerId = re.search(r'.*?id=(.*)\?*', playerUrl).group(1)
    logger = makeLogger(playerId, r'./logs_nflPlayerStats/')

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    
    wait = random.uniform(2,4)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)
    logger.debug('Opening %s', playerUrl)
    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
    
    #gets the actual playerUrl; the original value gets redirected
    playerUrl_api = playerUrl
    playerUrl = browser.url

    try:
        #parsePlayer BIO
        playerBio = browser.find(class_="player-info")
        player_profile_id = parsePlayerBio(logger, playerBio, playerUrl, playerUrl_api)
        if not player_profile_id:
            logger.debug('New player profile not made, skipping rest of tabs')
            return

        #Gets the links for each category tab, e.g. Profile, Career Stats, Game Logs ...
        tabNames = [tabName['href'] for tabName in browser.find(id="player-profile-tabs").find_all('a')]
        for tabName in tabNames:
            if tabName == 'profile':
                continue

            playerUrl = getPlayerTabUrl(playerUrl, tabName)

            wait = random.uniform(1.5,3.5)
            logger.debug('Waiting %f', wait)
            time.sleep(wait)
            logger.debug('Opening %s', playerUrl)
            browser = open_or_follow_link(logger, browser, 'open', playerUrl)


            if tabName == 'careerstats':
                #parse careerstats
                careerStats = browser.find(id="player-stats-wrapper")
                careerStats = careerStats.find_all("table")
                parseCareerStats(logger, careerStats, player_profile_id)

            elif tabName == 'gamelogs':
                #Get the list of years
                gameLogYears = browser.find(id="criteria")
                gameLogYears = gameLogYears.find_all("option")
                yearsList = []
                for year in gameLogYears:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gameLogs since it's already loaded
                gameLogs = browser.find(id="player-stats-wrapper")
                gameLogs = gameLogs.find_all("table")
                parseGameLogs(logger, gameLogs, yearsList[0], player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5,3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    gameLogs = browser.find(id="player-stats-wrapper")
                    gameLogs = gameLogs.find_all("table")
                    parseGameLogs(logger, gameLogs, year, player_profile_id)

            elif tabName == 'gamesplits':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gamesplits since it's already loaded
                gameSplits = browser.find(id="player-stats-wrapper")
                parseSplits(logger, gameSplits, yearsList[0], 'game', player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5,3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    gameSplits = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, gameSplits, year, 'game', player_profile_id)

            elif tabName == 'situationalstats':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)
                
                #parse the first year of situational stats since it's already loaded
                situationalStats = browser.find(id="player-stats-wrapper")
                parseSplits(logger, situationalStats, yearsList[0], 'situational', player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5,3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    situationalStats = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, situationalStats, year, 'situational', player_profile_id)

            elif tabName == 'draft':
                draft = browser.find(id="player-stats-wrapper")
                parseDraft(logger, draft, player_profile_id)
            elif tabName == 'combine':
                combine = browser.find(id="player-stats-wrapper")
                parseCombine(logger, combine, player_profile_id)
    except:
        logger.exception('Failed parsing player')

    logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(playerId)
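
# getPlayerTabUrl() is another helper assumed by parsePlayer() above. Based on
# how it is called, it swaps the last path segment of the current profile URL
# for the requested tab name. A hypothetical sketch:
def getPlayerTabUrl(playerUrl, tabName):
    playerUrl = playerUrl.split('?')[0]   # drop any '?season=' suffix left over from a previous tab
    base = playerUrl.rsplit('/', 1)[0]    # strip the current tab segment (e.g. 'profile')
    return base + '/' + tabName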
Esempio n. 21
0
def parseYear(year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://www.pro-football-reference.com/years/{}/games.htm".format(
            year))
    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(
                        columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(
                        columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(
                        columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(
                        columns[8].text)
                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = convertToNumber(
                                    value)
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url['href']
                        col_failed_game_info.insert(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')

    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids

    logger.debug('mapping nfl_schedule.id to gameInfo_list')

    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(year)
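
# convertToNumber() and cleanKey() are shared helpers used by almost every
# parser in these examples. Hypothetical sketches of what they might do:
import re

def convertToNumber(value):
    """Return an int or float when the scraped text parses as one, else the original text."""
    try:
        return int(value)
    except (ValueError, TypeError):
        try:
            return float(value)
        except (ValueError, TypeError):
            return value

def cleanKey(key):
    """Normalize a scraped column header into a MongoDB-friendly field name."""
    return re.sub(r'[^0-9a-z_]', '', key.strip().lower().replace(' ', '_'))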
Esempio n. 22
0
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detial links, which is where must of the data is scraped.
    Scrapes weather, and stadium enough per week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detialsLink = 'http://nflweather.com' + details.find('a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detialsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace('  ', ' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace('  ', ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())
                
                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index-1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                #find the nfl_schedule doc, update its gameTime, then put its _id into both info dicts and append to the lists
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_scedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
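
# Unlike the pro-football-reference parsers above, this parseWeek() relies on
# module-level collection handles (col_schedule, col_weather_info,
# col_stadium_info). A minimal setup sketch, assuming the same local MongoDB
# instance used elsewhere in these examples:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['nfl_data']
col_schedule = db['schedule']           # written by the schedule parser, queried here
col_weather_info = db['weather_info']   # per-game weather documents
col_stadium_info = db['stadium_info']   # per-game stadium documents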
Esempio n. 23
0
def parsePlayer(player_name, player_url):
    player_url = "http://www.pro-football-reference.com" + player_url

    logger = makeLogger(player_name, r'./logs_pfrPlayerStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_pfr_player_bio = db['pfr_player_bio']
    col_pfr_player_game_stats = db['pfr_player_game_stats']
    col_pfr_player_career_stats = db['pfr_player_career_stats']

    if col_pfr_player_bio.find({'player_url': player_url}).count():
        logger.debug('Player already parsed ' + player_url)
        return

    wait = random.uniform(1.5, 3)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening player page %s', player_url)
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(logger, browser, 'open',
                                  player_url + '/gamelog')

    logger.debug('Parsing player meta')
    meta_div = browser.find(id="meta")
    meta_items = None
    for div in meta_div.find_all('div'):
        try:
            if div['itemtype'] == 'http://schema.org/Person':
                meta_items = div.find_all('p')
        except KeyError:
            pass

    player_bio = {'player_url': player_url, 'player_name': player_name}
    for meta_item in meta_items:
        physical_stat_row = False
        item_spans = meta_item.find_all('span')
        for item_span in item_spans:
            try:
                if item_span['itemprop'] == 'height':
                    physical_stat_row = True
                    player_bio['height'] = item_span.text
                elif item_span['itemprop'] == 'weight':
                    physical_stat_row = True
                    player_bio['weight'] = item_span.text
            except KeyError:
                pass

        if physical_stat_row:
            continue

        key_values = re.findall('([^:]+):([^:]+)(?: |$)', meta_item.text)
        for key, value in key_values:
            player_bio[cleanKey(key.replace(u'\xa0',
                                            u' '))] = value.strip().replace(
                                                u'\xa0', u' ')

    try:
        logger.debug('Creating player bio')
        player_bio_id = col_pfr_player_bio.insert(player_bio)
    except:
        logger.exception('insert error')
        return

    try:
        regular_season_div = browser.find(id='all_stats')
        regular_season_table = regular_season_div.find(
            class_="table_outer_container").find(id="div_stats")
    except AttributeError:
        logger.debug('No game logs, exiting player')
        return

    career_total_dicts = []
    try:
        game_stat_dicts, career_total_dict = parseTable(
            logger, player_bio_id, regular_season_table, 'regular season')
        career_total_dicts.append(career_total_dict)
    except:
        logger.exception('parseTable error. Deleting user bio and exiting')
        col_pfr_player_bio.remove({'player_url': player_url})
        return

    playoff_table = browser.find(id="stats_playoffs")
    if not playoff_table:
        logger.debug('No playoff game logs')
    else:
        try:
            temp_game_dicts, career_total_dict = parseTable(
                logger, player_bio_id, playoff_table, 'playoffs')
            game_stat_dicts += temp_game_dicts
            career_total_dicts.append(career_total_dict)
        except:
            logger.exception('parseTable error. Deleting user bio and exiting')
            col_pfr_player_bio.remove({'player_url': player_url})
            return

    try:
        logger.debug('Bulk Creating game_stat_dicts')
        if game_stat_dicts:
            col_pfr_player_game_stats.insert_many(game_stat_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    try:
        logger.debug('Bulk Creating career_total_dicts')
        if career_total_dicts:
            col_pfr_player_career_stats.insert_many(career_total_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parsePlayer time elapsed: ' +
                 str(datetime.now() - startTime))

    closeLogger(logger)
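
# parseTable() is assumed to turn one pro-football-reference game-log table
# into per-game dicts plus a career-totals dict, each tagged with the player's
# bio _id and the stat category ('regular season' or 'playoffs'). A
# hypothetical sketch, assuming the usual thead/tbody/tfoot layout and the
# data-stat attributes pro-football-reference puts on header cells, and the
# cleanKey/convertToNumber helpers sketched earlier:
def parseTable(logger, player_bio_id, table, stat_category):
    header = [cleanKey(th.attrs.get('data-stat', th.text)) for th in table.find('thead').find_all('th')]
    game_dicts = []
    career_total_dict = {}
    for row in table.find('tbody').find_all('tr'):
        values = [convertToNumber(cell.text) for cell in row.find_all(['th', 'td'])]
        if not values:
            continue
        row_dict = dict(zip(header, values))
        row_dict['player_bio_id'] = player_bio_id
        row_dict['stat_category'] = stat_category
        game_dicts.append(row_dict)
    tfoot = table.find('tfoot')
    if tfoot:
        totals = [convertToNumber(cell.text) for cell in tfoot.find('tr').find_all(['th', 'td'])]
        career_total_dict = dict(zip(header, totals))
        career_total_dict['player_bio_id'] = player_bio_id
        career_total_dict['stat_category'] = stat_category
    return game_dicts, career_total_dict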
Esempio n. 24
0
def parseYear(team_name, year_url, year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(cleanKey(team_name) + "_" + str(year), r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("Starting %d", year)

    client = MongoClient("localhost", 27017)
    db = client["nfl_data"]
    col_team_stats_weekly = db["team_stats_weekly"]

    # need to fix this to actually detect duplicate
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", year_url)
    table = browser.find(id="games")
    rows = table.find_all("tr")
    header = [cleanKey(each.attrs["data-stat"]) for each in rows[0].find_all("th")]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug("Row %d of %d", index, len(rows))
        try:
            week_number = convertToNumber(row.find("th").text)
            row_values = [convertToNumber(value.text) for value in row.find_all("td")]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict["year"] = year
            row_dict["team_name"] = team_name
            row_dict["year_url"] = year_url

            if row_dict["game_date"].lower() == "playoffs":
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug("team_stats_weekly.inert_many")

    col_team_stats_weekly.insert_many(row_dicts)

    logger.debug("parseYear time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)
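
# The duplicate check above is commented out because a year-only query would
# also skip other teams' seasons. One possible fix is to key the check on both
# team_name and year; a hypothetical helper, using the same older pymongo
# count() API the rest of these examples rely on:
def already_parsed(col_team_stats_weekly, team_name, year):
    """Return True when this team's season is already stored in team_stats_weekly."""
    return bool(col_team_stats_weekly.find({'team_name': team_name, 'year': year}).count())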
Esempio n. 25
0
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season
    doesnt follow any links
    some years dont have any info, but still return a page.
    These are loged with Exception('No teams found %s' % url)
    All data is stored in team_stats
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text,
                        r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {
            'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }

        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5, 3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role[
            'value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category[
                    'value'] + '&defensiveStatisticCategory=null'

            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category[
                    'value']
            else:
                raise Exception('Unsupported role: %s' % role.text)

            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType[
                'value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False,
                                  parser='html5lib',
                                  user_agent=get_user_agent(logger),
                                  timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False,
                                      parser='html5lib',
                                      user_agent=get_user_agent(logger),
                                      timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")
            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[
                        tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' +
                 str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text + '_' + season.text)
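
# removeNewLine() is another shared helper assumed above; the scraped nfl.com
# cells come back with embedded newlines and padding. A hypothetical sketch:
def removeNewLine(text):
    """Collapse newlines and surrounding whitespace in a scraped cell value."""
    return ' '.join(text.replace('\n', ' ').split())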
Esempio n. 26
0
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detial links, which is where must of the data is scraped.
    Scrapes weather, and stadium enough per week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detialsLink = 'http://nflweather.com' + details.find(
                    'a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open',
                                              detialsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(
                    ':', 1)[1].strip()
                awayTeam = browser.find_all(
                    class_='g-away')[1].find('a').text.replace('  ',
                                                               ' ').strip()
                homeTeam = browser.find_all(
                    class_='g-home')[1].find('a').text.replace('  ',
                                                               ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(
                            split[0].strip())] = convertToNumber(
                                split[1].strip())

                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[
                                index - 1].text.strip()
                        stadiumInfo[cleanKey(
                            split[0].strip())] = convertToNumber(
                                split[1].strip())

                #find the nfl_schedule doc, update its gameTime, then put its _id into both info dicts and append to the lists
                schedule_query = {
                    'year': year,
                    'week': week,
                    'homeTeam': homeTeam,
                    'awayTeam': awayTeam
                }
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(
                        weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_scedule doc not found " + error_docs)
                result = col_schedule.update_one(
                    schedule_query, {'$set': {
                        'dateTime': gameTime
                    }})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
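
# Every example above creates and tears down a per-worker logger through
# makeLogger()/closeLogger(). Hypothetical sketches, assuming one log file per
# worker name under the given directory; note that some callers pass the logger
# object itself to closeLogger while others pass the name it was created with:
import logging
import os

def makeLogger(name, log_dir):
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger(str(name))
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(os.path.join(log_dir, str(name) + '.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger

def closeLogger(name_or_logger):
    logger = name_or_logger if isinstance(name_or_logger, logging.Logger) else logging.getLogger(str(name_or_logger))
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)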