Example 1
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger('main', r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    minyear = 1960
    maxyear = 2015

    pool = Pool(processes=int(get_proxy_count() / 2))

    for i in range(maxyear - minyear + 1):
        year = minyear + i
        #parseYear(year)
        pool.apply_async(parseYear, (year, ))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
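
All of these examples share the same fan-out pattern: size a multiprocessing.Pool, submit one task per unit of work with apply_async, then close() and join(). Below is a minimal, self-contained sketch of that pattern using only the standard library; scrape_year and the fixed pool size are hypothetical stand-ins for the project's parseYear and the get_proxy_count()-based sizing.

from datetime import datetime
from multiprocessing import Pool


def scrape_year(year):
    # Hypothetical stand-in for the project's parseYear(year).
    return year


if __name__ == '__main__':  # guard needed where workers are spawned (e.g. Windows)
    startTime = datetime.now()
    pool = Pool(processes=4)  # stand-in for int(get_proxy_count() / 2)

    for year in range(1960, 2016):
        pool.apply_async(scrape_year, (year,))

    pool.close()  # no more tasks may be submitted
    pool.join()   # wait for all workers to finish; close() must be called first

    print('run time: ' + str(datetime.now() - startTime))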
Example 2
def run(wait=0):
    """Starts the scrapping proccess.
    creates a process per week per year given in pages
    """

    logger = makeLogger('main', r'./logs_RotoFDStats/')
    
    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    logger.debug('starting')
    pool = Pool(processes=int(get_proxy_count()/2))

    pages = [(2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)]
        
    for year, maxWeek in pages:
        for week in range(1, maxWeek+1):
            #parseWeek(year, week)
            pool.apply_async(parseWeek,(year, week,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
Example 3
def run(wait):
    """Starts the scrapping proccess.
    creates a process for each week per year given in pages
    """

    logger = makeLogger('main', r'./logs_nflWeather/')
    
    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count()/2.5))

    #nflweather.com goes back to 2009, 2010 seems to be missing on the site.
    pages = [(2009, 17), (2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)]

    headers = []
    dataList = []
    for year, maxWeek in pages:
        for week in range(1, maxWeek+1):
            #parseWeek(year, week)
            pool.apply_async(parseWeek, (year, week,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
Example 4
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger('main', r'./logs_pfrSchedule/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    minyear = 1960
    maxyear = 2015

    pool = Pool(processes=int(get_proxy_count()/2))

    for i in range(maxyear-minyear+1):
        year = minyear + i
        #parseYear(year)
        pool.apply_async(parseYear, (year,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
Example 5
def run(wait=0):
    """Starts the scrapping proccess.
    creates a process per week per year given in pages
    """

    logger = makeLogger('main', r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    logger.debug('starting')
    pool = Pool(processes=int(get_proxy_count() / 2))

    pages = [(2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)]

    for year, maxWeek in pages:
        for week in range(1, maxWeek + 1):
            #parseWeek(year, week)
            pool.apply_async(parseWeek, (year, week))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 6
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count()/2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)
    
    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    
    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
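
The harvesting step in the example above simply reads every <option> element out of the page's dropdowns before combining them into parseSeason tasks. Here is a minimal sketch of that step, assuming requests and BeautifulSoup in place of RoboBrowser; the URL and element ids are copied from the example, but the page may have changed since this code was written, so treat them as historical.

import requests
from bs4 import BeautifulSoup

url = ("http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS"
       "&conference=ALL&role=TM&season=2015&seasonType=REG")
# html5lib is the same lenient parser the example relies on for broken markup.
soup = BeautifulSoup(requests.get(url, timeout=10).text, 'html5lib')


def dropdown_options(element_id):
    """Return the text of every <option> under the element with the given id."""
    element = soup.find(id=element_id)
    return [option.text.strip() for option in element.find_all('option')] if element else []


roles = dropdown_options('role')
seasons = dropdown_options('season-dropdown')
print(roles)
print(seasons[:5])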
Example 7
def run(wait):
    """
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))

    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)

    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5, 1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(
            logger, browser, 'open',
            "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")

        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1]))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
Example 8
def run(wait):
    """
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
     
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    
    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5,1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")

        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count()/2.5))

    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],))


    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
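
Examples 7 and 8 interleave two concerns: a randomized delay between index-page requests and the extraction of (name, href) tuples from each page's div_players block. Here is a sketch of that loop on its own, assuming requests and BeautifulSoup in place of RoboBrowser; pro-football-reference.com's markup may have changed since, so the div_players id is taken on faith from the example.

import random
import string
import time

import requests
from bs4 import BeautifulSoup


def players_for_letter(letter):
    """Return (name, href) tuples from one A-Z player index page."""
    url = "http://www.pro-football-reference.com/players/{}/".format(letter)
    soup = BeautifulSoup(requests.get(url, timeout=10).text, 'html5lib')
    container = soup.find(id='div_players')
    if container is None:  # layout changed or request blocked
        return []
    links = []
    for paragraph in container.find_all('p'):
        anchor = paragraph.find('a')
        if anchor is not None:
            links.append((anchor.text, anchor['href']))
    return links


player_tuples = []
for letter in string.ascii_uppercase:
    time.sleep(random.uniform(0.5, 1.5))  # be polite between index requests
    player_tuples.extend(players_for_letter(letter))
print('found %d players' % len(player_tuples))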
Example 9
def run(wait):
    """Starts the scrapping proccess.
    creates a process for each week per year given in pages
    """

    logger = makeLogger('main', r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    #nflweather.com goes back to 2009, 2010 seems to be missing on the site.
    pages = [(2009, 17), (2011, 17), (2012, 17), (2013, 17), (2014, 17),
             (2015, 17)]

    headers = []
    dataList = []
    for year, maxWeek in pages:
        for week in range(1, maxWeek + 1):
            #parseWeek(year, week)
            pool.apply_async(parseWeek, (year, week))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 10
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(
                        removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 11
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger('main', r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))

    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://www.pro-football-reference.com/teams/")
    table_body = browser.find(id='teams_active').find('tbody')
    rows = table_body.find_all('tr')

    team_url_tups = []

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            team_link = row.find('th').find('a')
            if team_link:
                team_url = 'http://www.pro-football-reference.com' + team_link['href']
                team_name = team_link.text
                team_url_tups.append((team_url, team_name))
        except:
            logger.exception(row)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    for team_url, team_name in team_url_tups:
        #print parseTeam(team_url, team_name)
        results.append(pool.apply_async(parseTeam, (team_url, team_name)))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    year_url_tups = []
    for result in results:
        year_url_tup = result.get()
        if year_url_tup:
            year_url_tups += year_url_tup

    logger.debug('Done gathering %d year urls', len(year_url_tups))

    pool = Pool(processes=int(get_proxy_count() / 2))

    logger.debug('Shuffling year_urls')
    random.shuffle(year_url_tups)
    logger.debug('Starting to parse year_urls')
    for team_name, year_url, year in year_url_tups:
        #parseYear(team_name, year_url, year)
        pool.apply_async(parseYear, (team_name, year_url, year))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
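
Examples 11 and 13 run two pool phases and feed the second with the return values of the first, collected through the AsyncResult objects that apply_async returns. A minimal sketch of that hand-off, with hypothetical workers standing in for parseTeam and parseYear:

from multiprocessing import Pool


def discover_items(source):
    # Hypothetical stand-in for parseTeam: return the work items found under source.
    return [(source, n) for n in range(3)]


def process_item(item):
    # Hypothetical stand-in for parseYear: handle one discovered item.
    return item


if __name__ == '__main__':
    pool = Pool(processes=4)
    results = [pool.apply_async(discover_items, (source,)) for source in ['a', 'b', 'c']]
    pool.close()
    pool.join()

    # Flatten the first phase's return values into a single work list.
    work = []
    for result in results:
        items = result.get()  # re-raises here if the worker raised
        if items:
            work += items

    # Second phase: a fresh pool over the discovered items.
    pool = Pool(processes=4)
    for item in work:
        pool.apply_async(process_item, (item,))
    pool.close()
    pool.join()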
Example 12
def run(wait):
    """
    First collects a set of playerUrls to parse using, parsePlayerNames.
    Then parses each player.
    Both tasks use multiprocessing
    """

    logger = makeLogger('main', r'./logs_nflPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)
    
    pool = Pool(processes=int(get_proxy_count()/2.5))
    results = []

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE'
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    statisticCategory = browser.find(id="statistic-category")
    statisticCategories = statisticCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for statisticCategory in statisticCategories:
        if statisticCategory.text == 'Category...':
            continue
        for season in seasons:
            if season.text == 'Season...':
                continue
            for seasonType in seasonTypes:
                if seasonType.text == 'Season Type...':
                    continue
                results.append(pool.apply_async(parsePlayerNames, (statisticCategory['value'], season['value'], seasonType['value'],)))
    
    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    playerUrl_set = set()
    for result in results:
        try:
            result_set = result.get()
            if result_set:
                playerUrl_set = playerUrl_set.union(result_set)
        except:
            logger.exception('Error in parsePlayerNames worker')

    with open('../playerUrl_set.json', 'w') as playerUrl_json:
        playerUrl_json.write(json.dumps(list(playerUrl_set)))

    pool = Pool(processes=int(get_proxy_count()/2.5))

    logger.debug('Starting to parse %d players', len(playerUrl_set))
    for playerUrl in playerUrl_set:
        if col_player_profiles.find({'player_url': playerUrl}).count():
            logger.debug('Skipping ' + playerUrl)
            continue
        #parsePlayer(playerUrl)
        pool.apply_async(parsePlayer, (playerUrl,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

        
    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
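
The skip check in Example 12 uses the old PyMongo cursor API (find(...).count()), which was deprecated in PyMongo 3.7 and removed in 4.0. The sketch below shows the same guard with count_documents; the connection details, database and collection names, and the sample URL are hypothetical, and parse_player stands in for the project's parsePlayer.

from multiprocessing import Pool

from pymongo import MongoClient


def parse_player(player_url):
    # Hypothetical stand-in for the project's parsePlayer(playerUrl).
    return player_url


if __name__ == '__main__':
    # Hypothetical connection; the examples never show how col_player_profiles is built.
    col_player_profiles = MongoClient()['nfl']['player_profiles']
    playerUrl_set = {'/players/A/example00.htm'}  # placeholder input

    pool = Pool(processes=4)
    for playerUrl in playerUrl_set:
        # count_documents with limit=1 is the modern equivalent of the existence check.
        if col_player_profiles.count_documents({'player_url': playerUrl}, limit=1):
            continue
        pool.apply_async(parse_player, (playerUrl,))
    pool.close()
    pool.join()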
Example 13
def run(wait):
    """Starts the scrapping proccess.
    creates a process per year between minyear and maxyear
    """

    logger = makeLogger("main", r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("start time: " + str(startTime))

    logger.debug("waiting %d seconds", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", "http://www.pro-football-reference.com/teams/")
    table_body = browser.find(id="teams_active").find("tbody")
    rows = table_body.find_all("tr")

    team_url_tups = []

    for index, row in enumerate(rows):
        logger.debug("Row %d of %d", index, len(rows))
        try:
            team_link = row.find("th").find("a")
            if team_link:
                team_url = "http://www.pro-football-reference.com" + team_link["href"]
                team_name = team_link.text
                team_url_tups.append((team_url, team_name))
        except:
            logger.exception(row)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    for team_url, team_name in team_url_tups:
        # print parseTeam(team_url, team_name)
        results.append(pool.apply_async(parseTeam, (team_url, team_name)))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    year_url_tups = []
    for result in results:
        year_url_tup = result.get()
        if year_url_tup:
            year_url_tups += year_url_tup

    logger.debug("Done gathering %d year urls", len(year_url_tups))

    pool = Pool(processes=int(get_proxy_count() / 2))

    logger.debug("Shuffling year_urls")
    random.shuffle(year_url_tups)
    logger.debug("Starting to parse year_urls")
    for team_name, year_url, year in year_url_tups:
        # parseYear(team_name, year_url, year)
        pool.apply_async(parseYear, (team_name, year_url, year))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug("run time: " + str(datetime.now() - startTime))

    closeLogger("main")
Example 14
def run(wait):
    """
    First collects a set of playerUrls to parse using, parsePlayerNames.
    Then parses each player.
    Both tasks use multiprocessing
    """

    logger = makeLogger('main', r'./logs_nflPlayerStats/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE'
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    statisticCategory = browser.find(id="statistic-category")
    statisticCategories = statisticCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for statisticCategory in statisticCategories:
        if statisticCategory.text == 'Category...':
            continue
        for season in seasons:
            if season.text == 'Season...':
                continue
            for seasonType in seasonTypes:
                if seasonType.text == 'Season Type...':
                    continue
                results.append(pool.apply_async(parsePlayerNames, (statisticCategory['value'], season['value'], seasonType['value'])))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    playerUrl_set = set()
    for result in results:
        try:
            result_set = result.get()
            if result_set:
                playerUrl_set = playerUrl_set.union(result_set)
        except:
            logger.exception('Error in parsePlayerNames worker')

    with open('../playerUrl_set.json', 'w') as playerUrl_json:
        playerUrl_json.write(json.dumps(list(playerUrl_set)))

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    logger.debug('Starting to parse %d players', len(playerUrl_set))
    for playerUrl in playerUrl_set:
        if col_player_profiles.find({'player_url': playerUrl}).count():
            logger.debug('Skipping ' + playerUrl)
            continue
        #parsePlayer(playerUrl)
        pool.apply_async(parsePlayer, (playerUrl, ))

    pool.close()  # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  # Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')