def get_schedule_data():
    """Parse the schedule spreadsheet into a ``{date: [games]}`` dict.

    Returns the cached JSON from the output file when it already exists;
    otherwise reads the xlrd workbook, parses each date row into a list of
    ``{"home", "away", "date"}`` entries, writes the JSON cache, and returns
    the parsed dict.
    """
    in_fname = util.get_schedule_in_fname()
    out_fname = util.get_schedule_out_fname()

    # Serve from the JSON cache when present (and close the handle properly).
    if os.path.exists(out_fname):
        with open(out_fname, 'r') as cached:
            return json.loads(cached.read())

    workbook = xlrd.open_workbook(in_fname)
    worksheet = workbook.sheet_by_index(0)

    parsed_data = {}
    # Header row holds the team names; column 0 is the date column.
    headers = [cell.value for cell in worksheet.row(0)]

    for row_idx in range(1, worksheet.nrows):
        row = worksheet.row(row_idx)
        raw_date = row[0].value
        # Drop the leading weekday prefix (first 3 chars), keeping "M/D".
        cleaned_date = raw_date[3:]
        raw_month, _raw_day = cleaned_date.strip().split('/')
        month = int(raw_month)
        # Months after August fall in the season's starting calendar year;
        # January-August games belong to the season's ending year.
        if month > 8:
            season = SEASON_START
        else:
            season = SEASON_END
        cleaned_date = cleaned_date + '/' + str(season)

        games = []
        # Ignore the last column: it is the date column repeated.
        for col_idx in range(1, len(row) - 1):
            # Games are doubled, so only look at the cells marked away ('@').
            if '@' in row[col_idx].value:
                raw_home = headers[col_idx]
                raw_away = row[col_idx].value.strip().lstrip('@ ')
                home = sanitizer.sanitize_team_name(raw_home.upper())
                away = sanitizer.sanitize_team_name(raw_away.upper())
                games.append({"home": home, "away": away, "date": cleaned_date})
        parsed_data[cleaned_date] = games

    # Cache the parsed schedule for subsequent calls.
    with open(out_fname, 'w+') as out:
        out.write(json.dumps(parsed_data, sort_keys=True, indent=4))
    return parsed_data
def scrape_depth_chart():
    """Scrape per-team depth charts from the page at ``URL``.

    Returns a dict mapping sanitized team name to
    ``{'injured': [player names], 'positions': {position: [player names]}}``.
    """
    req = urllib2.Request(URL)
    response = urllib2.urlopen(req)
    the_page = response.read()
    soup = BeautifulSoup(the_page, 'html.parser')
    depth_chart_html = soup.find(id="cp1_tblDepthCharts")

    # <th> tags correspond to team names; the text of each team name is
    # found inside the first <a> of the th tag.
    th = depth_chart_html.find_all("th")
    team_names = [sanitizer.sanitize_team_name(namelink.find('a').text)
                  for namelink in th]

    # One <table> per team, in the same order as the <th> headers.
    tables = depth_chart_html.find_all("table")
    team_depth_charts = {}
    for team_name, table in zip(team_names, tables):
        positions = {}
        injured = []
        curr_position = None
        curr_position_players = []
        for table_row in table.find_all('tr'):
            tds = table_row.find_all('td')
            # A non-empty first cell starts a new position group; flush the
            # previous group's players before switching.
            next_position = tds[0].text
            if next_position != '':
                if curr_position is not None and next_position != curr_position:
                    positions[curr_position] = curr_position_players
                    curr_position_players = []
                curr_position = next_position
            player_name = tds[1].find('a').text
            curr_position_players.append(player_name)
            # The site adds an <img> tag in the player cell when the player
            # is sidelined/out.
            if tds[1].find('img') is not None:
                injured.append(player_name)
        # Flush one last time, or else the final position's players would be
        # tossed; guard so an empty table doesn't create a None key.
        if curr_position is not None:
            positions[curr_position] = curr_position_players
        team_depth_charts[team_name] = {'injured': injured,
                                        'positions': positions}
    return team_depth_charts