Code example #1
0
def get_schedule_data():
	"""Parse the schedule spreadsheet into {date: [games]} and cache it as JSON.

	Reads the Excel schedule located by ``util.get_schedule_in_fname()``.
	If the cached JSON output file already exists, its parsed contents are
	returned instead of re-parsing the spreadsheet.

	Returns:
		dict mapping a date string "M/D/YYYY" to a list of game entries,
		each of the form {"home": team, "away": team, "date": date_string}.
	"""
	in_fname = util.get_schedule_in_fname()
	out_fname = util.get_schedule_out_fname()

	# Fast path: serve the cached JSON if it was already generated.
	# FIX: use a context manager — the original leaked the file handle.
	if os.path.exists(out_fname):
		with open(out_fname, 'r') as cached:
			return json.loads(cached.read())

	workbook = xlrd.open_workbook(in_fname)
	worksheet = workbook.sheet_by_index(0)

	parsed_data = {}

	# The first row holds the team-name column headers.
	headers = [cell.value for cell in worksheet.row(0)]
	for row_idx in range(1, worksheet.nrows):
		row = worksheet.row(row_idx)
		raw_date = row[0].value
		# Drop the leading 3-character weekday prefix (e.g. "Wed10/4" -> "10/4").
		cleaned_date = raw_date[3:]
		raw_month, raw_day = cleaned_date.strip().split('/')
		month, day = int(raw_month), int(raw_day)

		# The season spans two calendar years: months after August fall in
		# the starting year, the remaining months in the ending year.
		if month > 8:
			season = SEASON_START
		else:
			season = SEASON_END
		cleaned_date = cleaned_date + '/' + str(season)

		games = []

		# FIX: the original reused loop variable `c` for both the row and
		# column loops; use a distinct name to avoid the shadowing hazard.
		# Ignore the last column — it repeats the date column.
		for col in range(1, len(row) - 1):
			# Each game appears twice (home and away); keep only the away
			# cells (prefixed with '@') to avoid duplicate entries.
			if '@' in row[col].value:
				raw_home = headers[col]
				raw_away = row[col].value.strip().lstrip('@ ')
				home = sanitizer.sanitize_team_name(raw_home.upper())
				away = sanitizer.sanitize_team_name(raw_away.upper())

				entry = {"home": home, "away": away, "date": cleaned_date}
				games.append(entry)

		parsed_data[cleaned_date] = games

	# Cache the parsed result; the context manager closes the handle even
	# if the write fails (original used open/write/close with no guard).
	with open(out_fname, 'w+') as out_file:
		out_file.write(json.dumps(parsed_data, sort_keys=True, indent=4))
	return parsed_data
Code example #2
0
def scrape_depth_chart():
	"""Scrape the depth-chart page at URL into a per-team dictionary.

	Returns:
		dict mapping sanitized team name to
		{'injured': [player names], 'positions': {position: [player names]}}.
	"""
	req = urllib2.Request(URL)
	response = urllib2.urlopen(req)
	try:
		the_page = response.read()
	finally:
		# FIX: the original never closed the HTTP response handle.
		response.close()

	soup = BeautifulSoup(the_page, 'html.parser')

	depth_chart_html = soup.find(id="cp1_tblDepthCharts")

	# th tags correspond to team names; the text of each name is found
	# inside the first anchor of the th tag.
	th = depth_chart_html.find_all("th")
	team_names = [sanitizer.sanitize_team_name(namelink.find('a').text) for namelink in th]
	# Each table corresponds to one team, in the same order as the names.
	tables = depth_chart_html.find_all("table")

	team_depth_charts = {}

	for idx, table in enumerate(tables):
		positions = {}
		injured = []

		curr_position = None
		curr_position_players = []
		for row in table.find_all('tr'):
			tds = row.find_all('td')
			# A non-empty first cell starts a new position group.
			# FIX: the original re-fetched row.find_all('td')[0].text three
			# times and tested emptiness twice; fetch once and test once.
			next_position = tds[0].text
			if next_position != '':
				if curr_position and next_position != curr_position:
					positions[curr_position] = curr_position_players
					curr_position_players = []
				curr_position = next_position
			player_name = tds[1].find('a').text
			curr_position_players.append(player_name)

			# The site adds an image tag when a player is sidelined/out.
			if tds[1].find('img') is not None:
				injured.append(player_name)

		# Flush the final position group so its players aren't dropped.
		# FIX: guard against a table with no position rows — the original
		# would have stored the players under the key None.
		if curr_position is not None:
			positions[curr_position] = curr_position_players

		team_depth_charts[team_names[idx]] = {'injured': injured, 'positions': positions}

	return team_depth_charts