def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into per-lot records.

    The lots are read from the ``<tr>`` rows of the table whose id is
    ``ctl00_ctl00_central_block_right_navi_cnt_gvName``. The first row is
    treated as a header and skipped. For every other row the lot short
    name is the first space-separated token of the second cell, and the
    open-spot count is the text of the third cell.

    Args:
        campusparking_avail_html: HTML of the campus availability page.

    Returns:
        A list of ``{'shortName': <str>, 'openSpots': <int>}`` dicts, one
        per data row, in page order.

    Raises:
        ValueError: A row's open-spot cell is not a whole number. The
            count is validated per row so that one lot can never be
            reported with the count scraped for the previous lot.
        AttributeError: The table or a cell's text is missing.
        TypeError: The input could not be handled (presumably ``None``
            html -- confirm against the parser in use).
        IndexError: A row has fewer cells than expected.
    """
    results = []
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)

        lot_rows = campus_lot_soup.find(
            'table',
            {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'},
        ).findAll('tr')

        # Skip the header row; every remaining row describes one lot.
        for row in lot_rows[1:]:
            table_cells = row.findAll('td')

            short_name = table_cells[1].string.split(' ')[0].strip()
            spots_cell = table_cells[2].string.strip()

            # Fail loudly instead of reusing a count from an earlier row.
            if not spots_cell.isdigit():
                raise ValueError(
                    'Open spots for lot %r is not a number: %r'
                    % (short_name, spots_cell))

            results.append({
                'shortName': short_name,
                'openSpots': int(spots_cell),
            })

        logging.debug(json.dumps(results))

    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Most likely the page markup changed (or the html was empty).
        logging.error(
            '%s parsing scraped content from campus parking page: %s',
            type(e).__name__, e)
        # Bare raise keeps the original exception, message and traceback.
        raise

    return results
def parse_special_events_html(self, special_events_html):
    """Build the special-events payload from the city calendar page.

    Reads the ``<tr>`` rows of the table with id ``calendar``, skipping
    the first two (header) rows, and turns each remaining row into one
    event record. Date/time fields come from
    ``self.parse_special_event_datetimes``.

    Args:
        special_events_html: HTML of the special events page. A falsy
            value (``None`` or empty) yields an empty payload.

    Returns:
        ``{'specialEvents': [...]}``. The list is empty when there is no
        input or when parsing fails with ValueError, AttributeError,
        TypeError or IndexError -- those errors are logged, not raised.
    """
    payload = {'specialEvents': []}

    # Nothing was scraped, so there is nothing to parse.
    if not special_events_html:
        return payload

    try:
        calendar_table = BeautifulSoup(special_events_html).find(
            'table', {'id': 'calendar'})
        all_rows = calendar_table.findAll('tr')

        # Rows 0 and 1 are headers; events start at the third row.
        for event_row in all_rows[2:]:
            cells = event_row.findAll('td')

            location = cells[1].string
            venue = cells[4].string
            name = cells[3].string
            when, parking_until, parking_from = \
                self.parse_special_event_datetimes(cells)

            record = {
                'parkingLocation': location,
                'eventVenue': venue,
                'eventDatetime': when,
                'eventName': name,
                'parkingStartDatetime': parking_from,
                'parkingEndDatetime': parking_until,
                'webUrl': self.parking_data['special_events_url']
            }
            payload['specialEvents'].append(record)

    except (ValueError, AttributeError, TypeError, IndexError) as err:
        # Deliberately swallowed: availability data is still useful
        # without events, so drop any partial list and carry on.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(err))
        payload['specialEvents'] = []

    return payload
def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into per-lot records.

    The lots are read from the ``<tr>`` rows of the table whose id is
    ``ctl00_ctl00_central_block_right_navi_cnt_gvName``. The first row is
    treated as a header and skipped. For every other row the lot short
    name is the first space-separated token of the second cell, and the
    open-spot count is the text of the third cell.

    Args:
        campusparking_avail_html: HTML of the campus availability page.

    Returns:
        A list of ``{"shortName": <str>, "openSpots": <int>}`` dicts, one
        per data row, in page order.

    Raises:
        ValueError: A row's open-spot cell is not a whole number. The
            count is validated per row so that one lot can never be
            reported with the count scraped for the previous lot.
        AttributeError: The table or a cell's text is missing.
        TypeError: The input could not be handled (presumably ``None``
            html -- confirm against the parser in use).
        IndexError: A row has fewer cells than expected.
    """
    results = []
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)

        lot_rows = campus_lot_soup.find(
            "table",
            {"id": "ctl00_ctl00_central_block_right_navi_cnt_gvName"},
        ).findAll("tr")

        # Skip the header row; every remaining row describes one lot.
        for row in lot_rows[1:]:
            table_cells = row.findAll("td")

            short_name = table_cells[1].string.split(" ")[0].strip()
            spots_cell = table_cells[2].string.strip()

            # Fail loudly instead of reusing a count from an earlier row.
            if not spots_cell.isdigit():
                raise ValueError(
                    "Open spots for lot %r is not a number: %r"
                    % (short_name, spots_cell)
                )

            results.append(
                {"shortName": short_name, "openSpots": int(spots_cell)}
            )

        logging.debug(json.dumps(results))

    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Most likely the page markup changed (or the html was empty).
        logging.error(
            "%s parsing scraped content from campus parking page: %s",
            type(e).__name__,
            e,
        )
        # Bare raise keeps the original exception, message and traceback.
        raise

    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into per-lot records.

    Lots are the ``<div>`` elements whose class starts with ``dataRow``
    inside the div with id ``availability``. Within each row, the open
    spot count is taken from the last child whose text is all digits, and
    the lot name from the row's first ``div > a`` text.

    Args:
        availability_html: HTML of the city availability page.

    Returns:
        A list of ``{'name': <str>, 'openSpots': <int>}`` dicts, one per
        data row, in page order.

    Raises:
        ValueError: No data rows were found, or a row contains no numeric
            open-spot value. The count is reset for every row so that one
            lot can never be reported with the previous lot's count.
        AttributeError: An expected element (availability div, row link)
            is missing.
        TypeError: The input could not be handled (presumably ``None``
            html -- confirm against the parser in use).
    """
    results = []
    try:
        city_lot_soup = BeautifulSoup(availability_html)

        # All children of the availability div whose class name starts
        # with dataRow.
        lot_rows = city_lot_soup.find('div', {'id': 'availability'}).findAll(
            'div', {'class': re.compile('^dataRow')})

        if not lot_rows:
            # Without rows there is nothing to report; treat as a parse
            # failure.
            raise ValueError(
                'No dataRow elements found in the availability div.')

        for row in lot_rows:
            lot_name = row.div.a.string

            # Reset per row so a stale count never leaks into this lot.
            lot_spots = None
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string

            if lot_spots is None:
                raise ValueError(
                    'No numeric open-spot count found for lot %r'
                    % (lot_name,))

            results.append({
                'name': lot_name,
                'openSpots': int(lot_spots),
            })

        logging.debug(json.dumps(results))

    except (ValueError, AttributeError, TypeError) as e:
        # Most likely the page markup changed (or the html was empty).
        logging.error(
            '%s parsing scraped content from city parking page: %s',
            type(e).__name__, e)
        # Bare raise keeps the original exception, message and traceback.
        raise

    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into per-lot records.

    Lots are the ``<div>`` elements whose class starts with ``dataRow``
    inside the div with id ``availability``. Within each row, the open
    spot count is taken from the last child whose text is all digits, and
    the lot name from the row's first ``div > a`` text.

    Args:
        availability_html: HTML of the city availability page.

    Returns:
        A list of ``{'name': <str>, 'openSpots': <int>}`` dicts, one per
        data row, in page order.

    Raises:
        ValueError: No data rows were found, or a row contains no numeric
            open-spot value. The count is reset for every row so that one
            lot can never be reported with the previous lot's count.
        AttributeError: An expected element (availability div, row link)
            is missing.
        TypeError: The input could not be handled (presumably ``None``
            html -- confirm against the parser in use).
    """
    results = []
    try:
        city_lot_soup = BeautifulSoup(availability_html)

        # All children of the availability div whose class name starts
        # with dataRow.
        lot_rows = city_lot_soup.find('div', {'id': 'availability'}).findAll(
            'div', {'class': re.compile('^dataRow')})

        if not lot_rows:
            # Without rows there is nothing to report; treat as a parse
            # failure.
            raise ValueError(
                'No dataRow elements found in the availability div.')

        for row in lot_rows:
            lot_name = row.div.a.string

            # Reset per row so a stale count never leaks into this lot.
            lot_spots = None
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string

            if lot_spots is None:
                raise ValueError(
                    'No numeric open-spot count found for lot %r'
                    % (lot_name,))

            results.append({
                'name': lot_name,
                'openSpots': int(lot_spots),
            })

        logging.debug(json.dumps(results))

    except (ValueError, AttributeError, TypeError) as e:
        # Most likely the page markup changed (or the html was empty).
        logging.error(
            '%s parsing scraped content from city parking page: %s',
            type(e).__name__, e)
        # Bare raise keeps the original exception, message and traceback.
        raise

    return results
def parse_special_events_html(self, special_events_html):
    """Build the special-events payload from the city calendar page.

    Reads the ``<tr>`` rows of the table with id ``calendar``, skipping
    the first two (header) rows, and turns each remaining row into one
    event record. Date/time fields come from
    ``self.parse_special_event_datetimes``.

    Args:
        special_events_html: HTML of the special events page. A falsy
            value (``None`` or empty) yields an empty payload.

    Returns:
        ``{'specialEvents': [...]}``. The list is empty when there is no
        input or when parsing fails with ValueError, AttributeError,
        TypeError or IndexError -- those errors are logged, not raised.
    """
    payload = {'specialEvents': []}

    # Nothing was scraped, so there is nothing to parse.
    if not special_events_html:
        return payload

    try:
        calendar_table = BeautifulSoup(special_events_html).find(
            'table', {'id': 'calendar'})
        all_rows = calendar_table.findAll('tr')

        # Rows 0 and 1 are headers; events start at the third row.
        for event_row in all_rows[2:]:
            cells = event_row.findAll('td')

            location = cells[1].string
            venue = cells[4].string
            name = cells[3].string
            when, parking_until, parking_from = \
                self.parse_special_event_datetimes(cells)

            record = {
                'parkingLocation': location,
                'eventVenue': venue,
                'eventDatetime': when,
                'eventName': name,
                'parkingStartDatetime': parking_from,
                'parkingEndDatetime': parking_until,
                'webUrl': self.parking_data['special_events_url']
            }
            payload['specialEvents'].append(record)

    except (ValueError, AttributeError, TypeError, IndexError) as err:
        # Deliberately swallowed: availability data is still useful
        # without events, so drop any partial list and carry on.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(err))
        payload['specialEvents'] = []

    return payload
def getParkingSpecialEvents():
    """Scrape the city parking calendar page for special events.

    Fetches the calendar page with ``urlfetch`` (retrying up to three
    times, sleeping six seconds after each download failure), reads the
    rows of the table with id ``calendar`` after its two header rows, and
    replaces the response's ``content`` with a JSON document so the
    caller's contract (an object exposing ``.content``) stays intact.

    The JSON document has three keys:
        CacheUntil: local time plus 24 hours, ``%Y-%m-%dT%H:%M:%S``.
        ParkingSpecialEvents: list of event dicts with ParkingLocation,
            EventVenue, EventTime, Event, ParkingStartTime and
            ParkingEndTime.
        LastScraped: local time of this scrape.

    Returns:
        The ``urlfetch`` response with ``content`` replaced by the JSON
        document, or ``None`` when every download attempt failed or the
        page could not be parsed.
    """
    specialeventsurl = 'http://www.cityofmadison.com/parkingUtility/calendar/index.cfm'
    cachehours = 24
    max_attempts = 3
    iso_format = '%Y-%m-%dT%H:%M:%S'
    # Date cell + whitespace-stripped time text, e.g. '01/31/201405:30pm'.
    scraped_format = '%m/%d/%Y%I:%M%p'

    loop = 0
    done = False
    result = None

    # Initialize the dict that holds the result of the scrape.
    specialevents = dict()
    cache_until = api_utils.getLocalDatetime() + datetime.timedelta(hours=cachehours)
    specialevents['CacheUntil'] = cache_until.strftime(iso_format)
    logging.info(specialevents['CacheUntil'])
    specialevents['ParkingSpecialEvents'] = []
    specialevents['LastScraped'] = api_utils.getLocalDatetime().strftime(iso_format)

    # Looping in case the fetch is flaky.
    while not done and loop < max_attempts:
        try:
            result = urlfetch.fetch(specialeventsurl)
            soup = BeautifulSoup(result.content)

            # The calendar table holds the special event info, one <tr>
            # per event after two header rows.
            special_event_rows = soup.find("table", {"id": "calendar"}).findAll('tr')

            # Collected per attempt so a retry can never duplicate events.
            events = []
            for row in special_event_rows[2:]:
                table_cells = row.findAll('td')

                parkinglocation = table_cells[1].string
                eventvenue = table_cells[4].string
                event = table_cells[3].string
                event_date = table_cells[0].string

                # Event time strings are used as-is (presumably already
                # local time -- confirm): parse, then re-emit as ISO text.
                event_clock = api_utils.get_time_from_text(
                    table_cells[5].string).replace(' ', '')
                eventtime = datetime.datetime.strptime(
                    event_date + event_clock, scraped_format).strftime(iso_format)

                # Split '00:00 pm - 00:00 pm' into start and end strings,
                # stripping whitespace to tolerate inconsistent formatting.
                timeparts = table_cells[2].string.split(' - ')
                start_text = timeparts[0].replace(' ', '')
                end_text = timeparts[1].replace(' ', '')

                parkingstarttime = datetime.datetime.strptime(
                    event_date + start_text, scraped_format).strftime(iso_format)
                # Format the parsed end time (not the start time) so that
                # ParkingEndTime really carries the end of the window.
                parkingendtime = datetime.datetime.strptime(
                    event_date + end_text, scraped_format).strftime(iso_format)

                events.append({
                    "ParkingLocation": parkinglocation,
                    "EventVenue": eventvenue,
                    "EventTime": eventtime,
                    "Event": event,
                    "ParkingStartTime": parkingstarttime,
                    "ParkingEndTime": parkingendtime,
                })

            specialevents['ParkingSpecialEvents'] = events

            # Setting content to keep the contract with the caller intact.
            result.content = json.dumps(specialevents)
            done = True

        except urlfetch.DownloadError:
            # Problem hitting the url; try a few times. No response object
            # exists at this point, so there is nothing further to log.
            logging.error("Error loading page (%s)... sleeping" % loop)
            time.sleep(6)
            loop = loop + 1

        except (ValueError, AttributeError, TypeError, IndexError) as e:
            # Some data is in an unexpected format, or the page layout
            # changed (missing table, short row). Likely requires a code
            # change to fix, so give up instead of retrying.
            logging.error(
                "Error parsing scraped content from (%s)... "
                "exiting getParkingSpecialEvents(): %s",
                specialeventsurl, e)
            done = True
            result = None

    return result