def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into a list of lot dicts.

    Args:
        campusparking_avail_html: raw HTML of the campus parking page.

    Returns:
        list of dicts, one per lot: {'shortName': str, 'openSpots': int}.

    Raises:
        ValueError, AttributeError, TypeError, IndexError: when the HTML is
        None, malformed, or its structure has changed since this was written.
    """
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # Grab every <tr> of the availability table (header row included).
        lot_rows = campus_lot_soup.find(
            'table',
            {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
        # loop table rows, starting with 2nd row (excludes header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll('td')
            # Lot short name is the first whitespace-delimited word of cell 2.
            short_name = table_cells[1].string.split(' ')[0].strip()
            spots_cell = table_cells[2].string.strip()
            if spots_cell is not None and spots_cell.isdigit():
                lot_spots = spots_cell
            # NOTE(review): when spots_cell is not a digit, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'shortName': short_name,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from campus parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_special_events_html(self, special_events_html):
    """Parse the city special-events calendar page.

    Args:
        special_events_html: raw HTML of the city special-events page,
            or a falsy value when the fetch failed.

    Returns:
        dict with a 'specialEvents' list; the list is empty when the input
        is falsy or the page cannot be parsed (parse errors are swallowed).
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # Every <tr> of the calendar table; the first two are header rows.
        calendar_rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        for event_row in calendar_rows[2:]:
            cells = event_row.findAll('td')
            event_dt, parking_end_dt, parking_start_dt = \
                self.parse_special_event_datetimes(cells)
            entry = {
                'parkingLocation': cells[1].string,
                'eventVenue': cells[4].string,
                'eventDatetime': event_dt,
                'eventName': cells[3].string,
                'parkingStartDatetime': parking_start_dt,
                'parkingEndDatetime': parking_end_dt,
                'webUrl': self.parking_data['special_events_url']
            }
            special_events['specialEvents'].append(entry)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Unlike availability, we eat this error: availability data is still
        # useful without the events, so return an empty event list instead.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events
def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into a list of lot dicts.

    Args:
        campusparking_avail_html: raw HTML of the campus parking page.

    Returns:
        list of dicts, one per lot: {"shortName": str, "openSpots": int}.

    Raises:
        ValueError, AttributeError, TypeError, IndexError: when the HTML is
        None, malformed, or its structure has changed since this was written.
    """
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # Grab every <tr> of the availability table (header row included).
        lot_rows = campus_lot_soup.find(
            "table", {"id": "ctl00_ctl00_central_block_right_navi_cnt_gvName"}
        ).findAll("tr")
        # loop table rows, starting with 2nd row (excludes header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll("td")
            # Lot short name is the first whitespace-delimited word of cell 2.
            short_name = table_cells[1].string.split(" ")[0].strip()
            spots_cell = table_cells[2].string.strip()
            if spots_cell is not None and spots_cell.isdigit():
                lot_spots = spots_cell
            # NOTE(review): when spots_cell is not a digit, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {"shortName": short_name, "openSpots": int(lot_spots)}
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error("%s parsing scraped content from campus parking page."
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into a list of lot dicts.

    Args:
        availability_html: raw HTML of the city parking page.

    Returns:
        list of dicts, one per lot: {'name': str, 'openSpots': int}.

    Raises:
        ValueError: when no dataRow divs are found (likely a site redesign).
        AttributeError, TypeError: when the HTML is None or missing the
            expected elements.
    """
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # if we find no rows, we're dead
            raise ValueError
        for row in lot_rows:
            # The first all-digit child of the row is the open-spot count.
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            # NOTE(review): when a row has no digit child, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from city parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into a list of lot dicts.

    Args:
        availability_html: raw HTML of the city parking page.

    Returns:
        list of dicts, one per lot: {'name': str, 'openSpots': int}.

    Raises:
        ValueError: when no dataRow divs are found (likely a site redesign).
        AttributeError, TypeError: when the HTML is None or missing the
            expected elements.
    """
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # if we find no rows, we're dead
            raise ValueError
        for row in lot_rows:
            # The first all-digit child of the row is the open-spot count.
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            # NOTE(review): when a row has no digit child, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from city parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_special_events_html(self, special_events_html):
    """Scrape the city special-events calendar into a specialEvents dict.

    Args:
        special_events_html: raw HTML of the city special-events page,
            or a falsy value when the fetch failed.

    Returns:
        dict with a 'specialEvents' list; empty when the input is falsy or
        the page cannot be parsed (parse errors are logged and swallowed).
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # All <tr>'s of the calendar table; rows 0-1 are headers.
        rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        for row in rows[2:]:
            cells = row.findAll('td')
            event_dt, parking_end_dt, parking_start_dt = \
                self.parse_special_event_datetimes(cells)
            special_events['specialEvents'].append({
                'parkingLocation': cells[1].string,
                'eventVenue': cells[4].string,
                'eventDatetime': event_dt,
                'eventName': cells[3].string,
                'parkingStartDatetime': parking_start_dt,
                'parkingEndDatetime': parking_end_dt,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Unlike availability, we eat this error: availability data is still
        # useful without the events, so return an empty event list instead.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events
def getParkingSpecialEvents():
    """Scrape the city special-events calendar page into JSON.

    Fetches the calendar page (retrying up to 3 times on download errors),
    parses each event row, and returns the urlfetch result object with its
    .content replaced by a JSON dump of the scraped events. Returns None
    when the page content cannot be parsed.
    """
    loop = 0
    done = False
    result = None
    specialeventsurl = 'http://www.cityofmadison.com/parkingUtility/calendar/index.cfm'
    cachehours = 24
    # initialize the dict to hold result of scrape.
    specialevents = dict()
    specialevents['CacheUntil'] = datetime.datetime.strftime(
        api_utils.getLocalDatetime() + datetime.timedelta(hours=+cachehours),
        '%Y-%m-%dT%H:%M:%S')
    logging.info(specialevents['CacheUntil'])
    specialevents['ParkingSpecialEvents'] = []
    specialevents['LastScraped'] = datetime.datetime.strftime(
        api_utils.getLocalDatetime(), '%Y-%m-%dT%H:%M:%S')
    # Looping in case fetch flaky.
    while not done and loop < 3:
        try:
            # grab the city parking html page - what an awesome API!!! :(
            result = urlfetch.fetch(specialeventsurl)
            # invoke soup to parse html
            soup = BeautifulSoup(result.content)
            # find the calendar table containing special event info.
            # returns array of <tr>'s.
            special_event_rows = soup.find("table", {"id": "calendar"}).findAll('tr')
            # loop table rows, starting with 3rd row (excludes 2 header rows)
            for row_index in range(2, len(special_event_rows)):
                # grab the array of cells in the current row
                table_cells = special_event_rows[row_index].findAll('td')
                parkinglocation = table_cells[1].string
                eventvenue = table_cells[4].string
                event = table_cells[3].string
                # take the event time strings (already central time), create
                # datetime obj, then convert back to correct string
                eventtimeobj = datetime.datetime.strptime(
                    table_cells[0].string +
                    api_utils.get_time_from_text(table_cells[5].string).replace(' ', ''),
                    '%m/%d/%Y%I:%M%p')
                eventtime = datetime.datetime.strftime(eventtimeobj, '%Y-%m-%dT%H:%M:%S')
                # split '00:00 pm - 00:00 pm' into start and end strings
                timeparts = table_cells[2].string.split(' - ')
                # clean up whitespace to avoid errors due to inconsistent format
                timeparts[0] = timeparts[0].replace(' ', '')
                timeparts[1] = timeparts[1].replace(' ', '')
                parkingstarttimeobj = datetime.datetime.strptime(
                    table_cells[0].string + timeparts[0], '%m/%d/%Y%I:%M%p')
                parkingstarttime = datetime.datetime.strftime(
                    parkingstarttimeobj, '%Y-%m-%dT%H:%M:%S')
                parkingendtimeobj = datetime.datetime.strptime(
                    table_cells[0].string + timeparts[1], '%m/%d/%Y%I:%M%p')
                # BUG FIX: this previously formatted parkingstarttimeobj, so
                # every event's ParkingEndTime silently duplicated its start.
                parkingendtime = datetime.datetime.strftime(
                    parkingendtimeobj, '%Y-%m-%dT%H:%M:%S')
                # add this special event info to the ParkingSpecialEvents collection
                specialevents['ParkingSpecialEvents'].append(
                    {"ParkingLocation": parkinglocation,
                     "EventVenue": eventvenue,
                     "EventTime": eventtime,
                     "Event": event,
                     "ParkingStartTime": parkingstarttime,
                     "ParkingEndTime": parkingendtime})
            # setting content var to keep contract with caller exactly in-tact (for now).
            result.content = json.dumps(specialevents)
            done = True
        # problem hitting url, try a few times
        except urlfetch.DownloadError:
            logging.error("Error loading page (%s)... sleeping" % loop)
            if result:
                logging.debug("Error status: %s" % result.status_code)
                logging.debug("Error header: %s" % result.headers)
                logging.debug("Error content: %s" % result.content)
            time.sleep(6)
            loop = loop + 1
        # This is bad. Some data may be in a different format due to
        # either unexpected data entry or *gulp* site redesign.
        # Likely require code change to fix.
        except ValueError:
            logging.error("Error parsing scraped content from (%s)... exiting getParkingSpecialEvents()" % specialeventsurl)
            done = True
            result = None
    return result
def post(self):
    """Task handler: crawl one bus-route listing page.

    Reads 'crawl' (URL), 'direction', and 'routeID' from the task request,
    fetches the page (up to 3 attempts), and walks its "ada" anchors:
      * titles containing '#' are stop pages — a RouteListing row is added
        for the (route, direction, stopID) triple if not already present;
      * other hrefs containing '?r=' spawn follow-up crawl tasks;
      * all-alpha titles are persisted as DestinationListing labels.
    Returns nothing; all results are side effects (datastore writes, tasks).
    """
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL, direction, routeID))
        loop = 0
        done = False
        result = None
        #start = quota.get_request_cpu_usage()
        # Retry loop: urlfetch against the transit site is flaky.
        while not done and loop < 3:
            try:
                # fetch the page
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1
        #end = quota.get_request_cpu_usage()
        #logging.info("scraping took %s cycles" % (end-start))
        # start to interrogate the results
        # NOTE(review): if all 3 fetch attempts failed, result is still None
        # here and result.content raises AttributeError — confirm intended.
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # route crawler looks for titles with an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for
                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]
                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))
                    # check for conflicts...
                    stop = db.GqlQuery(
                        "SELECT * FROM StopLocation WHERE stopID = :1",
                        stopID).get()
                    if stop is None:
                        # Stop locations are expected to be pre-loaded.
                        logging.error(
                            "Missing stop %s which should be impossible" % stopID)
                    # pull the route and direction data from the URL
                    # (query string is assumed to be ?r=<route>&d=<direction>...)
                    routeData = scrapeURL.split('?')[1]
                    logging.info(
                        "FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href
                    # check for conflicts...
                    r = db.GqlQuery(
                        "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                        routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info(
                            "added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                #else: # title.split(",")[0].isdigit():
                else:
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        # '00' is the sentinel routeID of the top-level page;
                        # derive the real route number from the link itself.
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        # NOTE(review): this elif only runs when routeID was
                        # not '00'; if '00' hrefs can carry '&' arguments the
                        # branch above keeps them in routeID — confirm.
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={
                                        'crawl': crawlURL,
                                        'direction': title,
                                        'routeID': routeID
                                    })
                        task.add('crawler')
                        logging.info(
                            "Added new task for %s, direction %s, route %s" % (title.split(",")[0], title, routeID))
                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        logging.info(
                            "found the route LABEL page! href: %s" % href)
                        routeData = href.split('?')[1]
                        routeArgs = routeData.split('&')
                        directionID = routeArgs[1].split('=')[1]
                        # get_or_insert keyed on title makes this idempotent.
                        l = DestinationListing.get_or_insert(
                            title, id=directionID, label=title)
    except apiproxy_errors.DeadlineExceededError:
        # App Engine killed the request; the task queue will retry it.
        logging.error("DeadlineExceededError exception!?")
        return
    return
def parse_special_events_html(self, special_events_html):
    """Parse the campus special-events page into a specialEvents dict.

    Each event-item table is expected to have three rows: a header row
    ("<date>: <name>"), a time row, and a lots row. Parse errors are logged
    and swallowed — an empty specialEvents list is returned instead.
    """
    special_events = dict()
    special_events["specialEvents"] = []
    if not special_events_html:
        # Nothing fetched: return the empty structure rather than failing.
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # One <table class="event-item"> per event.
        event_tables = soup.findAll("table", {"class": "event-item"})
        for event_table in event_tables:
            rows = event_table.findAll("tr")
            event_time = ""
            event_date = ""
            lot_num_array = []
            event_name = ""
            # Rows are positional: 0 = header, 1 = time, 2 = lots.
            for row_index in range(0, 3):
                if row_index == 0:
                    # we're on the header row: "<m/d/Y>: <event name>"
                    header_content = rows[row_index].find("th").string
                    header_array = header_content.split(":")
                    event_date = header_array[0]
                    event_name = header_array[1].replace(" ", "")
                elif row_index == 1:
                    # time row, e.g. "7:30 p.m." -> "7:30PM"
                    cells = rows[row_index].findAll("td")
                    cell_content = cells[1].string
                    event_time_arr = cell_content.split(" ", 2)
                    event_time = event_time_arr[0] + event_time_arr[1].replace(".", "").upper()
                elif row_index == 2:
                    # lots row: comma-separated lot numbers in the 2nd cell
                    cells = rows[row_index].findAll("td")
                    cell_content = cells[1].string
                    lot_num_array = cell_content.replace(" ", "").split(",")
            # strip leading 0's out of lot number array
            for index, item in enumerate(lot_num_array):
                lot_num_array[index] = self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = "0" + event_time
                event_datetime_tmp = event_date + " " + event_time
                event_datetime_str = datetime.datetime.strptime(event_datetime_tmp, "%m/%d/%Y %I:%M%p").strftime(
                    "%Y-%m-%dT%H:%M:%S"
                )
            except ValueError:
                # Keep the event with a null datetime rather than dropping it.
                logging.error("Error parsing campus special event date")
                event_datetime_str = None
            # Rather than exclude props not currently available via uw lots, going with "None"
            # This will manifest in the json as "property":null which should be easily detectable via client JS
            special_event = {
                "eventName": event_name,
                "parkingLocations": lot_num_array,
                "eventDatetime": event_datetime_str,
                "parkingStartDatetime": None,
                "parkingEndDatetime": None,
                "eventVenue": None,
                "webUrl": self.parking_data["special_events_url"],
            }
            special_events["specialEvents"].append(special_event)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we eat this error. availability is still useful w/out events
        logging.error("Error parsing scraped content from campus special events page." + str(e))
        special_events["specialEvents"] = []
    return special_events
def parse_special_events_html(self, special_events_html):
    """Parse the campus special-events page into a specialEvents dict.

    Each event-item table is expected to have three rows: a header row
    ("<date>: <name>"), a time row, and a lots row. Parse errors are logged
    and swallowed — an empty specialEvents list is returned instead.
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        # Nothing fetched: return the empty structure rather than failing.
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # One <table class="event-item"> per event; rows are positional.
        for event_table in soup.findAll('table', {'class': 'event-item'}):
            rows = event_table.findAll('tr')
            # Header row: "<m/d/Y>: <event name>"
            header_parts = rows[0].find('th').string.split(':')
            event_date = header_parts[0]
            event_name = header_parts[1].replace(' ', '')
            # Time row, e.g. "7:30 p.m." -> "7:30PM"
            time_cells = rows[1].findAll('td')
            time_pieces = time_cells[1].string.split(' ', 2)
            event_time = time_pieces[0] + time_pieces[1].replace('.', '').upper()
            # Lots row: comma-separated lot numbers in the 2nd cell.
            lot_cells = rows[2].findAll('td')
            lot_num_array = lot_cells[1].string.replace(' ', '').split(',')
            # strip leading 0's out of lot number array
            for index, item in enumerate(lot_num_array):
                lot_num_array[index] = self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = '0' + event_time
                event_datetime_str = datetime.datetime.strptime(
                    event_date + ' ' + event_time,
                    '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')
            except ValueError:
                # Keep the event with a null datetime rather than dropping it.
                logging.error('Error parsing campus special event date')
                event_datetime_str = None
            # Rather than exclude props not currently available via uw lots, going with "None"
            # This will manifest in the json as "property":null which should be easily detectable via client JS
            special_events['specialEvents'].append({
                'eventName': event_name,
                'parkingLocations': lot_num_array,
                'eventDatetime': event_datetime_str,
                'parkingStartDatetime': None,
                'parkingEndDatetime': None,
                'eventVenue': None,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we eat this error. availability is still useful w/out events
        logging.error(
            'Error parsing scraped content from campus special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events