Ejemplos de BeautifulSoup.find en Python, ejemplos de api.BeautifulSoup.BeautifulSoup.find en Python

Ejemplo n.º 1

0

Mostrar archivo

    def parse_availability_html(self, campusparking_avail_html):
        """Parse the campus parking availability page into a list of lots.

        Finds the table with id
        ``ctl00_ctl00_central_block_right_navi_cnt_gvName`` and reads one lot
        from every ``<tr>`` after the header row. The first space-delimited
        token of the second cell becomes the short name and the third cell
        supplies the open-spot count.

        Args:
            campusparking_avail_html: Markup of the availability page.

        Returns:
            A list of ``{'shortName': ..., 'openSpots': int}`` dicts, one per
            data row, in page order.

        Raises:
            ValueError, AttributeError, TypeError, IndexError: The original
                exception is logged and re-raised unchanged when the markup
                cannot be read as expected.
        """
        results = []
        # NOTE(review): lot_spots is initialised once, outside the loop, so a
        # row whose count cell is not numeric reuses the previous row's count
        # (or reaches int(None) -> TypeError when it is the first row). That
        # behaviour is preserved here; confirm whether it is intended.
        lot_spots = None

        try:
            campus_lot_soup = BeautifulSoup(campusparking_avail_html)
            # All rows of the availability table; find() returns None when
            # the table is missing, which surfaces as AttributeError below.
            lot_rows = campus_lot_soup.find(
                'table', {
                    'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'
                }).findAll('tr')

            # Start with the 2nd row (the first one is the header row).
            for lot_row in lot_rows[1:]:
                table_cells = lot_row.findAll('td')

                short_name = table_cells[1].string.split(' ')[0].strip()

                # strip() always returns a string, so no None check is needed.
                spots_cell = table_cells[2].string.strip()
                if spots_cell.isdigit():
                    lot_spots = spots_cell

                results.append({
                    'shortName': short_name,
                    'openSpots': int(lot_spots)
                })

            logging.debug(json.dumps(results))

        except ValueError:
            # Cannot parse html perhaps due to html change.
            logging.error(
                'ValueError parsing scraped content from campus parking page.')
            # Bare raise keeps the original message and traceback.
            raise

        except AttributeError:
            # HTML doesn't include expected elements
            logging.error(
                'AttributeError parsing scraped content from campus parking page.'
            )
            raise

        except TypeError:
            # Html is probably None
            logging.error(
                'TypeError parsing scraped content from campus parking page.')
            raise

        except IndexError:
            # A row has fewer cells than expected
            logging.error(
                'IndexError parsing scraped content from campus parking page.')
            raise

        return results

Ejemplo n.º 2

0

Mostrar archivo

Archivo: cityparking.py Proyecto: pliu6/madison-transit-api

    def parse_special_events_html(self, special_events_html):
        """Turn the city special-events calendar markup into a dict.

        Every ``<tr>`` of the ``calendar`` table after the two header rows
        yields one entry. Parse failures are logged and swallowed, leaving an
        empty event list, so callers always receive a usable dict.

        Args:
            special_events_html: Markup of the special-events page; a falsy
                value short-circuits to an empty result.

        Returns:
            ``{'specialEvents': [...]}`` where each item carries the parking
            location, venue, event name, the three datetimes produced by
            ``self.parse_special_event_datetimes`` and the page URL taken from
            ``self.parking_data['special_events_url']``.
        """
        parsed = {'specialEvents': []}

        # Nothing was scraped, so there is nothing to parse.
        if not special_events_html:
            return parsed

        try:
            calendar_table = BeautifulSoup(special_events_html).find(
                'table', {'id': 'calendar'})

            # Skip the two header rows at the top of the calendar table.
            for event_row in calendar_table.findAll('tr')[2:]:
                cells = event_row.findAll('td')

                location = cells[1].string
                venue = cells[4].string
                name = cells[3].string

                # The helper's result is unpacked as
                # (event time, parking end, parking start).
                datetimes = self.parse_special_event_datetimes(cells)
                event_time, parking_end_time, parking_start_time = datetimes

                parsed['specialEvents'].append({
                    'parkingLocation': location,
                    'eventVenue': venue,
                    'eventDatetime': event_time,
                    'eventName': name,
                    'parkingStartDatetime': parking_start_time,
                    'parkingEndDatetime': parking_end_time,
                    'webUrl': self.parking_data['special_events_url'],
                })

        except (ValueError, AttributeError, TypeError,
                IndexError) as parse_error:
            # Deliberately swallowed: availability data is still useful
            # without the events, so drop whatever was collected and move on.
            logging.error(
                'Error parsing scraped content from city special events page.'
                + str(parse_error))
            parsed['specialEvents'] = []

        return parsed

Ejemplo n.º 3

0

Mostrar archivo

Archivo: campusparking.py Proyecto: gtracy/madison-transit-api

    def parse_availability_html(self, campusparking_avail_html):
        """Parse the campus parking availability page into a list of lots.

        Finds the table with id
        ``ctl00_ctl00_central_block_right_navi_cnt_gvName`` and reads one lot
        from every ``<tr>`` after the header row. The first space-delimited
        token of the second cell becomes the short name and the third cell
        supplies the open-spot count.

        Args:
            campusparking_avail_html: Markup of the availability page.

        Returns:
            A list of ``{"shortName": ..., "openSpots": int}`` dicts, one per
            data row, in page order.

        Raises:
            ValueError, AttributeError, TypeError, IndexError: The original
                exception is logged and re-raised unchanged when the markup
                cannot be read as expected.
        """
        results = []
        # NOTE(review): lot_spots is initialised once, outside the loop, so a
        # row whose count cell is not numeric reuses the previous row's count
        # (or reaches int(None) -> TypeError when it is the first row). That
        # behaviour is preserved here; confirm whether it is intended.
        lot_spots = None

        try:
            campus_lot_soup = BeautifulSoup(campusparking_avail_html)
            # All rows of the availability table; find() returns None when
            # the table is missing, which surfaces as AttributeError below.
            lot_rows = campus_lot_soup.find("table", {"id": "ctl00_ctl00_central_block_right_navi_cnt_gvName"}).findAll(
                "tr"
            )

            # Start with the 2nd row (the first one is the header row).
            for lot_row in lot_rows[1:]:
                table_cells = lot_row.findAll("td")

                short_name = table_cells[1].string.split(" ")[0].strip()

                # strip() always returns a string, so no None check is needed.
                spots_cell = table_cells[2].string.strip()
                if spots_cell.isdigit():
                    lot_spots = spots_cell

                results.append({"shortName": short_name, "openSpots": int(lot_spots)})

            logging.debug(json.dumps(results))

        except ValueError:
            # Cannot parse html perhaps due to html change.
            logging.error("ValueError parsing scraped content from campus parking page.")
            # Bare raise keeps the original message and traceback.
            raise

        except AttributeError:
            # HTML doesn't include expected elements
            logging.error("AttributeError parsing scraped content from campus parking page.")
            raise

        except TypeError:
            # Html is probably None
            logging.error("TypeError parsing scraped content from campus parking page.")
            raise

        except IndexError:
            # A row has fewer cells than expected
            logging.error("IndexError parsing scraped content from campus parking page.")
            raise

        return results

Ejemplo n.º 4

0

Mostrar archivo

Archivo: cityparking.py Proyecto: pliu6/madison-transit-api

    def parse_availability_html(self, availability_html):
        """Parse the city parking availability page into a list of lots.

        Reads every ``<div>`` inside ``<div id="availability">`` whose class
        matches ``^dataRow``. For each such row the lot name comes from
        ``row.div.a.string`` and the open-spot count from the last child whose
        ``.string`` is all digits.

        Args:
            availability_html: Markup of the availability page.

        Returns:
            A list of ``{'name': ..., 'openSpots': int}`` dicts, one per data
            row, in page order.

        Raises:
            ValueError: No data rows were found, or a value could not be
                converted.
            AttributeError, TypeError: The original exception is logged and
                re-raised unchanged when the markup cannot be read as
                expected.
        """
        results = []
        # NOTE(review): lot_spots is initialised once, outside the loop, so a
        # row without any numeric child reuses the previous row's count (or
        # reaches int(None) -> TypeError when it is the first row). That
        # behaviour is preserved here; confirm whether it is intended.
        lot_spots = None

        try:
            city_lot_soup = BeautifulSoup(availability_html)
            # get all children of the availability div whose class name starts with dataRow
            lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
                .findAll('div', {'class': re.compile('^dataRow')})

            if not lot_rows:  # if we find no rows, we're dead
                raise ValueError('no dataRow elements found in availability div')

            for row in lot_rows:
                # The last all-digit child of the row wins.
                for detail in row:
                    if detail.string is not None and detail.string.isdigit():
                        lot_spots = detail.string

                results.append({
                    'name': row.div.a.string,
                    'openSpots': int(lot_spots)
                })

            logging.debug(json.dumps(results))

        except ValueError:
            # Cannot parse html perhaps due to html change.
            logging.error(
                'ValueError parsing scraped content from city parking page.')
            # Bare raise keeps the original message and traceback.
            raise

        except AttributeError:
            # HTML doesn't include expected elements
            logging.error(
                'AttributeError parsing scraped content from city parking page.'
            )
            raise

        except TypeError:
            # Html is probably None
            logging.error(
                'TypeError parsing scraped content from city parking page.')
            raise

        return results

Ejemplo n.º 5

0

Mostrar archivo

Archivo: cityparking.py Proyecto: chris-skud/madison-transit-api

    def parse_availability_html(self, availability_html):
        """Parse the city parking availability page into a list of lots.

        Reads every ``<div>`` inside ``<div id="availability">`` whose class
        matches ``^dataRow``. For each such row the lot name comes from
        ``row.div.a.string`` and the open-spot count from the last child whose
        ``.string`` is all digits.

        Args:
            availability_html: Markup of the availability page.

        Returns:
            A list of ``{'name': ..., 'openSpots': int}`` dicts, one per data
            row, in page order.

        Raises:
            ValueError: No data rows were found, or a value could not be
                converted.
            AttributeError, TypeError: The original exception is logged and
                re-raised unchanged when the markup cannot be read as
                expected.
        """
        results = []
        # NOTE(review): lot_spots is initialised once, outside the loop, so a
        # row without any numeric child reuses the previous row's count (or
        # reaches int(None) -> TypeError when it is the first row). That
        # behaviour is preserved here; confirm whether it is intended.
        lot_spots = None

        try:
            city_lot_soup = BeautifulSoup(availability_html)
            # get all children of the availability div whose class name starts with dataRow
            lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
                .findAll('div', {'class': re.compile('^dataRow')})

            if not lot_rows: # if we find no rows, we're dead
                raise ValueError('no dataRow elements found in availability div')

            for row in lot_rows:
                # The last all-digit child of the row wins.
                for detail in row:
                    if detail.string is not None and detail.string.isdigit():
                        lot_spots = detail.string

                results.append({
                    'name': row.div.a.string,
                    'openSpots': int(lot_spots)
                })

            logging.debug(json.dumps(results))

        except ValueError:
            # Cannot parse html perhaps due to html change.
            logging.error('ValueError parsing scraped content from city parking page.')
            # Bare raise keeps the original message and traceback.
            raise

        except AttributeError:
            # HTML doesn't include expected elements
            logging.error('AttributeError parsing scraped content from city parking page.')
            raise

        except TypeError:
            # Html is probably None
            logging.error('TypeError parsing scraped content from city parking page.')
            raise

        return results

Ejemplo n.º 6

0

Mostrar archivo

Archivo: cityparking.py Proyecto: chris-skud/madison-transit-api

    def parse_special_events_html(self, special_events_html):
        """Turn the city special-events calendar markup into a dict.

        Every ``<tr>`` of the ``calendar`` table after the two header rows
        yields one entry. Parse failures are logged and swallowed, leaving an
        empty event list, so callers always receive a usable dict.

        Args:
            special_events_html: Markup of the special-events page; a falsy
                value short-circuits to an empty result.

        Returns:
            ``{'specialEvents': [...]}`` where each item carries the parking
            location, venue, event name, the three datetimes produced by
            ``self.parse_special_event_datetimes`` and the page URL taken from
            ``self.parking_data['special_events_url']``.
        """
        parsed = {'specialEvents': []}

        # Nothing was scraped, so there is nothing to parse.
        if not special_events_html:
            return parsed

        try:
            calendar_table = BeautifulSoup(special_events_html).find('table', {'id': 'calendar'})

            # Skip the two header rows at the top of the calendar table.
            for event_row in calendar_table.findAll('tr')[2:]:
                cells = event_row.findAll('td')

                location = cells[1].string
                venue = cells[4].string
                name = cells[3].string

                # The helper's result is unpacked as
                # (event time, parking end, parking start).
                datetimes = self.parse_special_event_datetimes(cells)
                event_time, parking_end_time, parking_start_time = datetimes

                entry = {
                    'parkingLocation': location,
                    'eventVenue': venue,
                    'eventDatetime': event_time,
                    'eventName': name,
                    'parkingStartDatetime': parking_start_time,
                    'parkingEndDatetime': parking_end_time,
                    'webUrl': self.parking_data['special_events_url']
                }
                parsed['specialEvents'].append(entry)

        except (ValueError, AttributeError, TypeError, IndexError) as parse_error:
            # Deliberately swallowed: availability data is still useful
            # without the events, so drop whatever was collected and move on.
            logging.error('Error parsing scraped content from city special events page.' + str(parse_error))
            parsed['specialEvents'] = []

        return parsed

Ejemplo n.º 7

0

Mostrar archivo

Archivo: getparking.py Proyecto: chris-skud/madison-transit-api

def getParkingSpecialEvents():
    """Scrape the City of Madison parking special-events calendar.

    Fetches the calendar page, retrying up to three times when
    ``urlfetch.DownloadError`` is raised, and reads one event from every row
    of the ``calendar`` table after its two header rows. The collected data
    is JSON-encoded and stored on the fetch result's ``content`` attribute.

    Returns:
        The ``urlfetch`` result whose ``content`` has been replaced by a JSON
        document with the keys ``CacheUntil``, ``LastScraped`` and
        ``ParkingSpecialEvents``; ``None`` when every fetch attempt failed or
        the page could not be parsed.
    """
    loop = 0
    done = False
    result = None
    specialeventsurl = 'http://www.cityofmadison.com/parkingUtility/calendar/index.cfm'
    cachehours = 24
    iso_format = '%Y-%m-%dT%H:%M:%S'
    # Date and time as they appear once concatenated without whitespace.
    scraped_format = '%m/%d/%Y%I:%M%p'

    # initialize the dict to hold result of scrape.
    specialevents = dict()
    cache_until = api_utils.getLocalDatetime() + datetime.timedelta(hours=+cachehours)
    specialevents['CacheUntil'] = cache_until.strftime(iso_format)
    logging.info(specialevents['CacheUntil'])
    specialevents['ParkingSpecialEvents'] = []
    specialevents['LastScraped'] = api_utils.getLocalDatetime().strftime(iso_format)

    # Looping in case fetch flaky.
    while not done and loop < 3:
        try:
            # grab the city parking html page
            result = urlfetch.fetch(specialeventsurl)

            # invoke soup to parse html
            soup = BeautifulSoup(result.content)

            # find the calendar table containing special event info.
            # returns array of <tr>'s.
            special_event_rows = soup.find("table", {"id": "calendar"}).findAll('tr')

            # loop table rows, starting with 3rd row (excludes 2 header rows)
            for event_row in special_event_rows[2:]:
                table_cells = event_row.findAll('td')

                date_text = table_cells[0].string
                parkinglocation = table_cells[1].string
                eventvenue = table_cells[4].string
                event = table_cells[3].string

                # take the event time string, create a datetime obj, then
                # convert back to the output string format
                event_time_text = api_utils.get_time_from_text(table_cells[5].string).replace(' ', '')
                eventtimeobj = datetime.datetime.strptime(date_text + event_time_text, scraped_format)
                eventtime = eventtimeobj.strftime(iso_format)

                # split '00:00 pm - 00:00 pm' into start and end strings
                timeparts = table_cells[2].string.split(' - ')

                # clean up whitespace to avoid errors due to inconsistent format
                start_text = timeparts[0].replace(' ', '')
                end_text = timeparts[1].replace(' ', '')

                parkingstarttimeobj = datetime.datetime.strptime(date_text + start_text, scraped_format)
                parkingstarttime = parkingstarttimeobj.strftime(iso_format)

                # Format the END object here; the previous code formatted the
                # start object, so ParkingEndTime always equalled
                # ParkingStartTime.
                parkingendtimeobj = datetime.datetime.strptime(date_text + end_text, scraped_format)
                parkingendtime = parkingendtimeobj.strftime(iso_format)

                # add this special event info to the ParkingSpecialEvents collection
                specialevents['ParkingSpecialEvents'].append({
                    "ParkingLocation": parkinglocation,
                    "EventVenue": eventvenue,
                    "EventTime": eventtime,
                    "Event": event,
                    "ParkingStartTime": parkingstarttime,
                    "ParkingEndTime": parkingendtime,
                })

            # setting content var to keep contract with caller exactly intact (for now).
            result.content = json.dumps(specialevents)

            done = True

        # problem hitting url, try a few times
        except urlfetch.DownloadError:
            logging.error("Error loading page (%s)... sleeping" % loop)
            if result:
                logging.debug("Error status: %s" % result.status_code)
                logging.debug("Error header: %s" % result.headers)
                logging.debug("Error content: %s" % result.content)
            time.sleep(6)
            loop = loop + 1

        # This is bad. Some data may be in a different format due to
        # either unexpected data entry or *gulp* site redesign.
        # Likely require code change to fix.
        # A missing table, missing cell or blank cell shows up as
        # AttributeError / IndexError / TypeError rather than ValueError, so
        # all four are treated as the same "cannot parse" outcome.
        except (ValueError, AttributeError, IndexError, TypeError):
            logging.error("Error parsing scraped content from (%s)... exiting getParkingSpecialEvents()" % specialeventsurl)
            done = True
            result = None
    return result