def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into a list of lot dicts.

    Args:
        campusparking_avail_html: raw HTML of the campus parking page.

    Returns:
        list of dicts, one per lot: {'shortName': str, 'openSpots': int}.

    Raises:
        ValueError, AttributeError, TypeError, IndexError: when the HTML is
        None, malformed, or its structure has changed since this was written.
    """
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # Grab every <tr> of the availability table (header row included).
        lot_rows = campus_lot_soup.find(
            'table',
            {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
        # loop table rows, starting with 2nd row (excludes header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll('td')
            # Lot short name is the first whitespace-delimited word of cell 2.
            short_name = table_cells[1].string.split(' ')[0].strip()
            spots_cell = table_cells[2].string.strip()
            if spots_cell is not None and spots_cell.isdigit():
                lot_spots = spots_cell
            # NOTE(review): when spots_cell is not a digit, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'shortName': short_name,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from campus parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_special_events_html(self, special_events_html):
    """Parse the city special-events calendar page.

    Args:
        special_events_html: raw HTML of the city special-events page,
            or a falsy value when the fetch failed.

    Returns:
        dict with a 'specialEvents' list; the list is empty when the input
        is falsy or the page cannot be parsed (parse errors are swallowed).
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # Every <tr> of the calendar table; the first two are header rows.
        calendar_rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        for event_row in calendar_rows[2:]:
            cells = event_row.findAll('td')
            event_dt, parking_end_dt, parking_start_dt = \
                self.parse_special_event_datetimes(cells)
            entry = {
                'parkingLocation': cells[1].string,
                'eventVenue': cells[4].string,
                'eventDatetime': event_dt,
                'eventName': cells[3].string,
                'parkingStartDatetime': parking_start_dt,
                'parkingEndDatetime': parking_end_dt,
                'webUrl': self.parking_data['special_events_url']
            }
            special_events['specialEvents'].append(entry)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Unlike availability, we eat this error: availability data is still
        # useful without the events, so return an empty event list instead.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events
def parse_availability_html(self, campusparking_avail_html):
    """Parse the campus parking availability page into a list of lot dicts.

    Args:
        campusparking_avail_html: raw HTML of the campus parking page.

    Returns:
        list of dicts, one per lot: {"shortName": str, "openSpots": int}.

    Raises:
        ValueError, AttributeError, TypeError, IndexError: when the HTML is
        None, malformed, or its structure has changed since this was written.
    """
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # Grab every <tr> of the availability table (header row included).
        lot_rows = campus_lot_soup.find(
            "table", {"id": "ctl00_ctl00_central_block_right_navi_cnt_gvName"}
        ).findAll("tr")
        # loop table rows, starting with 2nd row (excludes header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll("td")
            # Lot short name is the first whitespace-delimited word of cell 2.
            short_name = table_cells[1].string.split(" ")[0].strip()
            spots_cell = table_cells[2].string.strip()
            if spots_cell is not None and spots_cell.isdigit():
                lot_spots = spots_cell
            # NOTE(review): when spots_cell is not a digit, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {"shortName": short_name, "openSpots": int(lot_spots)}
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error("%s parsing scraped content from campus parking page."
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into a list of lot dicts.

    Args:
        availability_html: raw HTML of the city parking page.

    Returns:
        list of dicts, one per lot: {'name': str, 'openSpots': int}.

    Raises:
        ValueError: when no dataRow divs are found (likely a site redesign).
        AttributeError, TypeError: when the HTML is None or missing the
            expected elements.
    """
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # if we find no rows, we're dead
            raise ValueError
        for row in lot_rows:
            # The first all-digit child of the row is the open-spot count.
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            # NOTE(review): when a row has no digit child, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from city parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_availability_html(self, availability_html):
    """Parse the city parking availability page into a list of lot dicts.

    Args:
        availability_html: raw HTML of the city parking page.

    Returns:
        list of dicts, one per lot: {'name': str, 'openSpots': int}.

    Raises:
        ValueError: when no dataRow divs are found (likely a site redesign).
        AttributeError, TypeError: when the HTML is None or missing the
            expected elements.
    """
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # if we find no rows, we're dead
            raise ValueError
        for row in lot_rows:
            # The first all-digit child of the row is the open-spot count.
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            # NOTE(review): when a row has no digit child, lot_spots keeps the
            # previous row's value (or None on the first row, which raises
            # TypeError below) — confirm that carry-over is intended.
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except (ValueError, AttributeError, TypeError) as e:
        # Cannot parse html — perhaps due to an html change, missing elements,
        # or a None page. Callers still see the same exception types as before.
        logging.error('%s parsing scraped content from city parking page.'
                      % type(e).__name__)
        # Bare raise preserves the original exception instance and traceback;
        # the previous `raise ValueError` style re-raised an empty class,
        # discarding both.
        raise
    return results
def parse_special_events_html(self, special_events_html):
    """Scrape the city special-events calendar into a specialEvents dict.

    Args:
        special_events_html: raw HTML of the city special-events page,
            or a falsy value when the fetch failed.

    Returns:
        dict with a 'specialEvents' list; empty when the input is falsy or
        the page cannot be parsed (parse errors are logged and swallowed).
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # All <tr>'s of the calendar table; rows 0-1 are headers.
        rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        for row in rows[2:]:
            cells = row.findAll('td')
            event_dt, parking_end_dt, parking_start_dt = \
                self.parse_special_event_datetimes(cells)
            special_events['specialEvents'].append({
                'parkingLocation': cells[1].string,
                'eventVenue': cells[4].string,
                'eventDatetime': event_dt,
                'eventName': cells[3].string,
                'parkingStartDatetime': parking_start_dt,
                'parkingEndDatetime': parking_end_dt,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # Unlike availability, we eat this error: availability data is still
        # useful without the events, so return an empty event list instead.
        logging.error(
            'Error parsing scraped content from city special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events
def getParkingSpecialEvents():
    """Scrape the city special-events calendar page into JSON.

    Fetches the calendar page (retrying up to 3 times on download errors),
    parses each event row, and returns the urlfetch result object with its
    .content replaced by a JSON dump of the scraped events. Returns None
    when the page content cannot be parsed.
    """
    loop = 0
    done = False
    result = None
    specialeventsurl = 'http://www.cityofmadison.com/parkingUtility/calendar/index.cfm'
    cachehours = 24
    # initialize the dict to hold result of scrape.
    specialevents = dict()
    specialevents['CacheUntil'] = datetime.datetime.strftime(
        api_utils.getLocalDatetime() + datetime.timedelta(hours=+cachehours),
        '%Y-%m-%dT%H:%M:%S')
    logging.info(specialevents['CacheUntil'])
    specialevents['ParkingSpecialEvents'] = []
    specialevents['LastScraped'] = datetime.datetime.strftime(
        api_utils.getLocalDatetime(), '%Y-%m-%dT%H:%M:%S')
    # Looping in case fetch flaky.
    while not done and loop < 3:
        try:
            # grab the city parking html page - what an awesome API!!! :(
            result = urlfetch.fetch(specialeventsurl)
            # invoke soup to parse html
            soup = BeautifulSoup(result.content)
            # find the calendar table containing special event info.
            # returns array of <tr>'s.
            special_event_rows = soup.find("table", {"id": "calendar"}).findAll('tr')
            # loop table rows, starting with 3rd row (excludes 2 header rows)
            for row_index in range(2, len(special_event_rows)):
                # grab the array of cells in the current row
                table_cells = special_event_rows[row_index].findAll('td')
                parkinglocation = table_cells[1].string
                eventvenue = table_cells[4].string
                event = table_cells[3].string
                # take the event time strings (already central time), create
                # datetime obj, then convert back to correct string
                eventtimeobj = datetime.datetime.strptime(
                    table_cells[0].string +
                    api_utils.get_time_from_text(table_cells[5].string).replace(' ', ''),
                    '%m/%d/%Y%I:%M%p')
                eventtime = datetime.datetime.strftime(eventtimeobj, '%Y-%m-%dT%H:%M:%S')
                # split '00:00 pm - 00:00 pm' into start and end strings
                timeparts = table_cells[2].string.split(' - ')
                # clean up whitespace to avoid errors due to inconsistent format
                timeparts[0] = timeparts[0].replace(' ', '')
                timeparts[1] = timeparts[1].replace(' ', '')
                parkingstarttimeobj = datetime.datetime.strptime(
                    table_cells[0].string + timeparts[0], '%m/%d/%Y%I:%M%p')
                parkingstarttime = datetime.datetime.strftime(
                    parkingstarttimeobj, '%Y-%m-%dT%H:%M:%S')
                parkingendtimeobj = datetime.datetime.strptime(
                    table_cells[0].string + timeparts[1], '%m/%d/%Y%I:%M%p')
                # BUG FIX: this previously formatted parkingstarttimeobj, so
                # every event's ParkingEndTime silently duplicated its start.
                parkingendtime = datetime.datetime.strftime(
                    parkingendtimeobj, '%Y-%m-%dT%H:%M:%S')
                # add this special event info to the ParkingSpecialEvents collection
                specialevents['ParkingSpecialEvents'].append(
                    {"ParkingLocation": parkinglocation,
                     "EventVenue": eventvenue,
                     "EventTime": eventtime,
                     "Event": event,
                     "ParkingStartTime": parkingstarttime,
                     "ParkingEndTime": parkingendtime})
            # setting content var to keep contract with caller exactly in-tact (for now).
            result.content = json.dumps(specialevents)
            done = True
        # problem hitting url, try a few times
        except urlfetch.DownloadError:
            logging.error("Error loading page (%s)... sleeping" % loop)
            if result:
                logging.debug("Error status: %s" % result.status_code)
                logging.debug("Error header: %s" % result.headers)
                logging.debug("Error content: %s" % result.content)
            time.sleep(6)
            loop = loop + 1
        # This is bad. Some data may be in a different format due to
        # either unexpected data entry or *gulp* site redesign.
        # Likely require code change to fix.
        except ValueError:
            logging.error("Error parsing scraped content from (%s)... exiting getParkingSpecialEvents()" % specialeventsurl)
            done = True
            result = None
    return result
def post(self):
    """Task handler: crawl one bus-route listing page.

    Reads 'crawl' (URL), 'direction', and 'routeID' from the task request,
    fetches the page (up to 3 attempts), and walks its "ada" anchors:
      * titles containing '#' are stop pages — a RouteListing row is added
        for the (route, direction, stopID) triple if not already present;
      * other hrefs containing '?r=' spawn follow-up crawl tasks;
      * all-alpha titles are persisted as DestinationListing labels.
    Returns nothing; all results are side effects (datastore writes, tasks).
    """
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL, direction, routeID))
        loop = 0
        done = False
        result = None
        #start = quota.get_request_cpu_usage()
        # Retry loop: urlfetch against the transit site is flaky.
        while not done and loop < 3:
            try:
                # fetch the page
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1
        #end = quota.get_request_cpu_usage()
        #logging.info("scraping took %s cycles" % (end-start))
        # start to interrogate the results
        # NOTE(review): if all 3 fetch attempts failed, result is still None
        # here and result.content raises AttributeError — confirm intended.
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # route crawler looks for titles with an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for
                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]
                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))
                    # check for conflicts...
                    stop = db.GqlQuery(
                        "SELECT * FROM StopLocation WHERE stopID = :1",
                        stopID).get()
                    if stop is None:
                        # Stop locations are expected to be pre-loaded.
                        logging.error(
                            "Missing stop %s which should be impossible" % stopID)
                    # pull the route and direction data from the URL
                    # (query string is assumed to be ?r=<route>&d=<direction>...)
                    routeData = scrapeURL.split('?')[1]
                    logging.info(
                        "FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href
                    # check for conflicts...
                    r = db.GqlQuery(
                        "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                        routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info(
                            "added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                #else: # title.split(",")[0].isdigit():
                else:
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        # '00' is the sentinel routeID of the top-level page;
                        # derive the real route number from the link itself.
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        # NOTE(review): this elif only runs when routeID was
                        # not '00'; if '00' hrefs can carry '&' arguments the
                        # branch above keeps them in routeID — confirm.
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={
                                        'crawl': crawlURL,
                                        'direction': title,
                                        'routeID': routeID
                                    })
                        task.add('crawler')
                        logging.info(
                            "Added new task for %s, direction %s, route %s" % (title.split(",")[0], title, routeID))
                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        logging.info(
                            "found the route LABEL page! href: %s" % href)
                        routeData = href.split('?')[1]
                        routeArgs = routeData.split('&')
                        directionID = routeArgs[1].split('=')[1]
                        # get_or_insert keyed on title makes this idempotent.
                        l = DestinationListing.get_or_insert(
                            title, id=directionID, label=title)
    except apiproxy_errors.DeadlineExceededError:
        # App Engine killed the request; the task queue will retry it.
        logging.error("DeadlineExceededError exception!?")
        return
    return
def parse_special_events_html(self, special_events_html):
    """Parse the campus special-events page into a specialEvents dict.

    Each event-item table is expected to have three rows: a header row
    ("<date>: <name>"), a time row, and a lots row. Parse errors are logged
    and swallowed — an empty specialEvents list is returned instead.
    """
    special_events = dict()
    special_events["specialEvents"] = []
    if not special_events_html:
        # Nothing fetched: return the empty structure rather than failing.
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # One <table class="event-item"> per event.
        event_tables = soup.findAll("table", {"class": "event-item"})
        for event_table in event_tables:
            rows = event_table.findAll("tr")
            event_time = ""
            event_date = ""
            lot_num_array = []
            event_name = ""
            # Rows are positional: 0 = header, 1 = time, 2 = lots.
            for row_index in range(0, 3):
                if row_index == 0:
                    # we're on the header row: "<m/d/Y>: <event name>"
                    header_content = rows[row_index].find("th").string
                    header_array = header_content.split(":")
                    event_date = header_array[0]
                    event_name = header_array[1].replace(" ", "")
                elif row_index == 1:
                    # time row, e.g. "7:30 p.m." -> "7:30PM"
                    cells = rows[row_index].findAll("td")
                    cell_content = cells[1].string
                    event_time_arr = cell_content.split(" ", 2)
                    event_time = event_time_arr[0] + event_time_arr[1].replace(".", "").upper()
                elif row_index == 2:
                    # lots row: comma-separated lot numbers in the 2nd cell
                    cells = rows[row_index].findAll("td")
                    cell_content = cells[1].string
                    lot_num_array = cell_content.replace(" ", "").split(",")
            # strip leading 0's out of lot number array
            for index, item in enumerate(lot_num_array):
                lot_num_array[index] = self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = "0" + event_time
                event_datetime_tmp = event_date + " " + event_time
                event_datetime_str = datetime.datetime.strptime(event_datetime_tmp, "%m/%d/%Y %I:%M%p").strftime(
                    "%Y-%m-%dT%H:%M:%S"
                )
            except ValueError:
                # Keep the event with a null datetime rather than dropping it.
                logging.error("Error parsing campus special event date")
                event_datetime_str = None
            # Rather than exclude props not currently available via uw lots, going with "None"
            # This will manifest in the json as "property":null which should be easily detectable via client JS
            special_event = {
                "eventName": event_name,
                "parkingLocations": lot_num_array,
                "eventDatetime": event_datetime_str,
                "parkingStartDatetime": None,
                "parkingEndDatetime": None,
                "eventVenue": None,
                "webUrl": self.parking_data["special_events_url"],
            }
            special_events["specialEvents"].append(special_event)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we eat this error. availability is still useful w/out events
        logging.error("Error parsing scraped content from campus special events page." + str(e))
        special_events["specialEvents"] = []
    return special_events
def parse_special_events_html(self, special_events_html):
    """Parse the campus special-events page into a specialEvents dict.

    Each event-item table is expected to have three rows: a header row
    ("<date>: <name>"), a time row, and a lots row. Parse errors are logged
    and swallowed — an empty specialEvents list is returned instead.
    """
    special_events = {'specialEvents': []}
    if not special_events_html:
        # Nothing fetched: return the empty structure rather than failing.
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # One <table class="event-item"> per event; rows are positional.
        for event_table in soup.findAll('table', {'class': 'event-item'}):
            rows = event_table.findAll('tr')
            # Header row: "<m/d/Y>: <event name>"
            header_parts = rows[0].find('th').string.split(':')
            event_date = header_parts[0]
            event_name = header_parts[1].replace(' ', '')
            # Time row, e.g. "7:30 p.m." -> "7:30PM"
            time_cells = rows[1].findAll('td')
            time_pieces = time_cells[1].string.split(' ', 2)
            event_time = time_pieces[0] + time_pieces[1].replace('.', '').upper()
            # Lots row: comma-separated lot numbers in the 2nd cell.
            lot_cells = rows[2].findAll('td')
            lot_num_array = lot_cells[1].string.replace(' ', '').split(',')
            # strip leading 0's out of lot number array
            for index, item in enumerate(lot_num_array):
                lot_num_array[index] = self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = '0' + event_time
                event_datetime_str = datetime.datetime.strptime(
                    event_date + ' ' + event_time,
                    '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')
            except ValueError:
                # Keep the event with a null datetime rather than dropping it.
                logging.error('Error parsing campus special event date')
                event_datetime_str = None
            # Rather than exclude props not currently available via uw lots, going with "None"
            # This will manifest in the json as "property":null which should be easily detectable via client JS
            special_events['specialEvents'].append({
                'eventName': event_name,
                'parkingLocations': lot_num_array,
                'eventDatetime': event_datetime_str,
                'parkingStartDatetime': None,
                'parkingEndDatetime': None,
                'eventVenue': None,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we eat this error. availability is still useful w/out events
        logging.error(
            'Error parsing scraped content from campus special events page.'
            + str(e))
        special_events['specialEvents'] = []
    return special_events