def scrape(self):
    """Scrape Boston meeting results and yield an Event per table row.

    Reads the SIRE "meetresults" page, parses each Verdana-styled row into
    (name, date, links) cells, and yields an Event with the page as source
    and one link per anchor in the links cell.
    """
    url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"
    page = self.lxmlize(url)
    for entry in page.xpath(
            "//tr[@style='font-family: Verdana; font-size: 12px;']"):
        name, when, links = entry.xpath(".//td")
        # Strip the mojibake non-breaking space ("\xc2\xa0") the page emits.
        name = name.text.strip().replace(u"\xc2\xa0", "")
        when = when.text.strip().replace(u"\xc2\xa0", "")
        when = dt.datetime.strptime(when, "%m/%d/%Y")
        links = links.xpath(".//a")
        links = {x.text: x.attrib['href'] for x in links}
        e = Event(name=name, when=when, location='unknown')
        e.add_source(url)
        # BUG FIX: the loop variable was named `url`, clobbering the page URL
        # so add_source() on every subsequent entry recorded a stale link URL.
        for note, link_url in links.items():
            e.add_link(note=note, url=link_url)
        yield e
def scrape(self):
    """Scrape Boston meeting results and yield an Event per table row.

    Reads the SIRE "meetresults" page, parses each Verdana-styled row into
    (name, date, links) cells, and yields an Event with the page as source
    and one link per anchor in the links cell.
    """
    url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"
    page = self.lxmlize(url)
    for entry in page.xpath(
            "//tr[@style='font-family: Verdana; font-size: 12px;']"):
        name, when, links = entry.xpath(".//td")
        # Strip the mojibake non-breaking space ("\xc2\xa0") the page emits.
        name = name.text.strip().replace(u"\xc2\xa0", "")
        when = when.text.strip().replace(u"\xc2\xa0", "")
        when = dt.datetime.strptime(when, "%m/%d/%Y")
        anchors = links.xpath(".//a")
        link_map = {a.text: a.attrib['href'] for a in anchors}
        e = Event(name=name, when=when, location='unknown')
        e.add_source(url)
        # BUG FIX: the loop variable was named `url`, clobbering the page URL
        # so add_source() on every subsequent entry recorded a stale link URL.
        for note, href in link_map.items():
            e.add_link(note=note, url=href)
        yield e
def scrape(self):
    """Scrape the Philadelphia Legistar calendar and yield Event objects.

    Rows with no records or a "Deferred" time are skipped. Rows with an
    empty time are treated as date-only events (midnight timestamp).
    Agenda and minutes anchors become event links.
    """
    url = "http://phila.legistar.com/Calendar.aspx/"
    page = self.lxmlize(url)
    main = page.xpath("//table[@class='rgMasterTable']")[0]
    rows = main.xpath(".//tr")[1:]
    for row in rows:
        if "No records were found." in row.text_content():
            self.warning("Hum. They don't seem to have events?")
            continue
        (name, date, _, time, where, agenda,
         minutes) = row.xpath(".//td")
        # _ nom's the image next to the date on the page.
        name = name.text_content().strip()  # leaving an href on the table
        time = time.text_content().strip()
        location = where.text_content().strip()
        if "Deferred" in time:
            continue
        # FIX: removed dead local `all_day` — it was set but never used.
        if time == "":
            # Date-only row: parse just the date.
            when = dt.datetime.strptime(date.text.strip(), "%m/%d/%Y")
        else:
            when = dt.datetime.strptime(
                "%s %s" % (date.text.strip(), time), "%m/%d/%Y %I:%M %p")
        event = Event(name=name, when=when, location=location)
        event.add_source(url)
        agendas = agenda.xpath(".//a[@href]")
        for a in agendas:
            event.add_link(a.text, a.attrib['href'])
        minutes = minutes.xpath(".//a[@href]")
        for minute in minutes:
            event.add_link(minute.text, minute.attrib['href'])
        yield event
def scrape(self):
    """Scrape Arlington archived and upcoming meetings into Event objects.

    Walks both the '#archive' and '#upcoming' tables, maps each row's cells
    via self._organize_cells, and yields an Event per row. Agenda pages are
    fetched and mined for metaviewer.php document links; video/audio/minutes
    media links are attached for archived meetings only.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types.
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                # FIX: narrowed from bare `except:` so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                continue

            meeting_title = cell_mapping['title'].text
            # Date cell holds a Unix timestamp in a <span>.
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title, when=meeting_date,
                      location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # detect agenda url, if present
            meeting_agenda_url = None
            agenda_anchors = cell_mapping['agenda'].cssselect('a')
            if len(agenda_anchors) > 0:
                meeting_agenda_url = agenda_anchors[0].attrib.get('href')

            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(
                    meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text, url=link_url,
                                           mimetype='application/pdf')

            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue

            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                video_url_match = re.search(
                    r"http://(.*?)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    # BUG FIX: group(0) includes the trailing apostrophe the
                    # pattern matched; rebuild the URL from group(1) instead.
                    e.add_media_link(
                        name="Video",
                        url="http://" + video_url_match.group(1),
                        mimetype='text/html')

            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')

            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get('href', ''),
                                 mimetype='text/html')

            yield e
def scrape(self):
    """Scrape Arlington archived and upcoming meetings into Event objects.

    Walks both the '#archive' and '#upcoming' tables, maps each row's cells
    via self._organize_cells, and yields an Event per row. Agenda pages are
    fetched and mined for metaviewer.php document links; video/audio/minutes
    media links are attached for archived meetings only.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types.
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                # FIX: narrowed from bare `except:` so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                continue

            meeting_title = cell_mapping['title'].text
            # Date cell holds a Unix timestamp in a <span>.
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title, when=meeting_date,
                      location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # detect agenda url, if present
            meeting_agenda_url = None
            if len(cell_mapping['agenda'].cssselect('a')) > 0:
                meeting_agenda_url = cell_mapping['agenda'].cssselect(
                    'a')[0].attrib.get('href')

            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(
                    meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text, url=link_url,
                                           mimetype='application/pdf')

            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue

            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                video_url_match = re.search(
                    r"http://(.*?)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    # BUG FIX: group(0) includes the trailing apostrophe the
                    # pattern matched; rebuild the URL from group(1) instead.
                    e.add_media_link(
                        name="Video",
                        url="http://" + video_url_match.group(1),
                        mimetype='text/html')

            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')

            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get(
                                     'href', ''), mimetype='text/html')

            yield e