def scrape_committee_agendas(self, chamber, session):
    """
    Scrape upper or lower committee agendas.

    Fetches the committee agenda listing for ``chamber``, walks the agenda
    table and saves one ``committee:meeting`` event per row whose time cell
    does not contain ``NOT MEETING`` or ``CANCELLED``.

    :param chamber: chamber key.  It is compared against ``'upper'`` to pick
        the table location and column layout, and is used to index
        ``self._chamber_short`` and ``self._chamber_long``.
    :param session: passed through unchanged as the first argument of
        ``Event``.
    :returns: None.  Each event is handed to ``self.save_event``.
    :raises IndexError: if the agenda table is not found at the expected
        XPath, or a row has no cell at the document-link position.
    :raises ValueError: if a row has too few cells to unpack, or its date
        does not match ``%m/%d/%Y``.
    """
    # could use &ShowAll=ON, but it doesn't seem to work
    url = ('http://www.azleg.gov/CommitteeAgendas.asp?Body=%s'
           % self._chamber_short[chamber])

    is_upper = chamber == 'upper'

    # The agenda table sits one table level deeper on the non-upper page.
    table_path = ('//table[@id="body"]/tr/td/table[2]/tr'
                  '/td/table/tr/td/table')
    if not is_upper:
        table_path += '/tr/td/table'

    # Column layouts:
    #   house:  Agenda Date, Committee, Revised, Addendum, Cancelled, Time,
    #           Room, HTML Document, PDF Document
    #   senate: Agenda Date, Committee, Revised, Cancelled, Time, Room,
    #           HTML Document, PDF Document
    # Room and the HTML document link directly follow the Time column, so a
    # single offset describes both layouts.
    time_col = 4 if is_upper else 5
    link_col = time_col + 2

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # Compiled once here rather than on every row.
    time_re = re.compile(r'(\d+:\d+ (A|P))')

    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        event_table = root.xpath(table_path)[0]

        # The first two rows are skipped (presumably header rows).
        for row in event_table.xpath('tr')[2:]:
            text = [cell.text_content().strip() for cell in row.xpath('td')]
            date_text, committee = text[0:2]
            time_text, room = text[time_col:link_col]
            # NOTE(review): string() yields '' when the cell has no <a href>;
            # that empty link is still passed on below -- confirm intended.
            link = row[link_col].xpath('string(a/@href)')

            if 'NOT MEETING' in time_text or 'CANCELLED' in time_text:
                continue

            time_match = time_re.match(time_text)
            if time_match:
                # The pattern stops after the 'A'/'P', so the 'M' is appended
                # here to form a meridiem that %p can parse.
                stamp = "%s %sM" % (date_text, time_match.group(0))
                when = datetime.datetime.strptime(stamp, '%m/%d/%Y %I:%M %p')
            else:
                # No recognisable clock time: fall back to the date alone.
                when = datetime.datetime.strptime(date_text, '%m/%d/%Y')
            when = self._tz.localize(when)

            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)

            # parse_agenda returns four values; only the first two are used.
            (description, member_list,
             _meeting_type, _other) = self.parse_agenda(chamber, link)

            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description)
            event.add_participant('committee', committee)
            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
def scrape_committee_upcoming(self, session, chamber):
    """
    Scrape the upcoming committee meetings RSS feed for one chamber.

    Each feed entry is expected to have a title of the form
    ``<committee> - <mm/dd/yyyy>`` and a description that starts with
    ``Time: <h:mm AM|PM>`` and contains ``Location: <place>``.  One
    ``committee:meeting`` event is saved per well-formed entry.  Entries
    that do not fit this shape are logged and skipped, so a single broken
    entry does not abort the whole scrape.

    :param session: passed through unchanged as the first argument of
        ``Event``.
    :param chamber: ``'upper'`` or ``'lower'``; mapped to the ``senate`` /
        ``house`` suffix of the feed URL.
    :returns: None.  Each event is handed to ``self.save_event``.
    :raises KeyError: if ``chamber`` is neither ``'upper'`` nor ``'lower'``.
    """
    # Imported locally because the file-level import block is outside this
    # block.
    import logging

    logger = logging.getLogger(__name__)

    chamber_name = {'upper': 'senate', 'lower': 'house'}[chamber]
    url = ("http://www.capitol.state.tx.us/MyTLO/RSS/RSS.aspx?"
           "Type=upcomingmeetings%s" % chamber_name)

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # Compiled once here rather than on every entry.
    time_re = re.compile(r'Time: (\d+:\d+ (A|P)M)')

    with self.urlopen(url) as page:
        feed = feedparser.parse(page)

        for entry in feed['entries']:
            # Split on the LAST ' - ' so a committee name that itself
            # contains ' - ' still leaves the date as the final piece.
            title_parts = entry['title'].rsplit(' - ', 1)
            if len(title_parts) != 2:
                logger.warning("Skipping feed entry with unexpected title "
                               "%r", entry['title'])
                continue
            title, date = title_parts

            time_match = time_re.match(entry['description'])
            if time_match is None:
                logger.warning("Skipping feed entry %r: description has no "
                               "leading 'Time:' stamp", entry['title'])
                continue

            location_parts = entry['description'].split('Location: ')
            if len(location_parts) < 2:
                logger.warning("Skipping feed entry %r: description has no "
                               "'Location: ' marker", entry['title'])
                continue
            location = location_parts[1]

            stamp = "%s %s" % (date, time_match.group(1))
            try:
                when = datetime.datetime.strptime(stamp, '%m/%d/%Y %I:%M %p')
            except ValueError:
                logger.warning("Skipping feed entry %r: cannot parse "
                               "date/time %r", entry['title'], stamp)
                continue
            when = self._tz.localize(when)

            description = ('Committee Meeting\n'
                           + entry['title'] + '\n'
                           + entry['description'])

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant('committee', title)

            event['_guid'] = entry['guid']
            event['link'] = entry['link']

            event.add_source(url)
            self.save_event(event)