def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            guid = link.attrib['href']
            committee = link.xpath("string(../../../td[1])").strip()
            when_and_where = link.xpath("string(../../../td[2])").strip()
            location = when_and_where.split(',')[-1]
            when = parse_datetime(when_and_where, session)
            description = 'Committee Meeting: %s' % committee
            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant('committee', committee)
            event['link'] = guid
            self.save_event(event)
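# `parse_datetime` is defined elsewhere in this module and is not shown
# here. The sketch below is only a rough illustration of the behavior its
# callers assume: the accepted formats are hypothetical, and the second
# argument (a session string above, a plain year in later revisions) is
# treated as an unused hint. It assumes the module-level `import datetime as dt`.
def parse_datetime(string, year_hint):
    for fmt in ("%b %d, %Y %I:%M %p", "%B %d, %Y %I:%M %p"):
        try:
            return dt.datetime.strptime(string, fmt)
        except ValueError:
            continue
    raise ValueError("unrecognized datetime: %r" % string)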
def scrape_page(self, url, session, chamber):
    try:
        page = self.lxmlize(url)
    except lxml.etree.XMLSyntaxError:
        self.warning("Ugh. Invalid HTML")
        return  # Ugh, invalid HTML.
    agendas = page.xpath("//td[@class='numberspace']")
    spans = page.xpath("//center/span")
    ctty = None
    date = None
    time = None
    if len(spans) >= 4:
        ctty = spans[0].text_content().strip()
        date = spans[2].text_content().strip()
        time = spans[3].text_content().strip()
    bills = []
    for agenda in agendas:
        number = agenda.text_content()
        string = agenda.getnext().text_content().strip()
        re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
        for bill in re_bills:
            bill_id = "%s%s %s" % bill
            bills.append({"name": bill_id, "desc": string})
    if ctty is None or date is None or time is None:
        return
    datetime = "%s %s" % (date.strip(), time.strip())
    datetime = re.sub("AGENDA", "", datetime).strip()
    datetime = [x.strip() for x in datetime.split("\r\n")]
    if "" in datetime:
        datetime.remove("")
    if len(datetime) == 1:
        datetime.append("state house")
    where = datetime[1]
    translate = {"a.m.": "AM", "p.m.": "PM"}
    for t in translate:
        datetime[0] = datetime[0].replace(t, translate[t])
    datetime = dt.datetime.strptime(datetime[0], "%A, %B %d, %Y %I:%M %p")
    chamber = "other"
    cLow = ctty.lower()
    if "senate" in cLow:
        chamber = "upper"
    elif "house" in cLow:
        chamber = "lower"
    elif "joint" in cLow:
        chamber = "joint"
    event = Event(session, datetime, "committee:meeting", ctty,
                  location=where)
    event.add_source(url)
    event.add_participant("host", ctty, "committee", chamber=chamber)
    for bill in bills:
        event.add_related_bill(bill["name"], description=bill["desc"],
                               type="consideration")
    self.save_event(event)
def scrape(self, chamber, session):
    if chamber == "other":
        return
    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.urlopen(url)
    page = csv.reader(StringIO.StringIO(page.bytes), delimiter="|")
    for row in page:
        desc = row[7].strip()
        match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
        if match:
            comm_chamber = {"HOUSE": "lower",
                            "SENATE": "upper"}[match.group(2)]
            if comm_chamber != chamber:
                continue
            comm = match.group(1).strip()
            comm = re.sub(r"\s+", " ", comm)
            location = row[5].strip() or "Unknown"
            when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            event = Event(session, when, "committee:meeting",
                          "%s MEETING" % comm, location=location)
            event.add_source(url)
            event.add_participant("committee", comm, chamber=chamber)
            time = row[3].strip()
            if time in TIMECODES:
                event["notes"] = TIMECODES[time]
            self.save_event(event)
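# `TIMECODES` is a module-level mapping that translates the time-code column
# of the Arkansas FTP feed into human-readable notes. Its real contents are
# not shown in this file; the entries below are hypothetical placeholders
# illustrating the assumed shape:
TIMECODES = {
    "A": "Meets upon adjournment",
    "C": "Meeting is cancelled",
}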
def scrape(self, chamber, session):
    bills_discussed = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue
        bills_discussed[(location, date)].append(hearing.bill_id)
    for ((location, date), bills) in bills_discussed.iteritems():
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bills]
        desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
            location, ', '.join(bills))
        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location)
        self.save_event(event)
def scrape_page(self, url, chamber, session):
    page = self.lxmlize(url)
    info_blocks = {
        "canceled": "//div[@class='cancelled']",
        "committee": "//div[@class='titlemeetingtype']",
        "chamber": "//div[@class='titlehouse']",
        "datetime": "//div[@class='datetimelocation']"
    }
    metainf = {}
    for block in info_blocks:
        info = page.xpath(info_blocks[block])
        if info == []:
            continue
        metainf[block] = {
            "obj": info[0],
            "txt": info[0].text_content()
        }
    if 'committee' not in metainf:
        return
    if 'canceled' in metainf:
        return
    obj = metainf['datetime']['obj']
    dates = obj.xpath("./*")
    date_time = obj.text.strip()
    for date in dates:
        if date.tail is not None:
            date_time += " %s" % (date.tail.strip())
    # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
    splits = ['AM', 'PM']
    date_times = None
    for split in splits:
        if split in date_time:
            date_times = [x.strip() for x in date_time.split(split, 1)]
            date_times[0] += " " + split
    time = date_times[0]
    place = date_times[1]
    committee = metainf['committee']['txt']
    chamber = metainf['chamber']['txt']
    try:
        chamber = {
            "Senate": "upper",
            "Assembly": "lower",
            "Joint": "joint"
        }[chamber]
    except KeyError:
        chamber = 'other'
    # Wednesday, May 23, 2012 10:00 AM
    datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
    event = Event(session, datetime, 'committee:meeting', committee,
                  location=place)
    event.add_participant('host', committee, 'committee', chamber=chamber)
    event.add_source(url)
    self.save_event(event)
def scrape(self, chamber, session):
    bills_discussed = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue
        bills_discussed[(location, date)].append(hearing.bill_id)
    for ((location, date), bills) in bills_discussed.iteritems():
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bills]
        desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
            location, ', '.join(bills))
        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location, 'committee')
        self.save_event(event)
def scrape(self, chamber, session):
    if chamber == 'other':
        return
    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.urlopen(url)
    page = csv.reader(StringIO.StringIO(page.bytes), delimiter='|')
    for row in page:
        desc = row[7].strip()
        match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
        if match:
            comm_chamber = {'HOUSE': 'lower',
                            'SENATE': 'upper'}[match.group(2)]
            if comm_chamber != chamber:
                continue
            comm = match.group(1).strip()
            comm = re.sub(r'\s+', ' ', comm)
            location = row[5].strip() or 'Unknown'
            when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
            event = Event(session, when, 'committee:meeting',
                          "%s MEETING" % comm, location=location)
            event.add_source(url)
            event.add_participant('host', comm, chamber=chamber)
            time = row[3].strip()
            if time in TIMECODES:
                event['notes'] = TIMECODES[time]
            self.save_event(event)
def scrape_page(self, url, chamber, session):
    page = self.lxmlize(url)
    info_blocks = {
        "canceled": "//div[@class='cancelled']",
        "committee": "//div[@class='titlemeetingtype']",
        "chamber": "//div[@class='titlehouse']",
        "datetime": "//div[@class='datetimelocation']"
    }
    metainf = {}
    for block in info_blocks:
        info = page.xpath(info_blocks[block])
        if info == []:
            continue
        metainf[block] = {"obj": info[0], "txt": info[0].text_content()}
    if 'committee' not in metainf:
        return
    if 'canceled' in metainf:
        return
    obj = metainf['datetime']['obj']
    dates = obj.xpath("./*")
    date_time = obj.text.strip()
    for date in dates:
        if date.tail is not None:
            date_time += " %s" % (date.tail.strip())
    # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol
    splits = ['AM', 'PM']
    date_times = None
    for split in splits:
        if split in date_time:
            date_times = [x.strip() for x in date_time.split(split, 1)]
            date_times[0] += " " + split
    time = date_times[0]
    place = date_times[1]
    committee = metainf['committee']['txt']
    chamber = metainf['chamber']['txt']
    try:
        chamber = {
            "Senate": "upper",
            "Assembly": "lower",
            "Joint": "joint"
        }[chamber]
    except KeyError:
        chamber = 'other'
    # Wednesday, May 23, 2012 10:00 AM
    datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
    event = Event(session, datetime, 'committee:meeting', committee,
                  location=place)
    event.add_participant('host', committee, chamber=chamber)
    event.add_source(url)
    self.save_event(event)
def scrape(self, chamber, session):
    seen = set()
    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue
        if (location, date) in seen:
            continue
        seen.add((location, date))
        desc = 'Committee Meeting\n%s' % location
        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location)
        self.save_event(event)
def scrape(self, chamber, session):
    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
        date = div.xpath("string(../../span[1])").strip()
        try:
            time, location = div.xpath("string(span[1])").split(',')
        except ValueError:
            # No meetings
            continue
        if ':' not in time:
            self.warning('skipping event with invalid time: %s', time)
            continue
        when = "%s %s" % (date, time)
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p")
        when = self._tz.localize(when)
        desc = div.xpath("string(span[2])").strip()
        agenda = div.xpath("string(span[3])").strip()
        # XXX: Process `agenda' for related bills.
        event = Event(session, when, 'committee:meeting', desc,
                      location=location)
        event.add_source(url)
        # desc is actually the ctty name.
        event.add_participant('host', desc, 'committee', chamber=chamber)
        self.save_event(event)
def scrape(self, chamber, session):
    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()
            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue
            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M%p")
            when = self._tz.localize(when)
            desc = div.xpath("string(span[2])").strip()
            agenda = div.xpath("string(span[3])").strip()
            # XXX: Process `agenda' for related bills.
            event = Event(session, when, 'committee:meeting', desc,
                          location=location)
            event.add_source(url)
            # desc is actually the ctty name.
            event.add_participant('host', desc, 'committee',
                                  chamber=chamber)
            self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                # Sometimes we have a dead link. This is only on
                # dead entries.
                continue
            committee = link.xpath("string(../../../td[1])").strip()
            when_and_where = link.xpath("string(../../../td[2])").strip()
            location = when_and_where.split(',')[-1]
            when = parse_datetime(when_and_where, session)
            description = 'Committee Meeting: %s' % committee
            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            event['link'] = guid
            self.save_event(event)
def actions_to_events(state):
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']
        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1
            event = db.events.find_one({'state': state, '_guid': guid})
            description = "%s: %s" % (bill['bill_id'], action['action'])
            data = Event(bill['session'], action['date'], 'bill:action',
                         description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state
            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
def scrape_committee_events(self, session, code, name):
    url = ("http://www.cga.ct.gov/asp/menu/"
           "CGACommCal.asp?comm_code=%s" % code)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        cal_table = page.xpath(
            "//table[contains(@summary, 'Calendar')]")[0]
        date_str = None
        for row in cal_table.xpath("tr[2]//tr"):
            col1 = row.xpath("string(td[1])").strip()
            col2 = row.xpath("string(td[2])").strip()
            if not col1:
                if col2 == "No Meetings Scheduled":
                    return
                # If col1 is empty then this is a date header
                date_str = col2
            else:
                # Otherwise, this is a committee event row
                when = date_str + " " + col1
                when = datetime.datetime.strptime(
                    when, "%A, %B %d, %Y %I:%M %p")
                when = self._tz.localize(when)
                location = row.xpath("string(td[3])").strip()
                guid = row.xpath("td/a")[0].attrib["href"]
                event = Event(session, when, "committee meeting",
                              col2, location, _guid=guid)
                event.add_source(url)
                event.add_participant("committee", name, chamber="joint")
                self.save_event(event)
def scrape_event(self, chamber, session, obj):
    meeting = obj['data']['meeting']
    date = int(meeting['meetingDateTime'])
    date = dt.datetime.fromtimestamp(date / 1000)
    if str(date.year) not in session:
        return
    description = 'Committee Meeting: ' + meeting['committeeName']
    event = Event(session, date, 'committee:meeting',
                  description=description,
                  location=meeting['location'] or 'No location given.')
    event.add_source(obj['url'])
    event.add_participant('chair', meeting['committeeChair'],
                          'legislator', chamber='upper')
    event.add_participant('host', meeting['committeeName'],
                          'committee', chamber='upper')
    rgx = r'([a-z]+)(\d+)'
    for bill in meeting['bills']:
        raw_id = bill['senateBillNo']
        bill_id = ' '.join(re.search(rgx, raw_id, re.I).groups())
        event.add_related_bill(
            bill_id, type='bill',
            description=bill['summary'] or 'No description given.')
    return event
def scrape(self, session, chambers):
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')
    for info in events:
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        if not when.endswith(session[:len("20XX")]):
            continue
        event = Event(session=session,
                      when=datetime.datetime.strptime(when, '%b %d, %Y'),
                      type='committee:meeting',
                      description=title,
                      location='State Capitol')
        event.add_source(URL)
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        doc = self.lxmlize(url)
        event.add_source(url)
        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            if committee_name.lower().startswith("senate"):
                chamber = "upper"
            elif committee_name.lower().startswith("house"):
                chamber = "lower"
            else:
                chamber = "joint"
            event.add_participant(type='host',
                                  participant=committee_name,
                                  participant_type='committee',
                                  chamber=chamber)
        documents = doc.xpath('.//td')
        for document in documents:
            onclick = document.xpath('@onclick')
            if not onclick:
                # Skip cells without an onclick handler.
                continue
            url = re.search(r'(http://.*?pdf)', onclick[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(name=document.xpath('text()')[0],
                               url=url,
                               mimetype='application/pdf')
            for bill in onclick:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    event.add_related_bill(
                        bill_name,
                        type='consideration',
                        description='Bill up for discussion')
        self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """ Scrape upper or lower committee agendas """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    html_ = self.get(url).text
    doc = html.fromstring(html_)
    if chamber == 'upper':
        event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                '/td/table/tr/td/table')[0]
    else:
        event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                '/td/table/tr/td/table/tr/td/table')[0]
    for row in event_table.xpath('tr')[2:]:
        # Agenda Date, Committee, Revised, Addendum, Cancelled, Time,
        # Room, HTML Document, PDF Document for house
        # Agenda Date, Committee, Revised, Cancelled, Time, Room,
        # HTML Document, PDF Document for senate
        text = [x.text_content().strip() for x in row.xpath('td')]
        when, committee = text[0:2]
        if chamber == 'upper':
            time, room = text[4:6]
            link = row[6].xpath('string(a/@href)')
        else:
            time, room = text[5:7]
            link = row[7].xpath('string(a/@href)')
        if 'NOT MEETING' in time or 'CANCELLED' in time:
            continue
        time = re.match(r'(\d+:\d+ (A|P))', time)
        if time:
            when = "%s %sM" % (text[0], time.group(0))
            when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
        else:
            when = text[0]
            when = datetime.datetime.strptime(when, '%m/%d/%Y')
        title = "Committee Meeting:\n%s %s %s\n" % (
            self._chamber_long[chamber], committee, room)
        agenda_info = self.parse_agenda(chamber, link)
        description = agenda_info['description']
        member_list = agenda_info['member_list']
        related_bills = agenda_info['related_bills']
        event = Event(session, when, 'committee:meeting', title,
                      location=room, link=link, details=description,
                      related_bills=related_bills)
        event.add_participant('host', committee, 'committee',
                              chamber=chamber)
        event['participants'].extend(member_list)
        event.add_source(url)
        event.add_source(link)
        # print event['when'].timetuple()
        # import ipdb;ipdb.set_trace()
        self.save_event(event)
def scrape(self, chamber, session):
    start_date = "%s-01-10T00:00:00" % session[0:4]
    end_date = "%d-01-10T00:00:00" % (int(session[5:10]) + 1)
    url = ("http://wslwebservices.leg.wa.gov/CommitteeMeetingService"
           ".asmx/GetCommitteeMeetings?beginDate=%s"
           "&endDate=%s" % (start_date, end_date))
    expected_agency = {'upper': 'Senate', 'lower': 'House'}[chamber]
    with self.urlopen(url) as page:
        page = lxml.etree.fromstring(page)
        for meeting in page.xpath("//wa:CommitteeMeeting",
                                  namespaces=self._ns):
            cancelled = meeting.xpath("string(wa:Cancelled)",
                                      namespaces=self._ns).strip()
            if cancelled.lower() == "true":
                continue
            agency = meeting.xpath("string(wa:Agency)",
                                   namespaces=self._ns).strip()
            if agency != expected_agency:
                continue
            dt = meeting.xpath("string(wa:Date)", namespaces=self._ns)
            dt = datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S")
            room = meeting.xpath("string(wa:Room)", namespaces=self._ns)
            building = meeting.xpath("string(wa:Building)",
                                     namespaces=self._ns)
            location = "%s, %s" % (room, building)
            comm = meeting.xpath(
                "string(wa:Committees/wa:Committee[1]/wa:Name)",
                namespaces=self._ns)
            desc = "Committee Meeting\n%s" % comm
            guid = meeting.xpath("string(wa:AgendaId)",
                                 namespaces=self._ns)
            event = Event(session, dt, 'committee:meeting', desc,
                          location=location, _guid=guid)
            for comm_part in meeting.xpath(
                    "wa:Committees/wa:Committee", namespaces=self._ns):
                name = comm_part.xpath("string(wa:Name)",
                                       namespaces=self._ns)
                agency = comm_part.xpath("string(wa:Agency)",
                                         namespaces=self._ns)
                name = "%s %s Committee" % (agency, name)
                event.add_participant('committee', name)
            self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """ Scrape upper or lower committee agendas """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        if chamber == 'upper':
            event_table = root.xpath(
                '//table[@id="body"]/tr/td/table[2]/tr'
                '/td/table/tr/td/table')[0]
        else:
            event_table = root.xpath(
                '//table[@id="body"]/tr/td/table[2]/tr'
                '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled,
            # Time, Room, HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            time = re.match(r'(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when,
                                                  '%m/%d/%Y %I:%M %p')
            else:
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')
            when = self._tz.localize(when)
            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)
            (description, member_list,
             meeting_type, other) = self.parse_agenda(chamber, link)
            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description)
            event.add_participant('committee', committee)
            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue
        info = re.match(
            r"(?P<when>.*) (?P<where>H|C.*-.*?)",
            when_and_where).groupdict()
        when_and_where = info['when']
        location = info['where']
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)
        # We can only scrape current year's events in LA.
        # when = self._tz.localize(when)
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape(self, chamber, session):
    if chamber != "other":
        return None
    # `url` is assumed to be a module-level constant pointing at the
    # committee-meeting listing page.
    page = self.lxmlize(url)
    meetings = page.xpath("//div[@class='Comm_item']")
    for meeting in meetings:
        metas = meeting.xpath(".//b")
        ctty = meeting.xpath(".//a")[0]
        ctty_name = ctty.text_content()
        info = metas[1:]
        datetime = metas[0]
        metainf = {}
        for meta in info:
            header = meta.text_content().strip()
            val = meta.tail
            metainf[header] = val or ""
        datetime = datetime.text_content().strip()
        # Tuesday, June 05, 2012 9:00 AM
        if "Canceled" in datetime:
            continue
        formats = [
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y"
        ]
        date_time = None
        for fmt in formats:
            try:
                date_time = dt.datetime.strptime(datetime, fmt)
            except ValueError:
                pass
        if date_time is None:
            continue
        event = Event(session, date_time, 'committee:meeting',
                      ctty_name,
                      location=metainf['Room:'] or "State House")
        event.add_source(url)
        chamber = "other"
        chambers = {
            "house": "lower",
            "joint": "joint",
            "senate": "upper",
        }
        for c in chambers:
            if c in ctty_name.lower():
                chamber = chambers[c]
        event.add_participant('host', ctty_name, chamber=chamber)
        # add chair?
        self.save_event(event)
def scrape_agenda(self, url, session):
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf['DATE:']
    time = metainf['TIME:']
    where = metainf['PLACE:']
    fmt = "%A, %B %d, %Y"
    if time in all_day:
        datetime = date
    else:
        fmt += " %I:%M %p"
        datetime = "%s %s" % (date, time)
    datetime = dt.datetime.strptime(datetime, fmt)
    event = Event(session, datetime, 'committee:meeting',
                  'Meeting Notice', location=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib['href']
        event.add_document(bill.text_content(), bill_ft,
                           type="full-text",
                           mimetype="application/pdf")
        root = bill.xpath('../../*')
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = (bill.getparent().getparent().getparent()
                 .getnext().getnext().text_content())
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        event.add_related_bill(bill_id,
                               description=descr,
                               type='consideration')
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    chambers = {
        "house": "lower",
        "joint": "joint",
        "senate": "upper"
    }
    chamber = "other"
    for key in chambers:
        if key in committee.lower():
            chamber = chambers[key]
    event.add_participant("host", committee, chamber=chamber)
    self.save_event(event)
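# `all_day` and `replace` are module-level constants that are not shown in
# this file. The values below are hypothetical placeholders, not taken from
# the Rhode Island site; they only sketch the shapes `scrape_agenda` assumes:
all_day = ["Rise of the House"]           # TIME: strings carrying no clock time
replace = {"\r\n": " ", u"\xa0": " "}     # junk substrings stripped from bill IDs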
def parse_row(self, row, session, chamber):
    dates = row.xpath("./td[@class='dateCell']")
    for date in dates:
        # alright, so we *may* not get a date, in which case the date
        # is the same as the last event.
        cal_date = date.xpath("./span[@class='calendarMonth']")[0]
        cal_day = date.xpath("./span[@class='calendarDay']")[0]
        self.last_month = cal_date.text_content()
        self.last_day = cal_day.text_content()
    time = row.xpath("./td[@class='timeCell']")
    if not time:
        return  # Nada.
    time = time[0]
    time = time.text.strip()
    dt_string = "%s %s %s %s" % (
        self.last_month, self.last_day, self.year, time
    )
    fmt = "%b %d %Y %I:%M %p"
    when = dt.datetime.strptime(dt_string, fmt)
    cells = {
        "event": "eventCell",
        "status": "statusCell",
        "location": "locationCell",
        "transcript": "transcriptCell",
        "video": "videoCell"
    }
    metainf = {}
    for thing in cells:
        mi = row.xpath("./td[@class='" + cells[thing] + "']")
        if mi == []:
            continue
        metainf[thing] = mi[0]
    if metainf['location'].xpath("./*") == []:
        metainf['location'] = self.last_location
    else:
        self.last_location = metainf['location']
    if "Session" in metainf['event'].text_content().strip():
        return  # Nada.
    loc_url = metainf['location'].xpath(".//a")
    loc_url = loc_url[0].attrib['href']
    event = Event(session, when, 'committee:meeting',
                  metainf['event'].text_content().strip(),
                  chamber=chamber,
                  location=metainf['location'].text_content().strip(),
                  location_url=loc_url)
    event.add_participant("host",
                          metainf['event'].text_content().strip(),
                          'committee', chamber=chamber)
    self.add_agenda(event,
                    metainf['event'].xpath(".//a")[0].attrib['href'])
    return event
def scrape(self, session, chambers):
    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
        date = div.xpath("string(../../span[1])").strip()
        try:
            time, location = div.xpath("string(span[1])").split(',')
        except ValueError:
            # No meetings
            continue
        if time == "Noon":
            time = "12:00pm"
        if ':' not in time:
            self.warning('skipping event with invalid time: %s', time)
            continue
        when = "%s %s" % (date, time)
        try:
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M%p")
        except ValueError:
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M %p")
        when = self._tz.localize(when)
        desc = div.xpath("string(span[2])").strip()
        agenda = div.xpath("string(span[3])").strip()
        # XXX: Process `agenda' for related bills.
        if desc.lower().strip() in ["house convenes",
                                    "senate convenes"]:
            continue
        event = Event(session, when, 'committee:meeting', desc,
                      location=location)
        event.add_source(url)
        # desc is actually the ctty name.
        if "house" in desc.lower():
            chamber = "lower"
        elif "senate" in desc.lower():
            chamber = "upper"
        elif "joint" in desc.lower():
            chamber = "joint"
        else:
            self.logger.warning("Event %s chamber is unknown, "
                                "skipping" % desc)
            continue
        event.add_participant('host', desc, 'committee', chamber=chamber)
        self.save_event(event)
def scrape(self, session, chambers):
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            if "INFO" not in committee:
                committee = self.short_ids[committee]
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_participant('host', committee['name'],
                                  'committee',
                                  chamber=committee['chamber'])
        event.add_source(URL)
        event.add_document(notice_name, notice_href,
                           mimetype='text/html')
        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type=bill['type'])
        self.save_event(event)
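# `URL` (the hearings listing page) and `get_short_codes` are defined
# elsewhere in this module. A minimal sketch of the structure
# `get_short_codes` is assumed to build; the single entry shown is a
# hypothetical placeholder:
def get_short_codes(scraper):
    # Maps committee short codes used in the hearings table to the
    # chamber/name pair consumed by `add_participant` above.
    scraper.short_ids = {
        "JUD": {"chamber": "lower", "name": "Judiciary"},
    }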
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        date, time, location = (
            [s.strip() for s in meeting_string.split(',') if s]
            + [None] * 3)[:3]
        self.logger.debug(location)
        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee_name, 'committee',
                              chamber='lower')
        event.add_document('Agenda', guid, type='agenda',
                           mimetype='application/pdf')
        event['link'] = guid
        self.save_event(event)
def parse_row(self, row, session, chamber):
    dates = row.xpath("./td[@class='dateCell']")
    for date in dates:
        # alright, so we *may* not get a date, in which case the date
        # is the same as the last event.
        cal_date = date.xpath("./span[@class='calendarMonth']")[0]
        cal_day = date.xpath("./span[@class='calendarDay']")[0]
        self.last_month = cal_date.text_content()
        self.last_day = cal_day.text_content()
    time = row.xpath("./td[@class='timeCell']")
    if not time:
        return  # Nada.
    time = time[0]
    time = time.text.strip()
    dt_string = "%s %s %s %s" % (
        self.last_month, self.last_day, self.year, time
    )
    fmt = "%b %d %Y %I:%M %p"
    when = dt.datetime.strptime(dt_string, fmt)
    cells = {
        "event": "eventCell",
        "status": "statusCell",
        "location": "locationCell",
        "transcript": "transcriptCell",
        "video": "videoCell"
    }
    metainf = {}
    for thing in cells:
        mi = row.xpath("./td[@class='" + cells[thing] + "']")
        if mi == []:
            continue
        metainf[thing] = mi[0]
    if metainf['location'].xpath("./*") == []:
        metainf['location'] = self.last_location
    else:
        self.last_location = metainf['location']
    if "Session" in metainf['event'].text_content().strip():
        return  # Nada.
    loc_url = metainf['location'].xpath(".//a")
    loc_url = loc_url[0].attrib['href']
    event = Event(session, when, 'committee:meeting',
                  metainf['event'].text_content().strip(),
                  chamber=chamber,
                  location=metainf['location'].text_content().strip(),
                  location_url=loc_url)
    event.add_participant("host",
                          metainf['event'].text_content().strip(),
                          'committee', chamber=chamber)
    self.add_agenda(event,
                    metainf['event'].xpath(".//a")[0].attrib['href'])
    return event
def scrape(self, chamber, session):
    if chamber == "other":
        return
    today = datetime.date.today()
    start_date = today - datetime.timedelta(days=10)
    end_date = today + datetime.timedelta(days=10)
    if chamber == "upper":
        chamber_abbrev = "S"
    else:
        chamber_abbrev = "H"
    url = ("http://www.legis.iowa.gov/Schedules/meetingsList"
           "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
           "%d&eDate=%02d/%02d/%d" % (
               chamber_abbrev,
               start_date.month, start_date.day, start_date.year,
               end_date.month, end_date.day, end_date.year,
           ))
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
        comm = link.text.strip()
        desc = comm + " Committee Hearing"
        location = link.xpath("string(../../td[3])")
        when = link.xpath("string(../../td[1])").strip()
        if when == "Cancelled" or "Upon" in when:
            continue
        if "To Be Determined" in when:
            continue
        if "AM" in when:
            when = when.split("AM")[0] + " AM"
        else:
            when = when.split("PM")[0] + " PM"
        junk = ["Reception"]
        for key in junk:
            when = when.replace(key, "")
        when = re.sub(r"\s+", " ", when).strip()
        when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, "committee:meeting", desc, location)
        event.add_source(url)
        event.add_participant("host", comm, "committee", chamber=chamber)
        self.save_event(event)
def scrape(self, chamber, session):
    chmbr = cal_chamber_text[chamber]
    tables = self.url_xpath(cal_weekly_events,
                            "//table[@class='date-table']")
    for table in tables:
        date = table.xpath("../.")[0].getprevious().text_content()
        trs = table.xpath("./tr")
        for tr in trs:
            order = ["time", "chamber", "type", "agenda", "location",
                     "video"]
            tds = tr.xpath("./td")
            metainf = {}
            if not tds:
                continue
            for el in range(0, len(order)):
                metainf[order[el]] = tds[el]
            if metainf['chamber'].text_content() != chmbr:
                self.log("Skipping event based on chamber.")
                continue
            time = metainf['time'].text_content()
            datetime_string = "%s %s" % (date, time)
            location = metainf['location'].text_content()
            description = metainf['type'].text_content()
            dtfmt = "%A, %B %d, %Y %I:%M %p"
            if time == 'Cancelled':
                self.log("Skipping cancelled event.")
                continue
            else:
                if ' Immediately follows' in datetime_string:
                    datetime_string, _ = datetime_string.split(
                        'Immediately follows')
                    datetime_string = datetime_string.strip()
                    dtfmt = "%A, %B %d, %Y"
            when = dt.datetime.strptime(datetime_string, dtfmt)
            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant("host", description, 'committee',
                                  chamber=chamber)
            event.add_source(cal_weekly_events)
            agenda = metainf['agenda'].xpath(".//a")
            for doc in agenda:
                if not doc.text_content():
                    continue
                agenda_url = doc.attrib['href']
                self.add_agenda(agenda_url, doc.text_content(), event)
            self.save_event(event)
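# `cal_weekly_events` (the weekly calendar URL) and `cal_chamber_text` are
# module-level constants not shown in this file. The mapping below is a
# sketch of what `cal_chamber_text` presumably contains, matching the
# chamber labels that appear in the calendar table:
cal_chamber_text = {
    "upper": "Senate",
    "lower": "House",
    "other": "Joint",
}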
def parse_page(self, url, session):
    page = self.lxmlize(url)
    tables = page.xpath("//table[@class='pubhrgtbl']")
    date = None
    ctty = None
    chamber = 'other'
    for table in tables:
        metainf = {}
        rows = table.xpath(".//tr")
        for row in rows:
            tds = row.xpath("./*")
            if len(tds) < 2:
                continue
            key, value = tds
            if key.tag == 'th':
                date = key.text_content()
                date = re.sub(r"\s+", " ", date)
                date = re.sub(".*POSTPONED NEW DATE", "", date).strip()
                ctty = value.xpath(".//strong")[0]
                ctty = ctty.text_content()
                chamber = 'other'
                if "senate" in ctty.lower():
                    chamber = 'upper'
                if "house" in ctty.lower():
                    chamber = 'lower'
                if "joint" in ctty.lower():
                    chamber = 'joint'
            elif key.tag == 'td':
                key = key.text_content().strip()
                value = value.text_content().strip()
                value = value.replace(u'\x96', '-')
                value = re.sub(r"\s+", " ", value)
                metainf[key] = value
        time = metainf['Time:']
        repl = {"A.M.": "AM", "P.M.": "PM"}
        for r in repl:
            time = time.replace(r, repl[r])
        time = re.sub("-.*", "", time)
        time = time.strip()
        year = dt.datetime.now().year
        date = "%s %s %s" % (date, year, time)
        datetime = dt.datetime.strptime(date, "%B %d %Y %I:%M %p")
        event = Event(session, datetime, 'committee:meeting',
                      metainf['Public Hearing:'],
                      location=metainf['Place:'],
                      contact=metainf['Contact:'],
                      media_contact=metainf['Media Contact:'])
        event.add_source(url)
        event.add_participant('host', ctty, chamber=chamber)
        self.save_event(event)
def scrape(self, chamber, session):
    if chamber == 'other':
        return
    today = datetime.date.today()
    start_date = today - datetime.timedelta(days=10)
    end_date = today + datetime.timedelta(days=10)
    if chamber == 'upper':
        chamber_abbrev = 'S'
    else:
        chamber_abbrev = 'H'
    url = ("http://www.legis.iowa.gov/Schedules/meetingsList"
           "Chamber.aspx?chamber=%s&bDate=%02d/%02d/"
           "%d&eDate=%02d/%02d/%d" % (
               chamber_abbrev,
               start_date.month, start_date.day, start_date.year,
               end_date.month, end_date.day, end_date.year))
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    for link in page.xpath("//a[contains(@id, 'linkCommittee')]"):
        comm = link.text.strip()
        desc = comm + " Committee Hearing"
        location = link.xpath("string(../../td[3])")
        when = link.xpath("string(../../td[1])").strip()
        if 'cancelled' in when.lower() or "upon" in when.lower():
            continue
        if "To Be Determined" in when:
            continue
        if 'AM' in when:
            when = when.split('AM')[0] + " AM"
        else:
            when = when.split('PM')[0] + " PM"
        junk = ['Reception']
        for key in junk:
            when = when.replace(key, '')
        when = re.sub(r"\s+", " ", when).strip()
        if "tbd" in when.lower():
            # OK. This is a partial date of some sort.
            when = datetime.datetime.strptime(when,
                                              "%m/%d/%Y TIME - TBD %p")
        else:
            try:
                when = datetime.datetime.strptime(when,
                                                  "%m/%d/%Y %I:%M %p")
            except ValueError:
                when = datetime.datetime.strptime(when,
                                                  "%m/%d/%Y %I %p")
        event = Event(session, when, 'committee:meeting', desc, location)
        event.add_source(url)
        event.add_participant('host', comm, 'committee', chamber=chamber)
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        date, time, location = (
            [s.strip() for s in meeting_string.split(',') if s]
            + [None] * 3)[:3]
        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')
        self.logger.debug(location)
        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee_name, 'committee',
                              chamber='lower')
        event.add_document('Agenda', guid, type='agenda',
                           mimetype='application/pdf')
        event['link'] = guid
        self.save_event(event)
def scrape(self, session, chambers):
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)
    committee_regex = re.compile("(Committee .*?)will")
    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(
            ".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(),
                         place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
        description_content = event.xpath(
            ".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        desc_without_title = " ".join(d.text_content()
                                      for d in description_lines[1:])
        description = re.sub(
            r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")
        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'
        e = Event(session, when, event_type, description, location)
        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                # no bills
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(
                bill, "").strip(", ").strip()
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            if "PR" in bill or "CER" in bill:
                e.add_related_bill(bill, type="resolution",
                                   description=bill_desc)
            else:
                e.add_related_bill(bill, type="bill",
                                   description=bill_desc)
        e.add_source(calendar_url)
        if committee:
            e.add_participant("host", committee, 'committee',
                              chamber="upper")
        self.save_event(e)
def scrape(self, session, chambers):
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            if "INFO" not in committee:
                # Look up the short code itself, falling back to an
                # unknown chamber when the code is not recognized.
                committee = self.short_ids.get(
                    committee,
                    {"chamber": "unknown", "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_participant('host', committee['name'],
                                  'committee',
                                  chamber=committee['chamber'])
        event.add_source(URL)
        event.add_document(notice_name, notice_href,
                           mimetype='text/html')
        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(
                bill['bill_id'],
                description=bill['descr'],
                type=bill['type']
            )
        self.save_event(event)
def scrape(self, chamber, session):
    grouped_hearings = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue
        grouped_hearings[(location, date)].append(hearing)
    for ((location, date), hearings) in grouped_hearings.iteritems():
        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]
        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr
                       for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]
        desc = 'Committee Meeting: ' + committee_name
        event = Event(session, date, 'committee:meeting', desc,
                      location=committee_name)
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            event.add_related_bill(bill_id, type=type_,
                                   description='consideration')
        event.add_participant('host', committee_name + ' Committee',
                              'committee', chamber=chamber)
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')
        self.save_event(event)
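# `_committee_nr` is a module-level lookup table mapping California's
# numeric committee codes to display names. Its real contents are not
# shown in this file; the single entry below is a hypothetical placeholder
# illustrating the assumed shape:
_committee_nr = {
    1: "Assembly Standing Committee on Judiciary",
}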
def scrape(self, chamber, session):
    if chamber != "other":
        return None
    # `url` is assumed to be a module-level constant pointing at the
    # committee-meeting listing page.
    page = self.lxmlize(url)
    meetings = page.xpath("//div[@class='Comm_item']")
    for meeting in meetings:
        metas = meeting.xpath(".//b")
        ctty = meeting.xpath(".//a")[0]
        ctty_name = ctty.text_content()
        info = metas[1:]
        datetime = metas[0]
        metainf = {}
        for meta in info:
            header = meta.text_content().strip()
            val = meta.tail
            metainf[header] = val or ""
        datetime = datetime.text_content().strip()
        # Tuesday, June 05, 2012 9:00 AM
        if "Canceled" in datetime:
            continue
        formats = ["%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y"]
        date_time = None
        for fmt in formats:
            try:
                date_time = dt.datetime.strptime(datetime, fmt)
            except ValueError:
                pass
        if date_time is None:
            continue
        event = Event(session, date_time, 'committee:meeting',
                      ctty_name,
                      location=metainf['Room:'] or "State House")
        event.add_source(url)
        chamber = "other"
        chambers = {
            "house": "lower",
            "joint": "joint",
            "senate": "upper",
        }
        for c in chambers:
            if c in ctty_name.lower():
                chamber = chambers[c]
        event.add_participant('host', ctty_name, chamber=chamber)
        # add chair?
        self.save_event(event)
def scrape(self, chamber, session):
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]
    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now + timedelta(days=30)).strftime(print_format)
    # `event_page` is assumed to be a module-level URL template taking
    # the chamber id and the start/end dates.
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)
    committees = page.xpath(
        "//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath(
            "//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])
            when = dt.datetime.strptime(when.strip(),
                                        "%m/%d/%Y %I:%M:%S %p")
            kwargs = {
                "location": (where or '').strip() or "unknown"
            }
            event = Event(session, when, 'committee:meeting',
                          meeting_title, **kwargs)
            event.add_participant("host", who.strip(), 'committee',
                                  chamber=chamber)
            event.add_source(url)
            # only scraping public hearing bills for now.
            bills = meeting.xpath(
                ".//div[text() = 'Public Hearing']"
                "/following-sibling::li[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                event.add_related_bill(
                    bill_id.strip(),
                    description=descr.strip(),
                    type="consideration"
                )
            self.save_event(event)
def scrape_page(self, url, session, chamber):
    page = self.lxmlize(url)
    ctty_name = page.xpath(
        "//span[@class='heading']")[0].text_content().replace(
        "Hearing Notice For ", "")
    tables = page.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value
    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)
    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")
    event = Event(session, datetime, 'committee:meeting',
                  description, location=where)
    event.add_source(url)
    if ctty_name.startswith('Hearing Notice For'):
        ctty_name = ctty_name.replace('Hearing Notice For', '')
    event.add_participant('host', ctty_name, 'committee',
                          chamber=chamber)
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        event.add_related_bill(bill_id,
                               description=description,
                               type='consideration')
    self.save_event(event)
def scrape(self, session, chambers):
    # `calurl` is assumed to be a module-level constant pointing at the
    # committee agenda calendar page.
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]
    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            raise Exception
        comit_url = comit_url[0]
        who = self.scrape_participants(session, comit_url.attrib['href'])
        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")
        event = Event(session, when, 'committee:meeting', name,
                      location=where, link=notice)
        event.add_source(calurl)
        event.add_participant('host', cttie, 'committee',
                              chamber=cttie_chamber)
        event.add_document("notice", notice, mimetype='application/pdf')
        for thing in who:
            event.add_participant(thing['title'], thing['name'],
                                  'legislator', chamber=cttie_chamber)
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub(r"\s+", " ", when_and_where).strip()
        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue
        info = re.match(
            r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
            when_and_where
        ).groupdict()
        when_and_where = info['when']
        location = info['where']
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)
        # We can only scrape current year's events in LA.
        # when = self._tz.localize(when)
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee = link.xpath("string(../../../td[1])").strip()
        when_and_where = link.xpath("string(../../../td[2])").strip()
        location = when_and_where.split(',')[-1]
        if when_and_where.strip() == "":
            continue
        year = datetime.datetime.now().year
        # We can only scrape current year's events in LA.
        when = parse_datetime(when_and_where, year)
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape(self, session, chambers):
    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
        date = div.xpath("string(../../span[1])").strip()
        try:
            time, location = div.xpath("string(span[1])").split(',')
        except ValueError:
            # No meetings
            continue
        if time == "Noon":
            time = "12:00pm"
        if ':' not in time:
            self.warning('skipping event with invalid time: %s', time)
            continue
        when = "%s %s" % (date, time)
        try:
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M%p")
        except ValueError:
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M %p")
        when = self._tz.localize(when)
        desc = div.xpath("string(span[2])").strip()
        agenda = div.xpath("string(span[3])").strip()
        # XXX: Process `agenda' for related bills.
        if desc.lower().strip() in ["house convenes",
                                    "senate convenes"]:
            continue
        event = Event(session, when, 'committee:meeting', desc,
                      location=location)
        event.add_source(url)
        # desc is actually the ctty name.
        if "house" in desc.lower():
            chamber = "lower"
        elif "senate" in desc.lower():
            chamber = "upper"
        elif "joint" in desc.lower():
            chamber = "joint"
        else:
            self.logger.warning("Event %s chamber is unknown, "
                                "skipping" % desc)
            continue
        event.add_participant('host', desc, 'committee', chamber=chamber)
        self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """ Scrape upper or lower committee agendas """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        if chamber == 'upper':
            event_table = root.xpath(
                '//table[@id="body"]/tr/td/table[2]/tr'
                '/td/table/tr/td/table')[0]
        else:
            event_table = root.xpath(
                '//table[@id="body"]/tr/td/table[2]/tr'
                '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled,
            # Time, Room, HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            time = re.match(r'(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when,
                                                  '%m/%d/%Y %I:%M %p')
            else:
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')
            when = self._tz.localize(when)
            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)
            (description, member_list,
             meeting_type, other) = self.parse_agenda(chamber, link)
            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description)
            event.add_participant('committee', committee)
            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
def scrape(self, chamber, session):
    year_abr = ((int(session) - 209) * 2) + 2000
    self.initialize_committees(year_abr)
    url, db = self.get_dbf(year_abr, "AGENDAS")
    records = [x.asDict() for x in db]
    for record in records:
        if record['STATUS'] != "Scheduled":
            continue
        description = record['COMMENTS']
        related_bills = []
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })
        date_time = "%s %s" % (record['DATE'], record['TIME'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        hr_name = self._committees[record['COMMHOUSE']]
        event = Event(
            session,
            date_time,
            'committee:meeting',
            "Meeting of the %s" % hr_name,
            location=record['LOCATION'] or "Statehouse",
        )
        for bill in related_bills:
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type='consideration')
        try:
            chamber = {
                "a": "lower",
                "s": "upper",
                "j": "joint"
            }[record['COMMHOUSE'][0].lower()]
        except KeyError:
            chamber = "joint"
        event.add_participant("host", hr_name, 'committee',
                              committee_code=record['COMMHOUSE'],
                              chamber=chamber)
        event.add_source(url)
        self.save_event(event)
def scrape_meeting_notice(self, chamber, session, url):
    page = self.lxmlize(url)
    bits = page.xpath("//td[@width='96%']/table/tr")
    metainf = {}
    for bit in bits:
        info = bit.xpath(".//td")
        key = info[0].text_content().strip()
        val = info[1].text_content().strip()
        if key[-1:] == ":":
            key = key[:-1]
        metainf[key] = val

    date_time_lbl = "Date/Time"
    # e.g. "04/25/2012 03:00:00 PM"
    fmt = "%m/%d/%Y %I:%M:%S %p"
    metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl], fmt)

    event = Event(session, metainf[date_time_lbl],
                  "committee:meeting",
                  "Committee Meeting",
                  chamber=chambers[metainf['Chamber']],
                  location=metainf['Room'],
                  chairman=metainf['Chairman'])
    event.add_participant("host", metainf['Committee'], 'committee',
                          chamber=chambers[metainf['Chamber']])
    event.add_source(url)

    agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
    agenda = [a.text_content().strip() for a in agenda]
    if "" in agenda:
        agenda.remove("")
    for item in agenda:
        # Agenda items that look like "H 123 ..." or "S 45 ..." reference bills.
        string = item.split()
        string = string[:2]
        fChar = string[0][0]
        watch = ["H", "S"]
        if fChar in watch:
            try:
                bNo = int(string[1])
            except (ValueError, IndexError):
                continue
            bill_id = "%s %s" % (string[0], string[1])
            event.add_related_bill(bill_id,
                                   description=item,
                                   type="consideration")
    self.save_event(event)
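`scrape_meeting_notice` indexes a module-level `chambers` dict with the notice's Chamber field; that mapping is defined elsewhere in the module. A minimal sketch of its assumed shape, where the label strings are assumptions rather than the page's confirmed values:

# Hypothetical mapping from the notice page's chamber labels to internal codes.
chambers = {
    "House": "lower",
    "Senate": "upper",
    "Joint": "joint",
}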
def scrape(self, chamber, session):
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now + timedelta(days=30)).strftime(print_format)

    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath(
        "//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath(
            "//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(),
                                        "%m/%d/%Y %I:%M:%S %p")

            kwargs = {"location": (where or '').strip() or "unknown"}
            event = Event(session, when, 'committee:meeting',
                          meeting_title, **kwargs)
            event.add_participant("host", who.strip(), 'committee',
                                  chamber=chamber)
            event.add_source(url)

            # Only scraping public hearing bills for now.
            bills = meeting.xpath(
                ".//div[text() = 'Public Hearing']"
                "/following-sibling::li[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                event.add_related_bill(bill_id.strip(),
                                       description=descr.strip(),
                                       type="consideration")
            self.save_event(event)
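This scraper fills a module-level `event_page` template with a chamber agency id and a 30-day date window; the template itself is not defined in this section. A purely illustrative sketch of its shape, where the URL is an assumption and not the real endpoint:

# Hypothetical template, interpolated with (agency_id, start_date, end_date).
event_page = ("http://example.leg.gov/committeeschedules"
              "?agency=%s&start=%s&end=%s")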
def scrape(self, chamber, session):
    chmbr = cal_chamber_text[chamber]
    tables = self.url_xpath(cal_weekly_events,
                            "//table[@class='date-table']")
    for table in tables:
        date = table.xpath("../.")[0].getprevious().text_content()
        trs = table.xpath("./tr")
        for tr in trs:
            order = ["time", "chamber", "type", "agenda", "location",
                     "video"]

            tds = tr.xpath("./td")
            metainf = {}

            if not tds:
                continue

            for el in range(0, len(order)):
                metainf[order[el]] = tds[el]

            # Skip rows that belong to the other chamber.
            if metainf['chamber'].text_content() != chmbr:
                self.log("Skipping event based on chamber.")
                continue

            time = metainf['time'].text_content()
            datetime_string = "%s %s" % (date, time)
            location = metainf['location'].text_content()
            description = metainf['type'].text_content()

            if time == 'Cancelled':
                self.log("Skipping cancelled event.")
                continue

            dtfmt = "%A, %B %d, %Y %I:%M %p"
            when = dt.datetime.strptime(datetime_string, dtfmt)

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant("host", description, 'committee',
                                  chamber=chamber)
            event.add_source(cal_weekly_events)

            for doc in metainf['agenda'].xpath(".//a"):
                if not doc.text_content():
                    continue
                agenda_url = doc.attrib['href']
                self.add_agenda(agenda_url, doc.text_content(), event)
            self.save_event(event)
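The function above leans on two module-level constants that are not shown in this section. A minimal sketch of their assumed shapes, where the URL and label strings are assumptions rather than the real values:

# Hypothetical definitions; the real ones live elsewhere in the module.
cal_weekly_events = "http://www.capitol.tn.gov/schedule/weeklyview.htm"  # assumed URL
cal_chamber_text = {
    "upper": "Senate",
    "lower": "House",
    "other": "Joint",
}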
def scrape_page(self, url, session, chamber):
    page = self.lxmlize(url)

    ctty_name = page.xpath(
        "//span[@class='heading']")[0].text_content().replace(
            "Hearing Notice For ", "")
    tables = page.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    # The source runs the time into the meridian ("10:30AM"); insert the
    # space that strptime's %p expects.
    repl = {
        "AM": " AM",
        "PM": " PM"
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

    event = Event(session, datetime, 'committee:meeting',
                  description, location=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant('host', ctty_name, 'committee', chamber=chamber)

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        event.add_related_bill(bill_id,
                               description=description,
                               type='consideration')
    self.save_event(event)
def scrape_event_page(self, url, chamber, session):
    page = self.lxmlize(url)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {
            "txt": val.text_content().strip(),
            "obj": val
        }
    if metainf == {}:
        return

    # e.g. "Wednesday, 5/16/2012 3:00 pm"
    datetime = "%s %s" % (
        metainf['Date']['txt'],
        metainf['Time']['txt']
    )
    if "Cancelled" in datetime:
        return

    datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    event = Event(session, datetime, 'committee:meeting',
                  title, location=where)
    event.add_source(url)
    event.add_source(mi_events)
    event.add_participant('host', metainf['Committee']['txt'],
                          'committee', chamber=chamber)

    agenda = metainf['Agenda']['obj']
    # Search within the agenda cell, not the whole document; an absolute
    # "//a" xpath on an element still searches from the document root.
    related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        event.add_related_bill(
            bill.text_content(),
            description=agenda.text_content(),
            type='consideration'
        )
    self.save_event(event)
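`scrape_event_page` also cites `mi_events`, a module-level constant for the meeting index page that is not shown here. A hedged placeholder for its assumed shape; the query string is a guess, not the confirmed value:

# Hypothetical: the committee-meeting index URL recorded as a source.
mi_events = "http://www.legislature.mi.gov/mileg.aspx?page=MCommitteeMeetings"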
def scrape_event_page(self, session, chamber, url, datetime):
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainf = {}
    plaintext = ""
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainf[key.strip()] = val.strip()

    ctty = metainf['COMMITTEE']
    where = metainf['PLACE']
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainf['PLACE'] = where.strip()
        metainf['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainf:
        chair = metainf['CHAIR']

    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(session, datetime, 'committee:meeting',
                  ctty, chamber=chamber, location=where,
                  agenda=plaintext)
    event.add_source(url)
    event.add_participant('host', ctty, 'committee', chamber=chamber)
    if chair is not None:
        event.add_participant('chair', chair, 'legislator',
                              chamber=chamber)

    for bill in bills:
        # Unpack into fresh names so we don't shadow the `chamber`
        # parameter or the `type` builtin.
        bill_chamber, bill_type, number = bill
        bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
        event.add_related_bill(bill_id,
                               type='consideration',
                               description='Bill up for discussion')

    self.save_event(event)
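For reference, the bill regexp yields one tuple per match, which the loop then reassembles into a canonical bill ID:

>>> import re
>>> re.findall(r"(S|J|H)(B|M|R) (\d+)", "Taking up HB 123 and SR 45")
[('H', 'B', '123'), ('S', 'R', '45')]
>>> "%s%s %s" % ('H', 'B', '123')
'HB 123'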
def scrape_committee_upcoming(self, session, chamber):
    chamber_name = {'upper': 'senate',
                    'lower': 'house',
                    'other': 'joint'}[chamber]
    url = ("http://www.capitol.state.tx.us/MyTLO/RSS/RSS.aspx?"
           "Type=upcomingmeetings%s" % chamber_name)

    with self.urlopen(url) as page:
        feed = feedparser.parse(page)
        for entry in feed['entries']:
            try:
                title, date = entry['title'].split(' - ')
            except ValueError:
                continue

            try:
                time = re.match(r'Time: (\d+:\d+ (A|P)M)',
                                entry['description']).group(1)
            except AttributeError:
                # There are occasional broken events in their feeds.
                continue
            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
            when = self._tz.localize(when)

            location = entry['description'].split('Location: ')[1]

            description = 'Committee Meeting\n'
            description += entry['title'] + '\n'
            description += entry['description']

            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant('committee', title)
            event['_guid'] = entry['guid']
            event['link'] = entry['link']
            event.add_source(url)
            self.save_event(event)
def scrape(self, session, chambers):
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative,
        # as well as whole-chamber events.
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        event = Event(
            session=session,
            when=self._TZ.localize(datetime.datetime.strptime(
                info.xpath('span[@class="col02"]/text()')[0],
                self._DATETIME_FORMAT
            )),
            type='committee:meeting',
            description=" ".join(
                x.strip() for x in
                doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()),
            location=doc.xpath(
                '//div[@class="heading-container"]/span/text()')[0].title()
        )
        event.add_participant(
            type='host',
            participant=info.xpath(
                'span[@class="col01"]/text()')[0].title(),
            participant_type='committee'
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                name=document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        event.add_source(event_url.replace(" ", "%20"))
        self.save_event(event)
def scrape_events(self, chamber, session, event_id):
    url = '%s%s' % (self.upper_url, event_id)
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # Some ids are empty.
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            meeting_type = td[27].text
            meeting_lead = td[28].text

            # Times carry an AM/PM marker, so parse with %I rather than
            # %H (with %H the %p field is ignored).
            when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = self._tz.localize(when)
            event_type = 'committee:meeting'
            kwargs = {"location": "State House"}
            if where is not None and where != "":
                kwargs['location'] = where
            event = Event(session, when, event_type, description, **kwargs)

            if td[20].text is None:
                participants = [meeting_lead]
            else:
                participants = td[20].text.split(';')
            for participant in participants:
                name = participant.strip().replace('HON.', '', 1)
                if name != "":
                    event.add_participant('committee', name,
                                          'committee', chamber=chamber)

            event.add_source(url)
            self.save_event(event)
    else:
        # Hack: event ids are sparse, so don't fail on gaps between ids
        # that work; only stop once we are past the range in use.
        if event_id > 1700:
            raise ScrapeError(
                "Parsing is done; we are on future ids that are not "
                "used yet.")
def scrape(self, chamber, session):
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")
    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        hr_name = self._committees[record['CommHouse']]

        event = Event(
            session,
            date_time,
            'committee:meeting',
            "Meeting of the %s" % hr_name,
            location=record['Location'] or "Statehouse",
        )
        for bill in related_bills:
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type='consideration')

        try:
            chamber = {
                "a": "lower",
                "s": "upper",
                "j": "joint"
            }[record['CommHouse'][0].lower()]
        except KeyError:
            chamber = "joint"

        event.add_participant("host", hr_name, 'committee',
                              committee_code=record['CommHouse'],
                              chamber=chamber)
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')
        self.save_event(event)
def scrape(self, session, chambers): url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt" page = self.get(url) page = csv.reader(StringIO.StringIO(page.content), delimiter='|') for row in page: # Deal with embedded newline characters, which cause fake new rows LINE_LENGTH = 11 while len(row) < LINE_LENGTH: row += page.next() assert (len(row) <= LINE_LENGTH, "Line is too long: {}".format(row)) desc = row[7].strip() match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc) if match: comm_chamber = {'HOUSE': 'lower', 'SENATE': 'upper'}[match.group(2)] comm = match.group(1).strip() comm = re.sub(r'\s+', ' ', comm) location = row[5].strip() or 'Unknown' when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S') # Only assign events to a session if they are in the same year # Given that session metadata have some overlap and # missing end dates, this is the best option available session_year = int(session[:4]) if session_year != when.year: continue event = Event(session, when, 'committee:meeting', "%s MEETING" % comm, location=location) event.add_source(url) event.add_participant('host', comm, 'committee', chamber=comm_chamber) time = row[3].strip() if time in TIMECODES: event['notes'] = TIMECODES[time] self.save_event(event)