def scrape(self, chamber, session): url = "http://www.legislature.state.oh.us/today.cfm" with self.urlopen(url) as page: page = lxml.html.fromstring(page) for td in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"): date = td.text.strip() if chamber == 'upper': desc = td.getnext().text.strip() else: desc = td.getnext().getnext().text.strip() match = re.match(r'^Session at (\d+:\d+ [pa]\.m\.)', desc) if match: time = match.group(1) time = time.replace('a.m.', 'AM').replace('p.m.', 'PM') when = "%s 2011 %s" % (date, time) when = datetime.datetime.strptime(when, "%a. %b %d %Y %I:%M %p") when = self._tz.localize(when) chamber_name = { 'upper': 'Senate', 'lower': 'House' }[chamber] event = Event(session, when, 'floor_time', desc, "%s Chamber" % chamber_name) event.add_source(url) self.save_event(event)
def scrape(self, chamber, session): if chamber != "other": return url = "http://www.leg.state.vt.us/HighlightsMain.cfm" page = self.lxmlize(url) ps = page.xpath( "//p[@class='HighlightsNote' or @class='HighlightsDate']") events = {} event_set = [] for p in ps: if p.attrib['class'] == "HighlightsNote": event_set.append(p) else: date_time = p.text[len("Posted "):] events[date_time] = event_set event_set = [] for date in events: date_time = dt.datetime.strptime(date, "%m/%d/%Y") for event in events[date]: descr = event.text_content() e = Event(session, date_time, "other", descr, location="state house") e.add_source(url) self.save_event(e)
def scrape_house_weekly_schedule(self, session): url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm" with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."): guid = link.attrib['href'] committee = link.xpath("string(../../../td[1])").strip() when_and_where = link.xpath("string(../../../td[2])").strip() location = when_and_where.split(',')[-1] when = parse_datetime(when_and_where, session) description = 'Committee Meeting: %s' % committee event = Event(session, when, 'committee:meeting', description, location=location) event.add_participant('committee', committee) event['link'] = guid self.save_event(event)
def scrape_committee_events(self, session, code, name):
    events_url = \
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
        'comm_code={}'.format(code)
    events_data = self.get(events_url).text
    events = json.loads(events_data)

    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in events:
        if info['title'] is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        elif info['title'].startswith('CANCELLED:'):
            self.info(
                "Cancelled event found; it will be skipped: {}".format(
                    info['title']))
            continue

        event = Event(
            session=session,
            when=datetime.datetime.strptime(info['start'], DATETIME_FORMAT),
            end=datetime.datetime.strptime(info['end'], DATETIME_FORMAT),
            type='committee:meeting',
            description=info['title'],
            location="{0} {1}".format(info['building'].strip(),
                                      info['location'].strip()))
        event.add_source(events_url)
        self.save_event(event)
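# Illustrative only: one entry of the JSON array that commevents.php appears
# to return, inferred from the keys the scraper above reads ('title', 'start',
# 'end', 'building', 'location') and its DATETIME_FORMAT. All values here are
# hypothetical placeholders, not real data.
EXAMPLE_CT_COMMITTEE_EVENT = {
    "title": "Appropriations Committee Public Hearing",
    "start": "2016-03-01T10:00:00Z",
    "end": "2016-03-01T12:00:00Z",
    "building": "LOB",
    "location": "Room 2C",
}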
def scrape_upper(self, session): url = "http://www.oksenate.gov/Committees/meetingnotices.htm" page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) for link in page.xpath("//a[contains(@href, 'Meeting_Notice')]"): comm = link.text.strip() comm = re.sub(r'\s+', ' ', comm) if link.getnext().text == 'Cancelled': continue date_path = "../../preceding-sibling::p[@class='MsoNormal']" date = link.xpath(date_path)[-1].xpath("string()") time_loc = link.xpath("../br")[0].tail.strip() time = re.match("\d+:\d+ (am|pm)", time_loc).group(0) location = time_loc.split(', ')[1].strip() dt = "%s %s" % (date, time) dt = datetime.datetime.strptime(dt, "%A, %B %d, %Y %I:%M %p") event = Event(session, dt, 'committee:meeting', "%s Committee Meeting" % comm, location) event.add_source(url) self.save_event(event)
def scrape(self, chamber, session):
    bills_discussed = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        bills_discussed[(location, date)].append(hearing.bill_id)

    for ((location, date), bills) in bills_discussed.iteritems():
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bills]

        desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
            location, ', '.join(bills))

        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location)

        self.save_event(event)
def scrape_event(self, chamber, session, obj):
    meeting = obj['data']['meeting']
    date = int(meeting['meetingDateTime'])
    date = dt.datetime.fromtimestamp(date / 1000)
    if str(date.year) not in session:
        return

    description = 'Committee Meeting: ' + meeting['committeeName']
    event = Event(session, date, 'committee:meeting',
                  description=description,
                  location=meeting['location'] or 'No location given.')
    event.add_source(obj['url'])
    event.add_participant('chair', meeting['committeeChair'],
                          'legislator', chamber='upper')
    event.add_participant('host', meeting['committeeName'],
                          'committee', chamber='upper')

    rgx = r'([a-z]+)(\d+)'
    for bill in meeting['bills']:
        raw_id = bill['senateBillNo']
        bill_id = ' '.join(re.search(rgx, raw_id, re.I).groups())
        event.add_related_bill(
            bill_id, type='bill',
            description=bill['summary'] or 'No description given.')
    return event
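# Illustrative only: the minimal shape of `obj` that scrape_event() above
# appears to expect, inferred from the keys it reads. Every value is a
# hypothetical placeholder; 'meetingDateTime' is an epoch timestamp in
# milliseconds, and 'url' is whatever source URL the caller recorded.
EXAMPLE_MEETING_OBJ = {
    "url": "http://example.com/meetings/finance.json",
    "data": {
        "meeting": {
            "meetingDateTime": 1330520400000,
            "committeeName": "Finance",
            "committeeChair": "Jane Doe",
            "location": "Room 124 CAP",
            "bills": [
                {"senateBillNo": "S1234", "summary": "An example bill summary"},
            ],
        },
    },
}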
def scrape_lower_events(self, session): url = "http://assembly.state.ny.us/leg/?sh=hear" year = datetime.date.today().year with self.urlopen(url) as page: page = lxml.html.fromstring(page) for td in page.xpath("//td[@bgcolor='#99CCCC']"): desc = td.xpath("string(following-sibling::td/strong)") if "Senate Standing Committee" in desc: # We should pick these up from the upper scraper continue notes = td.xpath("string(../following-sibling::tr[1]/td[2])") notes = re.sub(r"\*\*Click here to view hearing notice\*\*", "", notes).strip() location = td.xpath("string(../following-sibling::tr[2]/td[2])") date = " ".join(td.text.split()[0:2]).strip() time = td.xpath("../following-sibling::tr[3]/td[2]")[0] split_time = time.text.split("-") when = "%s %d %s" % (date, year, split_time[0].strip()) when = _parse_date(when.replace(".", "")) end = None if len(split_time) > 1: end = "%s %d %s" % (date, year, split_time[1].strip()) end = _parse_date(end.replace(".", "")) event = Event(session, when, "committee:meeting", desc, location, end=end, notes=notes) event.add_source(url) self.save_event(event)
def scrape_upper(self, session): url = "http://www.oksenate.gov/Committees/meetingnotices.htm" page = lxml.html.fromstring(self.get(url).text) page.make_links_absolute(url) text = page.text_content() _, text = text.split('MEETING NOTICES') re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}' chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:]) for match, data in chunks: when = match.group() when = datetime.datetime.strptime(when, "%A, %B %d, %Y") lines = filter(None, [x.strip() for x in data.splitlines()]) time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1) time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM') time_ = time.strptime(time_, '%I:%M %p') when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min) title = lines[0] where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1) where = where.strip() event = Event(session, when, 'committee:meeting', title, location=where) event.add_source(url) self.save_event(event)
def actions_to_events(state):
    for bill in db.bills.find({'state': state}):
        print "Converting %s actions to events" % bill['_id']

        count = 1
        for action in bill['actions']:
            guid = "%s:action:%06d" % (bill['_id'], count)
            count += 1

            event = db.events.find_one({'state': state, '_guid': guid})

            description = "%s: %s" % (bill['bill_id'], action['action'])

            data = Event(bill['session'], action['date'], 'bill:action',
                         description, location=action['actor'],
                         action_type=action['type'])
            data.add_participant('actor', action['actor'])
            data['_guid'] = guid
            data['state'] = state

            if not event:
                data['created_at'] = datetime.datetime.utcnow()
                data['updated_at'] = data['created_at']
                _insert_with_id(data)
            else:
                update(event, data, db.events)
def scrape(self, chamber, session): if chamber == 'upper': url = ("http://www.nysenate.gov/calendar/ical/" "senator%3DAll%2526type%3D3%2526committee%3DAll" "%2526initiative%3DAll") else: return with self.urlopen(url) as page: cal = icalendar.Calendar.from_string(page) for comp in cal.walk(): if comp.name != 'VEVENT': continue text = str(comp['SUMMARY']) if 'Committee Meeting' not in text: continue start = comp['DTSTART'].dt end = comp['DTEND'].dt uid = str(comp['UID']) event_url = comp['URL'] location = self.get_upper_location(event_url) print location event = Event(session, start, 'committee:meeting', text, location, end) event.add_source(url) event.add_source(event_url) self.save_event(event)
def scrape(self, chamber, session): url = "http://www.legislature.state.oh.us/today.cfm" with self.urlopen(url) as page: page = lxml.html.fromstring(page) for td in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"): date = td.text.strip() if chamber == 'upper': desc = td.getnext().text.strip() else: desc = td.getnext().getnext().text.strip() match = re.match(r'^Session at (\d+:\d+ [pa]\.m\.)', desc) if match: time = match.group(1) time = time.replace('a.m.', 'AM').replace('p.m.', 'PM') when = "%s 2011 %s" % (date, time) when = datetime.datetime.strptime(when, "%a. %b %d %Y %I:%M %p") when = self._tz.localize(when) chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber] event = Event(session, when, 'floor_time', desc, "%s Chamber" % chamber_name) event.add_source(url) self.save_event(event)
def scrape_meetings(self, meetings, group): """ Scrape and save event data from a list of meetings. Arguments: meetings -- A list of lxml elements containing event information group -- The type of meeting. The legislature site applies different formatting to events based on which group they correspond to. `group` should be one of the following strings: 'house', 'senate', or 'commission'. """ for meeting in meetings: when = self.get_date(meeting) description = self.get_description(meeting) location = self.get_location(meeting) if when and description and location: kwargs = {} if group in self.metadata["chambers"].keys(): kwargs["chamber"] = group agenda = self.get_agenda(meeting) if agenda: kwargs["agenda"] = agenda # Event prototype is as follows: # class Event(SourcedObject): # def __init__(self, session, when, type, # description, location, end=None, **kwargs) event = Event(self.session, when, "committee:meeting", description, location, **kwargs) event.add_source(url) self.save_event(event)
def scrape(self, chamber, session):
    if session != '2011 Regular Session':
        raise NoDataForPeriod(session)

    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings
                continue

            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M%p")
            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            event = Event(session, when, 'committee:meeting',
                          desc, location=location)
            event.add_source(url)
            self.save_event(event)
def scrape(self, chamber, session): r = requests.get("http://google.sannet.gov/search?num=100&requiredfields=PATH:councildockets|PATH:councilminutes|PATH:councilresults&getfields=DOCUMENT_URL.DOC_DATE.TITLE.SORTORDER&sort=date:D:S:d1&output=xml_no_dtd&ie=UTF-8&client=scs_ocd&filter=0&site=documents&config=sirecouncilmeetings.js&proxystylesheet=sirefrontend&q=Council+inmeta:DOC_DATE_NUM:20130101..20140101") soup = BeautifulSoup(r.text) table = soup.find_all('table')[-1] rows = table.findAll('tr') for row in rows: date_cell = row.findAll('script')[0].text if date_cell.startswith('build_date_cell'): date = date_cell[17:27] link = row.find('a') url = link['href'] title = link.text when = "%s" % (date) when = datetime.datetime.strptime(when, "%Y-%M-%d") when = self._tz.localize(when) desc = title #event = div.xpath("string(span[3])").strip() # XXX: Process `event' for related bills. event = Event(session, when, 'council:meeting',desc, location=None) event.add_source(url) # desc is actually the ctty name. #event.add_participant('host', desc, 'committee', # chamber=chamber) self.save_event(event)
def scrape_upper_events(self, session): url = "http://flsenate.gov/Session/DailyCalendarRSS.cfm?format=rss" with self.urlopen(url) as page: feed = feedparser.parse(page) for entry in feed['entries']: if 'Committee' not in entry['summary']: continue date = datetime.datetime(*entry['updated_parsed'][:6]) match = re.match(r'(\d+):(\d+)', entry['title']) if match: when = datetime.datetime(date.year, date.month, date.day, int(match.group(1)), int(match.group(2)), 0) when = self._tz.localize(when) desc = entry['summary'].split(' - ')[0] location = entry['summary'].split(' - ')[1] event = Event(session, when, 'committee:meeting', desc, location) event.add_source(url) self.save_event(event)
def scrape(self, chamber, session): if chamber == "other": return # XXX: Change to invocation? if chamber == "lower": self.house_meetings() url = "http://www.legislature.state.oh.us/today.cfm" with self.urlopen(url) as page: page = lxml.html.fromstring(page) for td in page.xpath("//td[@bgcolor='FFEAD5' and @height='25']"): date = td.text.strip() if chamber == "upper": desc = td.getnext().text.strip() else: desc = td.getnext().getnext().text.strip() match = re.match(r"^Session at (\d+:\d+ [pa]\.m\.)", desc) if match: time = match.group(1) time = time.replace("a.m.", "AM").replace("p.m.", "PM") when = "%s 2011 %s" % (date, time) when = datetime.datetime.strptime(when, "%a. %b %d %Y %I:%M %p") when = self._tz.localize(when) chamber_name = {"upper": "Senate", "lower": "House"}[chamber] event = Event(session, when, "floor_time", desc, "%s Chamber" % chamber_name) event.add_source(url) self.save_event(event)
def scrape(self, chamber, session):
    bills_discussed = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        bills_discussed[(location, date)].append(hearing.bill_id)

    for ((location, date), bills) in bills_discussed.iteritems():
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bills]

        desc = 'Committee Meeting\n%s\nDiscussed: %s' % (
            location, ', '.join(bills))

        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location, 'committee')

        self.save_event(event)
def scrape(self, chamber, session): if chamber == 'other': return url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt" page = self.urlopen(url) page = csv.reader(StringIO.StringIO(page.bytes), delimiter='|') for row in page: desc = row[7].strip() match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc) if match: comm_chamber = {'HOUSE': 'lower', 'SENATE': 'upper'}[match.group(2)] if comm_chamber != chamber: continue comm = match.group(1).strip() comm = re.sub(r'\s+', ' ', comm) location = row[5].strip() or 'Unknown' when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S') event = Event(session, when, 'committee:meeting', "%s MEETING" % comm, location=location) event.add_source(url) event.add_participant('host', comm, chamber=chamber) time = row[3].strip() if time in TIMECODES: event['notes'] = TIMECODES[time] self.save_event(event)
def scrape_committee_events(self, session, code, name):
    events_url = \
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
        'comm_code={}'.format(code)
    events_data = self.get(events_url).text
    events = json.loads(events_data)

    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in events:
        if info['title'] is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        elif info['title'].startswith('CANCELLED:'):
            self.info("Cancelled event found; it will be skipped: {}".format(
                info['title']))
            continue

        event = Event(
            session=session,
            when=datetime.datetime.strptime(info['start'], DATETIME_FORMAT),
            end=datetime.datetime.strptime(info['end'], DATETIME_FORMAT),
            type='committee:meeting',
            description=info['title'],
            location="{0} {1}".format(info['building'].strip(),
                                      info['location'].strip())
        )
        event.add_source(events_url)
        self.save_event(event)
def scrape_upper(self, session): url = "http://www.oksenate.gov/Committees/meetingnotices.htm" page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) text = page.text_content() _, text = text.split('MEETING NOTICES') re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}' chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:]) for match, data in chunks: when = match.group() when = datetime.datetime.strptime(when, "%A, %B %d, %Y") lines = filter(None, [x.strip() for x in data.splitlines()]) time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1) time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM') time_ = time.strptime(time_, '%I:%M %p') when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min) title = lines[0] where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1) where = where.strip() event = Event(session, when, 'committee:meeting', title, location=where) event.add_source(url) self.save_event(event)
def scrape_page(self, url, chamber, session):
    page = self.lxmlize(url)

    info_blocks = {
        "canceled": "//div[@class='cancelled']",
        "committee": "//div[@class='titlemeetingtype']",
        "chamber": "//div[@class='titlehouse']",
        "datetime": "//div[@class='datetimelocation']"
    }

    metainf = {}
    for block in info_blocks:
        info = page.xpath(info_blocks[block])
        if info == []:
            continue
        metainf[block] = {
            "obj": info[0],
            "txt": info[0].text_content()
        }

    if 'committee' not in metainf:
        return

    if 'canceled' in metainf:
        return

    obj = metainf['datetime']['obj']
    dates = obj.xpath("./*")
    date_time = obj.text.strip()
    for date in dates:
        if date.tail is not None:
            date_time += " %s" % (date.tail.strip())
    # Wednesday, May 23, 2012 10:00 AM 417 North (GAR Hall) State Capitol

    splits = ['AM', 'PM']
    date_times = None
    for split in splits:
        if split in date_time:
            date_times = [x.strip() for x in date_time.split(split, 1)]
            date_times[0] += " " + split

    time = date_times[0]
    place = date_times[1]

    committee = metainf['committee']['txt']
    chamber = metainf['chamber']['txt']

    try:
        chamber = {
            "Senate": "upper",
            "Assembly": "lower",
            "Joint": "joint"
        }[chamber]
    except KeyError:
        chamber = 'other'

    # Wednesday, May 23, 2012 10:00 AM
    datetime = dt.datetime.strptime(time, "%A, %B %d, %Y %I:%M %p")
    event = Event(session, datetime, 'committee:meeting', committee,
                  location=place)
    event.add_participant('host', committee, 'committee', chamber=chamber)
    event.add_source(url)
    self.save_event(event)
def scrape(self, chamber, session):
    seen = set()

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        if (location, date) in seen:
            continue
        seen.add((location, date))

        desc = 'Committee Meeting\n%s' % location
        event = Event(session, date, 'committee:meeting', desc,
                      location=location)
        event.add_participant('committee', location)

        self.save_event(event)
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to. `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)

        if when and description and location:
            kwargs = {}
            if group in self.metadata['chambers'].keys():
                kwargs['chamber'] = group

            agenda = self.get_agenda(meeting)
            if agenda:
                kwargs['agenda'] = agenda

            # Event prototype is as follows:
            # class Event(SourcedObject):
            #     def __init__(self, session, when, type,
            #                  description, location, end=None, **kwargs)
            event = Event(self.session, when, 'committee:meeting',
                          description, location, **kwargs)
            # NOTE: `url` is not defined in this function; it is assumed
            # to refer to a module-level constant for the calendar page.
            event.add_source(url)
            self.save_event(event)
def scrape(self, chamber, session): if chamber == "other": return url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt" page = self.urlopen(url) page = csv.reader(StringIO.StringIO(page.bytes), delimiter="|") for row in page: desc = row[7].strip() match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc) if match: comm_chamber = {"HOUSE": "lower", "SENATE": "upper"}[match.group(2)] if comm_chamber != chamber: continue comm = match.group(1).strip() comm = re.sub(r"\s+", " ", comm) location = row[5].strip() or "Unknown" when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S") event = Event(session, when, "committee:meeting", "%s MEETING" % comm, location=location) event.add_source(url) event.add_participant("committee", comm, chamber=chamber) time = row[3].strip() if time in TIMECODES: event["notes"] = TIMECODES[time] self.save_event(event)
def scrape_committee_events(self, session, code, name): url = "http://www.cga.ct.gov/asp/menu/" "CGACommCal.asp?comm_code=%s" % code with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) cal_table = page.xpath("//table[contains(@summary, 'Calendar')]")[0] date_str = None for row in cal_table.xpath("tr[2]//tr"): col1 = row.xpath("string(td[1])").strip() col2 = row.xpath("string(td[2])").strip() if not col1: if col2 == "No Meetings Scheduled": return # If col1 is empty then this is a date header date_str = col2 else: # Otherwise, this is a committee event row when = date_str + " " + col1 when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M %p") when = self._tz.localize(when) location = row.xpath("string(td[3])").strip() guid = row.xpath("td/a")[0].attrib["href"] event = Event(session, when, "committee meeting", col2, location, _guid=guid) event.add_source(url) event.add_participant("committee", name, chamber="joint") self.save_event(event)
def scrape_upper_events(self, session):
    url = ("http://www.nysenate.gov/calendar/ical/"
           "senator%3DAll%2526type%3D3%2526committee%3DAll"
           "%2526initiative%3DAll")

    with self.urlopen(url) as page:
        cal = icalendar.Calendar.from_string(page)

        for comp in cal.walk():
            if comp.name != "VEVENT":
                continue

            text = str(comp["SUMMARY"])
            if "Committee Meeting" not in text:
                continue

            start = _tz.localize(comp["DTSTART"].dt)
            end = _tz.localize(comp["DTEND"].dt)
            uid = str(comp["UID"])
            event_url = comp["URL"]

            location = self.get_upper_location(event_url)

            event = Event(session, start, "committee:meeting",
                          text, location, end)
            event.add_source(url)
            event.add_source(event_url)

            self.save_event(event)
def scrape(self, chamber, session): url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx" with self.urlopen(url) as page: page = lxml.html.fromstring(page) for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"): date = div.xpath("string(../../span[1])").strip() try: time, location = div.xpath("string(span[1])").split(',') except ValueError: # No meetings continue when = "%s %s" % (date, time) when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p") when = self._tz.localize(when) desc = div.xpath("string(span[2])").strip() agenda = div.xpath("string(span[3])").strip() # XXX: Process `agenda' for related bills. event = Event(session, when, 'committee:meeting', desc, location=location) event.add_source(url) # desc is actually the ctty name. event.add_participant('host', desc, 'committee', chamber=chamber) self.save_event(event)
def scrape(self, chamber, session):
    if session != '27':
        raise NoDataForPeriod(session)

    if chamber == 'other':
        return

    year, year2 = None, None
    for term in self.metadata['terms']:
        if term['sessions'][0] == session:
            year = str(term['start_year'])
            year2 = str(term['end_year'])
            break

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year[2:]

    url = ("http://www.legis.state.ak.us/basis/"
           "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
           "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
        for font in page.xpath(path):
            match = re.match(r'^\((H|S)\)(.+)$', font.text)

            chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
            comm = match.group(2).strip().title()

            next_row = font.xpath("../../following-sibling::tr[1]")[0]

            when = next_row.xpath("string(td[1]/font)").strip()
            when = datetime.datetime.strptime(when + " " + year,
                                              "%b %d %A %I:%M %p %Y")
            when = self._tz.localize(when)

            where = next_row.xpath("string(td[2]/font)").strip()

            description = "Committee Meeting\n"
            description += comm

            # Create the event before attaching the agenda link to it.
            event = Event(session, when, 'committee:meeting',
                          description, location=where)

            links = font.xpath(
                "../../td/font/a[contains(@href, 'get_documents')]")
            if links:
                agenda_link = links[0]
                print agenda_link
                event['link'] = agenda_link.attrib['href']

            event.add_source(url)
            self.save_event(event)
def scrape(self, chamber, session): if chamber != "other": return None page = self.lxmlize(url) meetings = page.xpath("//div[@class='Comm_item']") for meeting in meetings: metas = meeting.xpath(".//b") ctty = meeting.xpath(".//a")[0] ctty_name = ctty.text_content() info = metas[1:] datetime = metas[0] metainf = {} for meta in info: header = meta.text_content().strip() val = meta.tail metainf[header] = val or "" datetime = datetime.text_content().strip() # Tuesday, June 05, 2012 9:00 AM if "Canceled" in datetime: continue formats = [ "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y" ] date_time = None for fmt in formats: try: date_time = dt.datetime.strptime( datetime, fmt) except ValueError: pass if date_time is None: continue event = Event(chamber, date_time, 'committee:meeting', ctty_name, location=metainf['Room:'] or "State House" ) event.add_source(url) chamber = "other" chambers = { "house": "lower", "joint": "joint", "senate": "upper", } for c in chambers: if c in ctty_name.lower(): chamber = chambers[c] event.add_participant('host', ctty_name, chamber=chamber) # add chair? self.save_event(event)
def scrape(self, chamber, session): start_date = "%s-01-10T00:00:00" % session[0:4] end_date = "%d-01-10T00:00:00" % (int(session[5:10]) + 1) url = ("http://wslwebservices.leg.wa.gov/CommitteeMeetingService" x ".asmx/GetCommitteeMeetings?beginDate=%s" "&endDate=%s" % (start_date, end_date)) expected_agency = {'upper': 'Senate', 'lower': 'House'}[chamber] with self.urlopen(url) as page: page = lxml.etree.fromstring(page) for meeting in page.xpath( "//wa:CommitteeMeeting", namespaces=self._ns): cancelled = meeting.xpath( "string(wa:Cancelled)", namespaces=self._ns).strip() if cancelled.lower() == "true": continue agency = meeting.xpath( "string(wa:Agency)", namespaces=self._ns).strip() if agency != expected_agency: continue dt = meeting.xpath("string(wa:Date)", namespaces=self._ns) dt = datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S") room = meeting.xpath("string(wa:Room)", namespaces=self._ns) building = meeting.xpath( "string(wa:Building)", namespaces=self._ns) location = "%s, %s" % (room, building) comm = meeting.xpath( "string(wa:Committees/wa:Committee[1]/wa:Name)", namespaces=self._ns) desc = "Committee Meeting\n%s" % comm guid = meeting.xpath( "string(wa:AgendaId)", namespaces=self._ns) event = Event(session, dt, 'committee:meeting', desc, location=location, _guid=guid) for comm_part in meeting.xpath( "wa:Committees/wa:Committee", namespaces=self._ns): name = comm_part.xpath("string(wa:Name)", namespaces=self._ns) agency = comm_part.xpath("string(wa:Agency)", namespaces=self._ns) name = "%s %s Committee" % (agency, name) event.add_participant('committee', name) self.save_event(event)
def scrape_house_weekly_schedule(self, session): url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm" page = self.urlopen(url) page = lxml.html.fromstring(page) page.make_links_absolute(url) for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."): try: guid = link.attrib['href'] except KeyError: continue # Sometimes we have a dead link. This is only on # dead entries. committee = link.xpath("string(../../td[1])").strip() when_and_where = link.xpath("string(../../td[2])").strip() when_and_where = re.sub("\s+", " ", when_and_where).strip() if "@" in when_and_where: continue # Contains no time data. if when_and_where.strip() == "": continue info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)", when_and_where).groupdict() when_and_where = info['when'] location = info['where'] year = datetime.datetime.now().year when = parse_datetime(when_and_where, year) # We can only scrape # when = self._tz.localize(when) bills = self.scrape_bills(when_and_where) description = 'Committee Meeting: %s' % committee event = Event(session, when, 'committee:meeting', description, location=location) event.add_source(url) event.add_participant('host', committee, 'committee', chamber='lower') event.add_document("Agenda", guid, type='agenda', mimetype="application/pdf") for bill in bills: event.add_related_bill(bill, description=when_and_where, type='consideration') event['link'] = guid self.save_event(event)
def parse_row(self, row, session, chamber):
    dates = row.xpath("./td[@class='dateCell']")
    for date in dates:
        # alright, so we *may* not get a date, in which case the date
        # is the same as the last event.
        cal_date = date.xpath("./span[@class='calendarMonth']")[0]
        cal_day = date.xpath("./span[@class='calendarDay']")[0]
        self.last_month = cal_date.text_content()
        self.last_day = cal_day.text_content()

    time = row.xpath("./td[@class='timeCell']")
    if not time:
        return  # Nada.
    time = time[0]
    time = time.text.strip()

    dt_string = "%s %s %s %s" % (
        self.last_month,
        self.last_day,
        self.year,
        time
    )
    fmt = "%b %d %Y %I:%M %p"
    when = dt.datetime.strptime(dt_string, fmt)

    cells = {
        "event": "eventCell",
        "status": "statusCell",
        "location": "locationCell",
        "transcript": "transcriptCell",
        "video": "videoCell"
    }
    metainf = {}
    for thing in cells:
        mi = row.xpath("./td[@class='" + cells[thing] + "']")
        if mi == []:
            continue
        metainf[thing] = mi[0]

    if metainf['location'].xpath("./*") == []:
        metainf['location'] = self.last_location
    else:
        self.last_location = metainf['location']

    if "Session" in metainf['event'].text_content().strip():
        return  # Nada.

    loc_url = metainf['location'].xpath(".//a")
    loc_url = loc_url[0].attrib['href']

    event = Event(session, when, 'committee:meeting',
                  metainf['event'].text_content().strip(),
                  chamber=chamber,
                  location=metainf['location'].text_content().strip(),
                  location_url=loc_url)
    event.add_participant("host",
                          metainf['event'].text_content().strip(),
                          'committee',
                          chamber=chamber)
    self.add_agenda(event,
                    metainf['event'].xpath(".//a")[0].attrib['href'])
    return event
def scrape(self, session, chambers):
    get_short_codes(self)

    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")

        event = Event(session, when, 'committee:meeting', descr,
                      location=where)

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                committee = self.short_ids[committee]
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }

            event.add_participant('host', committee['name'], 'committee',
                                  chamber=committee['chamber'])

        event.add_source(URL)
        event.add_document(notice_name, notice_href, mimetype='text/html')

        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type=bill['type'])

        self.save_event(event)
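# Illustrative only: the mapping that get_short_codes() appears to populate
# as self.short_ids, inferred from how the scraper above uses it -- each
# committee short code maps to a dict carrying 'name' and 'chamber'. The
# codes and values shown here are hypothetical.
EXAMPLE_SHORT_IDS = {
    "FIN": {"name": "Finance", "chamber": "lower"},
    "WAM": {"name": "Ways and Means", "chamber": "upper"},
}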
def parse_row(self, row, session, chamber):
    dates = row.xpath("./td[@class='dateCell']")
    for date in dates:
        # alright, so we *may* not get a date, in which case the date
        # is the same as the last event.
        cal_date = date.xpath("./span[@class='calendarMonth']")[0]
        cal_day = date.xpath("./span[@class='calendarDay']")[0]
        self.last_month = cal_date.text_content()
        self.last_day = cal_day.text_content()

    time = row.xpath("./td[@class='timeCell']")
    if not time:
        return  # Nada.
    time = time[0]
    time = time.text.strip()

    dt_string = "%s %s %s %s" % (
        self.last_month,
        self.last_day,
        self.year,
        time
    )
    fmt = "%b %d %Y %I:%M %p"
    when = dt.datetime.strptime(dt_string, fmt)

    cells = {
        "event": "eventCell",
        "status": "statusCell",
        "location": "locationCell",
        "transcript": "transcriptCell",
        "video": "videoCell"
    }
    metainf = {}
    for thing in cells:
        mi = row.xpath("./td[@class='" + cells[thing] + "']")
        if mi == []:
            continue
        metainf[thing] = mi[0]

    if metainf['location'].xpath("./*") == []:
        metainf['location'] = self.last_location
    else:
        self.last_location = metainf['location']

    if "Session" in metainf['event'].text_content().strip():
        return  # Nada.

    loc_url = metainf['location'].xpath(".//a")
    loc_url = loc_url[0].attrib['href']

    event = Event(session, when, 'committee:meeting',
                  metainf['event'].text_content().strip(),
                  chamber=chamber,
                  location=metainf['location'].text_content().strip(),
                  location_url=loc_url)
    event.add_participant("host",
                          metainf['event'].text_content().strip(),
                          'committee',
                          chamber=chamber)
    self.add_agenda(event,
                    metainf['event'].xpath(".//a")[0].attrib['href'])
    return event
def scrape(self, chamber, session):
    chmbr = cal_chamber_text[chamber]
    tables = self.url_xpath(cal_weekly_events,
                            "//table[@class='date-table']")
    for table in tables:
        date = table.xpath("../.")[0].getprevious().text_content()
        trs = table.xpath("./tr")
        for tr in trs:
            order = ["time", "chamber", "type", "agenda", "location",
                     "video"]

            tds = tr.xpath("./td")
            metainf = {}

            if not tds:
                continue

            for el in range(0, len(order)):
                metainf[order[el]] = tds[el]

            if metainf['chamber'].text_content() == chmbr:
                self.log("Skipping event based on chamber.")
                continue

            time = metainf['time'].text_content()
            datetime_string = "%s %s" % (date, time)
            location = metainf['location'].text_content()
            description = metainf['type'].text_content()

            dtfmt = "%A, %B %d, %Y %I:%M %p"
            if time == 'Cancelled':
                self.log("Skipping cancelled event.")
                continue
            else:
                if ' Immediately follows' in datetime_string:
                    datetime_string, _ = datetime_string.split(
                        'Immediately follows')
                    datetime_string = datetime_string.strip()
                    dtfmt = "%A, %B %d, %Y"

            when = dt.datetime.strptime(datetime_string, dtfmt)
            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_participant("host", description, 'committee',
                                  chamber=chamber)
            event.add_source(cal_weekly_events)

            agenda = metainf['agenda'].xpath(".//a")
            if len(agenda) > 0:
                for doc in agenda:
                    if not doc.text_content():
                        continue
                    agenda_url = doc.attrib['href']
                    self.add_agenda(agenda_url, doc.text_content(), event)

            self.save_event(event)
def scrape(self, chamber, session): if chamber == "other": return today = datetime.date.today() start_date = today - datetime.timedelta(days=10) end_date = today + datetime.timedelta(days=10) if chamber == "upper": chamber_abbrev = "S" else: chamber_abbrev = "H" url = ( "http://www.legis.iowa.gov/Schedules/meetingsList" "Chamber.aspx?chamber=%s&bDate=%02d/%02d/" "%d&eDate=%02d/%02d/%d" % ( chamber_abbrev, start_date.month, start_date.day, start_date.year, end_date.month, end_date.day, end_date.year, ) ) page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) for link in page.xpath("//a[contains(@id, 'linkCommittee')]"): comm = link.text.strip() desc = comm + " Committee Hearing" location = link.xpath("string(../../td[3])") when = link.xpath("string(../../td[1])").strip() if when == "Cancelled" or "Upon" in when: continue if "To Be Determined" in when: continue if "AM" in when: when = when.split("AM")[0] + " AM" else: when = when.split("PM")[0] + " PM" junk = ["Reception"] for key in junk: when = when.replace(key, "") when = re.sub("\s+", " ", when).strip() when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p") event = Event(session, when, "committee:meeting", desc, location) event.add_source(url) event.add_participant("host", comm, "committee", chamber=chamber) self.save_event(event)
def scrape(self, chamber, session):
    if session != '27':
        raise NoDataForPeriod(session)

    if chamber == 'other':
        return

    year, year2 = None, None
    for term in self.metadata['terms']:
        if term['sessions'][0] == session:
            year = str(term['start_year'])
            year2 = str(term['end_year'])
            break

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year[2:]

    url = ("http://www.legis.state.ak.us/basis/"
           "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
           "Comty=&Root=&Sel=1&Button=Display" % (
               session, date1, date2))

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
        for font in page.xpath(path):
            match = re.match(r'^\((H|S)\)(.+)$', font.text)

            chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
            comm = match.group(2).strip().title()

            next_row = font.xpath("../../following-sibling::tr[1]")[0]

            when = next_row.xpath("string(td[1]/font)").strip()
            when = datetime.datetime.strptime(when + " " + year,
                                              "%b %d %A %I:%M %p %Y")
            when = self._tz.localize(when)

            where = next_row.xpath("string(td[2]/font)").strip()

            description = "Committee Meeting\n"
            description += comm

            # Create the event before attaching the agenda link to it.
            event = Event(session, when, 'committee:meeting',
                          description, location=where)

            links = font.xpath(
                "../../td/font/a[contains(@href, 'get_documents')]")
            if links:
                agenda_link = links[0]
                print agenda_link
                event['link'] = agenda_link.attrib['href']

            event.add_source(url)
            self.save_event(event)
def scrape_agenda(self, url, session):
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf['DATE:']
    time = metainf['TIME:']
    where = metainf['PLACE:']
    fmt = "%A, %B %d, %Y"
    if time in all_day:
        datetime = date
    else:
        fmt += " %I:%M %p"
        datetime = "%s %s" % (date, time)
    datetime = dt.datetime.strptime(datetime, fmt)

    event = Event(session, datetime, 'committee:meeting',
                  'Meeting Notice', location=where)
    event.add_source(url)

    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib['href']
        event.add_document(bill.text_content(), bill_ft,
                           type="full-text", mimetype="application/pdf")
        root = bill.xpath('../../*')
        root = [x.text_content() for x in root]
        bill_id = "".join(root)

        if "SCHEDULED FOR" in bill_id:
            continue

        descr = (bill.getparent().getparent().getparent()
                 .getnext().getnext().text_content())

        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])

        event.add_related_bill(bill_id,
                               description=descr,
                               type='consideration')

    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    chambers = {
        "house": "lower",
        "joint": "joint",
        "senate": "upper"
    }
    chamber = "other"
    for key in chambers:
        if key in committee.lower():
            chamber = chambers[key]

    event.add_participant("host", committee, chamber=chamber)
    self.save_event(event)
def test_event():
    e = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
              'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')

    assert_equal(e['documents'],
                 [{'name': 'agenda',
                   'url': 'http://example.com/event/agenda.txt',
                   'type': 'other'}])
    assert_equal(e['related_bills'],
                 [{'bill_id': 'HB 1', 'relation': 'considered'}])
def scrape(self, session, chambers):
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

        if len(comit_url) != 1:
            raise Exception

        comit_url = comit_url[0]
        who = self.scrape_participants(session, comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(session, when, 'committee:meeting', name,
                      location=where, link=notice)
        event.add_source(calurl)

        event.add_participant('host', cttie, 'committee',
                              chamber=cttie_chamber)
        event.add_document("notice", notice, mimetype='application/pdf')

        for thing in who:
            event.add_participant(thing['title'], thing['name'],
                                  'legislator', chamber=cttie_chamber)

        self.save_event(event)
def scrape(self, session, chambers): url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx" page = self.get(url).text page = lxml.html.fromstring(page) for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"): date = div.xpath("string(../../span[1])").strip() try: time, location = div.xpath("string(span[1])").split(',') except ValueError: # No meetings continue if time == "Noon": time = "12:00pm" if ':' not in time: self.warning('skipping event with invalid time: %s', time) continue when = "%s %s" % (date, time) try: when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M%p") except ValueError: when = datetime.datetime.strptime(when, "%A, %B %d, %Y %I:%M %p") when = self._tz.localize(when) desc = div.xpath("string(span[2])").strip() agenda = div.xpath("string(span[3])").strip() # XXX: Process `agenda' for related bills. if desc.lower().strip() in ["house convenes","senate convenes"]: continue event = Event(session, when, 'committee:meeting', desc, location=location) event.add_source(url) # desc is actually the ctty name. if "house" in desc.lower(): chamber = "lower" elif "senate" in desc.lower(): chamber = "upper" elif "joint" in desc.lower(): chamber = "joint" else: self.logger.warning("Event %s chamber is unknown, skipping" % desc) continue event.add_participant('host', desc, 'committee', chamber = chamber) self.save_event(event)
def scrape_house_weekly_schedule(self, session): url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm" page = self.urlopen(url) page = lxml.html.fromstring(page) page.make_links_absolute(url) for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."): try: guid = link.attrib['href'] except KeyError: continue # Sometimes we have a dead link. This is only on # dead entries. committee = link.xpath("string(../../../td[1])").strip() when_and_where = link.xpath("string(../../../td[2])").strip() location = when_and_where.split(',')[-1] if when_and_where.strip() == "": continue year = datetime.datetime.now().year when = parse_datetime(when_and_where, year) # We can only scrape # current year's events in LA. bills = self.scrape_bills(when_and_where) description = 'Committee Meeting: %s' % committee event = Event(session, when, 'committee:meeting', description, location=location) event.add_source(url) event.add_participant('host', committee, 'committee', chamber='lower') event.add_document("Agenda", guid, type='agenda', mimetype="application/pdf") for bill in bills: event.add_related_bill(bill, description=when_and_where, type='consideration') event['link'] = guid self.save_event(event)
def scrape_event_page(self, session, chamber, url, datetime):
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainf = {}
    plaintext = ""
    for p in info:
        content = re.sub("\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainf[key.strip()] = val.strip()

    ctty = metainf['COMMITTEE']
    where = metainf['PLACE']
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainf['PLACE'] = where.strip()
        metainf['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainf:
        chair = metainf['CHAIR']

    plaintext = re.sub("\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(session, datetime, 'committee:meeting', ctty,
                  chamber=chamber, location=where, agenda=plaintext)
    event.add_source(url)
    event.add_participant('host', ctty, 'committee', chamber=chamber)
    if chair is not None:
        event.add_participant('chair', chair, 'legislator',
                              chamber=chamber)

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        event.add_related_bill(bill_id,
                               type='consideration',
                               description='Bill up for discussion')

    self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """
    Scrape upper or lower committee agendas
    """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        if chamber == 'upper':
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table')[0]
        else:
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table/tr/td/table')[0]

        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time,
            # Room, HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]

            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')

            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue

            time = re.match('(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when,
                                                  '%m/%d/%Y %I:%M %p')
            else:
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')

            when = self._tz.localize(when)

            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)

            (description, member_list,
             meeting_type, other) = self.parse_agenda(chamber, link)

            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description)
            event.add_participant('committee', committee)
            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)

            self.save_event(event)
def scrape(self, session, chambers):
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting")
               for x in doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        event = Event(
            session=session,
            when=self._TZ.localize(datetime.datetime.strptime(
                info.xpath('span[@class="col02"]/text()')[0],
                self._DATETIME_FORMAT
            )),
            type='committee:meeting',
            description=" ".join(
                x.strip() for x
                in doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()),
            location=doc.xpath(
                '//div[@class="heading-container"]/span/text()')[0].title()
        )
        event.add_participant(
            type='host',
            participant=info.xpath(
                'span[@class="col01"]/text()')[0].title(),
            participant_type='committee'
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                name=document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        event.add_source(event_url.replace(" ", "%20"))

        self.save_event(event)
def scrape_house_weekly_schedule(self, session): url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx" page = self.lxmlize(url) meeting_rows = page.xpath('//table[@id = "table229"]/tr') valid_meetings = [ row for row in meeting_rows if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content() ] for meeting in valid_meetings: try: guid = meeting.xpath('./td/a[descendant::img[contains(@src, ' '"PDF-AGENDA.png")]]/@href')[0] self.logger.debug(guid) except KeyError: continue # Sometimes we have a dead link. This is only on # dead entries. committee_name = meeting.xpath('./td[1]/text()')[0].strip() meeting_string = meeting.xpath('./td[2]')[0].text_content() if "@" in meeting_string: continue # Contains no time data. date, time, location = ( [s.strip() for s in meeting_string.split(',') if s] + [None] * 3)[:3] self.logger.debug(location) year = datetime.datetime.now().year datetime_string = ' '.join((date, str(year), time)) when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p') when = self._tz.localize(when) description = 'Committee Meeting: {}'.format(committee_name) self.logger.debug(description) event = Event(session, when, 'committee:meeting', description, location=location) event.add_source(url) event.add_participant('host', committee_name, 'committee', chamber='lower') event.add_document('Agenda', guid, type='agenda', mimetype='application/pdf') event['link'] = guid self.save_event(event)
def scrape_upper_events(self, session): url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar" page = self.get(url).text feed = feedparser.parse(page) for entry in feed['entries']: #The feed breaks the RSS standard by making the pubdate the actual event's date, not the RSS item publish date when = datetime.datetime(*entry['published_parsed'][:6]) desc = entry['summary'].split(' - ')[0] location = entry['summary'].split(' - ')[1] event = Event(session, when, 'committee:meeting', desc, location) event.add_source(entry['link']) self.save_event(event)
def process_event(self, data):
    session = self.metadata['terms'][-1]['name']

    event = Event(session=session,
                  when=parse_datetime(data['start_time']),
                  type='committee:meeting',
                  description=data['description'],
                  timezone=data['timezone'],
                  location=data['location']['name'],
                  end=data['end_time'])

    # TODO: participants, documents, related_bills

    for source in data['sources']:
        event.add_source(source['url'])

    self.save_event(event)
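# Illustrative only: the payload shape process_event() above appears to
# consume, inferred from the keys it reads. All values are hypothetical;
# 'start_time' is whatever parse_datetime() accepts, and 'end_time' is
# passed through unparsed.
EXAMPLE_EVENT_PAYLOAD = {
    "start_time": "2014-02-12 09:00:00",
    "end_time": "2014-02-12 11:00:00",
    "timezone": "US/Central",
    "description": "Committee hearing",
    "location": {"name": "Room 402, State Capitol"},
    "sources": [{"url": "http://example.com/hearing-notice"}],
}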
def scrape(self, chamber, session):
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for ((location, date), hearings) in grouped_hearings.iteritems():
        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr
                       for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(session, date, 'committee:meeting', desc,
                      location=committee_name)
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            event.add_related_bill(bill_id, type=type_,
                                   description='consideration')

        event.add_participant('host', committee_name + ' Committee',
                              'committee', chamber=chamber)
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        self.save_event(event)
def scrape_lower_events(self, session): url = "http://assembly.state.ny.us/leg/?sh=hear" year = datetime.date.today().year with self.urlopen(url) as page: page = lxml.html.fromstring(page) for td in page.xpath("//td[@bgcolor='#99CCCC']"): desc = td.xpath("string(following-sibling::td/strong)") if 'Senate Standing Committee' in desc: # We should pick these up from the upper scraper continue notes = td.xpath("string(../following-sibling::tr[1]/td[2])") notes = re.sub(r'\*\*Click here to view hearing notice\*\*', '', notes).strip() location = td.xpath( "string(../following-sibling::tr[2]/td[2])") date = ' '.join(td.text.split()[0:2]).strip() time = td.xpath("../following-sibling::tr[3]/td[2]")[0] split_time = time.text.split('-') when = "%s %d %s" % (date, year, split_time[0].strip()) when = _parse_date(when.replace('.', '')) end = None if len(split_time) > 1: end = "%s %d %s" % (date, year, split_time[1].strip()) end = _parse_date(end.replace('.', '')) event = Event(session, when, 'committee:meeting', desc, location, end=end, notes=notes) event.add_source(url) self.save_event(event)
def scrape_page(self, url, session, chamber):
    page = self.lxmlize(url)

    ctty_name = page.xpath(
        "//span[@class='heading']")[0].text_content().replace(
            "Hearing Notice For ", "")
    tables = page.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub("\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

    event = Event(session, datetime, 'committee:meeting', description,
                  location=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # str.replace returns a new string, so reassign the result.
        ctty_name = ctty_name.replace('Hearing Notice For', '')
    event.add_participant('host', ctty_name, 'committee', chamber=chamber)

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        event.add_related_bill(bill_id,
                               description=description,
                               type='consideration')

    self.save_event(event)
def scrape(self, session, chambers):
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        (title, when) = info.xpath('title/text()')[0].split(" - ")
        if not when.endswith(session[:len("20XX")]):
            continue

        event = Event(
            session=session,
            when=datetime.datetime.strptime(when, '%b %d, %Y'),
            type='committee:meeting',
            description=title,
            location='State Capitol'
        )
        event.add_source(URL)

        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        doc = self.lxmlize(url)
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(
                type='host',
                participant=committee_name,
                participant_type='committee'
            )

        documents = doc.xpath('.//td')
        for document in documents:
            event.add_document(
                name=document.xpath('text()')[0],
                url=re.search(r'(http://.*?pdf)',
                              document.xpath('@onclick')[0]).group(1),
                mimetype='application/pdf'
            )

        self.save_event(event)
def scrape(self, chamber, session): if chamber == 'upper': url = "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM" else: url = "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM" with self.urlopen(url) as page: page = lxml.html.fromstring(page) for date_td in page.xpath("//td[@valign='middle']"): date = date_td.text.strip() datetime.datetime.strptime(date_td.text.strip(), "%A, %B %d, %Y").date() next_tr = date_td.getparent().getnext() while next_tr is not None: if next_tr.xpath("td[@valign='middle']"): break time = next_tr.xpath("string(td[1])").strip() dt = "%s %s" % (date, time) try: dt = datetime.datetime.strptime( dt, "%A, %B %d, %Y %I:%M %p") dt = self._tz.localize(dt) except ValueError: break desc = next_tr.xpath("string(td[2])").strip() desc = re.sub(r'\s+', ' ', desc) location = next_tr.xpath("string(td[3])").strip() location = re.sub(r'\s+', ' ', location) event = Event(session, dt, 'committee:meeting', desc, location) event.add_source(url) self.save_event(event) next_tr = next_tr.getnext()
def scrape_committee_events(self, session, code, name):
    events_url = \
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
        'comm_code={}'.format(code)
    events_data = self.urlopen(events_url)
    events = json.loads(events_data)

    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in events:
        event = Event(
            session=session,
            when=datetime.datetime.strptime(info['start'], DATETIME_FORMAT),
            end=datetime.datetime.strptime(info['end'], DATETIME_FORMAT),
            type='committee:meeting',
            description=info['title'],
            location="{0} {1}".format(info['building'].strip(),
                                      info['location'].strip())
        )
        event.add_source(events_url)
        self.save_event(event)