def scrape_page(self, url, session, chamber):
    """Scrape a Utah committee-agenda page and save it as one Event.

    Parses the committee name, date, and time from the page-header spans,
    collects bill IDs referenced in the agenda rows, and saves the Event.

    url     -- agenda page URL
    session -- legislative session identifier
    chamber -- ignored; the chamber is re-derived from the committee name
    """
    try:
        page = self.lxmlize(url)
    except lxml.etree.XMLSyntaxError:
        self.warning("Ugh. Invalid HTML")
        return  # Ugh, invalid HTML.

    agendas = page.xpath("//td[@class='numberspace']")

    spans = page.xpath("//center/span")
    ctty = None
    date = None
    time = None
    if len(spans) >= 4:
        ctty = spans[0].text_content().strip()
        date = spans[2].text_content().strip()
        time = spans[3].text_content().strip()

    bills = []
    for agenda in agendas:
        number = agenda.text_content()
        string = agenda.getnext().text_content().strip()
        # Raw string: "\." and "\d" are regex escapes, not string escapes.
        re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
        for bill in re_bills:
            bill_id = "%s%s %s" % bill
            bills.append({"name": bill_id, "desc": string})

    if ctty is None or date is None or time is None:
        return

    # The date/time span contains "AGENDA", and may carry the location
    # on a second CRLF-separated line.
    datetime = "%s %s" % (date.strip(), time.strip())
    datetime = re.sub("AGENDA", "", datetime).strip()
    datetime = [x.strip() for x in datetime.split("\r\n")]
    if "" in datetime:
        datetime.remove("")
    if len(datetime) == 1:
        datetime.append("state house")
    where = datetime[1]

    translate = {"a.m.": "AM", "p.m.": "PM"}
    for t in translate:
        datetime[0] = datetime[0].replace(t, translate[t])
    datetime = dt.datetime.strptime(datetime[0], "%A, %B %d, %Y %I:%M %p")

    chamber = "other"
    cLow = ctty.lower()
    # BUGFIX: the original checked for "seante" (typo), which can never
    # appear in a real committee name, so Senate events were always "other".
    if "senate" in cLow:
        chamber = "upper"
    elif "house" in cLow:
        chamber = "lower"
    elif "joint" in cLow:
        chamber = "joint"

    event = Event(session, datetime, "committee:meeting", ctty,
                  location=where)
    event.add_source(url)
    event.add_participant("host", ctty, "committee", chamber=chamber)
    for bill in bills:
        event.add_related_bill(bill["name"], description=bill["desc"],
                               type="consideration")
    self.save_event(event)
def scrape_event(self, chamber, session, obj):
    """Build an Event from a Senate meeting JSON payload.

    Returns the Event, or None when the meeting's year is not part of
    the requested session.
    """
    info = obj['data']['meeting']

    # meetingDateTime is epoch milliseconds.
    when = dt.datetime.fromtimestamp(int(info['meetingDateTime']) / 1000)
    if str(when.year) not in session:
        return

    event = Event(session, when, 'committee:meeting',
                  description='Committee Meeting: ' + info['committeeName'],
                  location=info['location'] or 'No location given.')
    event.add_source(obj['url'])
    event.add_participant('chair', info['committeeChair'],
                          'legislator', chamber='upper')
    event.add_participant('host', info['committeeName'],
                          'committee', chamber='upper')

    # Split IDs like "S1234" into "S 1234".
    id_pattern = re.compile(r'([a-z]+)(\d+)', re.I)
    for item in info['bills']:
        parts = id_pattern.search(item['senateBillNo']).groups()
        event.add_related_bill(' '.join(parts), type='bill',
                               description=item['summary'] or 'No description given.')
    return event
def scrape(self, session, chambers):
    """Scrape Utah committee agendas from the Granicus RSS feed.

    Each RSS item whose year matches the session becomes one Event;
    the linked agenda page supplies the host committee and documents.
    """
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # Keep only events whose year matches the session (e.g. "2014...").
        if not when.endswith(session[:len("20XX")]):
            continue

        event = Event(session=session,
                      when=datetime.datetime.strptime(when, '%b %d, %Y'),
                      type='committee:meeting',
                      description=title,
                      location='State Capitol')
        event.add_source(URL)

        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        doc = self.lxmlize(url)
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            # BUGFIX: the name is lowercased before comparison, so the
            # prefixes must be lowercase too — startswith("Senate") /
            # startswith("House") could never match and every committee
            # was classified "joint".
            if committee_name.lower().startswith("senate"):
                chamber = "upper"
            elif committee_name.lower().startswith("house"):
                chamber = "lower"
            else:
                chamber = "joint"
            event.add_participant(type='host',
                                  participant=committee_name,
                                  participant_type='committee',
                                  chamber=chamber)

        documents = doc.xpath('.//td')
        for document in documents:
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(name=document.xpath('text()')[0],
                               url=url,
                               mimetype='application/pdf')
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    event.add_related_bill(
                        bill_name,
                        type='consideration',
                        description='Bill up for discussion')

        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly hearing schedule into Events.

    Each agenda-PDF link row yields one committee-meeting Event with the
    agenda document attached and any bills mentioned in the time/place
    text linked as related bills.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee = link.xpath("string(../../td[1])").strip()

        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub("\s+", " ", when_and_where).strip()

        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue

        # Split the datetime text from the room code ("H..." or "C...-...").
        info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                        when_and_where).groupdict()
        when_and_where = info['when']
        location = info['where']

        year = datetime.datetime.now().year
        # We can only scrape current-year events in LA.
        when = parse_datetime(when_and_where, year)
        # when = self._tz.localize(when)

        bills = self.scrape_bills(when_and_where)

        description = 'Committee Meeting: %s' % committee

        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid

        self.save_event(event)
def scrape_agenda(self, url, session): page = self.lxmlize(url) # Get the date/time info: date_time = page.xpath("//table[@class='time_place']")[0] lines = date_time.xpath("./tr") metainf = {} for line in lines: tds = line.xpath("./td") metainf[tds[0].text_content()] = tds[1].text_content() date = metainf['DATE:'] time = metainf['TIME:'] where = metainf['PLACE:'] fmt = "%A, %B %d, %Y" if time in all_day: datetime = date else: fmt += " %I:%M %p" datetime = "%s %s" % ( date, time ) datetime = dt.datetime.strptime(datetime, fmt) event = Event(session, datetime, 'committee:meeting', 'Meeting Notice', location=where) event.add_source(url) # aight. Let's get us some bills! bills = page.xpath("//b/a") for bill in bills: bill_ft = bill.attrib['href'] event.add_document(bill.text_content(), bill_ft, type="full-text", mimetype="application/pdf") root = bill.xpath('../../*') root = [ x.text_content() for x in root ] bill_id = "".join(root) if "SCHEDULED FOR" in bill_id: continue descr = bill.getparent().getparent().getparent().getnext().getnext( ).text_content() for thing in replace: bill_id = bill_id.replace(thing, replace[thing]) event.add_related_bill(bill_id, description=descr, type='consideration') committee = page.xpath("//span[@id='lblSession']")[0].text_content() chambers = { "house" : "lower", "joint" : "joint", "senate" : "upper" } chamber = "other" for key in chambers: if key in committee.lower(): chamber = chambers[key] event.add_participant("host", committee, chamber=chamber) self.save_event(event)
def scrape(self, session, chambers):
    """Scrape HI committee hearings from the hearings GridView table."""
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            # Exactly one description span is expected per row.
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)

        # A slash means a joint hearing of several committees.
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [ committee, ]

        for committee in committees:
            if "INFO" not in committee:
                # Dereference the short committee code to a dict with
                # 'name' and 'chamber' keys.
                committee = self.short_ids[committee]
            else:
                # Informational briefings carry no short code.
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }

            event.add_participant('host', committee['name'], 'committee',
                                  chamber=committee['chamber'])

        event.add_source(URL)
        event.add_document(notice_name, notice_href, mimetype='text/html')

        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type=bill['type'])
        self.save_event(event)
def scrape(self, session, chambers):
    """Scrape DC Council calendar events, attaching any bills mentioned."""
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)

    # A "Committee ... will" phrase marks the event as a committee meeting.
    committee_regex = re.compile("(Committee .*?)will")

    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(),
                         place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")

        description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        # Drop the first (title) element; the rest is the description body.
        desc_without_title = " ".join(d.text_content()
                                      for d in description_lines[1:])
        description = re.sub(r'\s+', " ",
                             description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")

        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'

        e = Event(session, when, event_type, description, location)

        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                # no bills
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
            ses, num = bill.split("-")
            # Normalize e.g. "B 21-1" -> "B21-0001" (zero-padded number).
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            if "PR" in bill or "CER" in bill:
                e.add_related_bill(bill, type="resolution",
                                   description=bill_desc)
            else:
                e.add_related_bill(bill, type="bill",
                                   description=bill_desc)

        e.add_source(calendar_url)
        if committee:
            e.add_participant("host", committee, 'committee',
                              chamber="upper")
        self.save_event(e)
def scrape(self, session, chambers):
    """Scrape HI committee hearings from the hearings GridView table.

    Each table row becomes one Event; joint hearings (committee codes
    separated by "/") get one host participant per committee.
    """
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            # Exactly one description span is expected per row.
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)

        # A slash means a joint hearing of several committees.
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee, ]

        for committee in committees:
            if "INFO" not in committee:
                # BUGFIX: look up the `committee` variable, not the
                # literal string "committee" — the literal never exists
                # in short_ids, so every lookup fell through to the
                # unknown-chamber default.
                committee = self.short_ids.get(committee,
                                               {"chamber": "unknown",
                                                "name": committee})
            else:
                # Informational briefings carry no short code.
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }

            event.add_participant('host', committee['name'], 'committee',
                                  chamber=committee['chamber'])

        event.add_source(URL)
        event.add_document(notice_name, notice_href, mimetype='text/html')

        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(
                bill['bill_id'],
                description=bill['descr'],
                type=bill['type']
            )
        self.save_event(event)
def scrape(self, chamber, session):
    """Scrape CA committee hearings for one chamber from the LegInfo DB.

    Hearings are grouped by (location, date); each group becomes one
    Event carrying every bill heard there.
    """
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        # The location string starts with "Asm"/"Sen" — that encodes the
        # chamber hosting the hearing.
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    # NOTE: .iteritems() — this module targets Python 2.
    for ((location, date), hearings) in grouped_hearings.iteritems():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        # Convert e.g. "201320140AB123" style IDs into "AB 123".
        bills = [
            "%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
            for bill in bill_ids
        ]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date), )
        assert len(set(hearing.committee_nr
                       for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(session, date, 'committee:meeting', desc,
                      location=committee_name)
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            event.add_related_bill(bill_id, type=type_,
                                   description='consideration')
        event.add_participant('host', committee_name + ' Committee',
                              'committee', chamber=chamber)
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        self.save_event(event)
def test_event():
    """Event should record added documents and related bills verbatim."""
    agenda_url = 'http://example.com/event/agenda.txt'
    when = datetime.datetime(2012, 1, 1)

    e = Event('S1', when, 'meeting', 'event description', 'event location')
    e.add_document('agenda', agenda_url)
    e.add_related_bill('HB 1', relation='considered')

    expected_docs = [{'name': 'agenda',
                      'url': agenda_url,
                      'type': 'other'}]
    assert_equal(e['documents'], expected_docs)
    assert_equal(e['related_bills'],
                 [{'bill_id': 'HB 1', 'relation': 'considered'}])
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly hearing schedule into Events.

    Like the sibling variant, but the room-code pattern also accepts
    "F..." and "N..." rooms.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee = link.xpath("string(../../td[1])").strip()

        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub("\s+", " ", when_and_where).strip()

        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue

        # Split the datetime text from the room code (F/N/H or C...-...).
        info = re.match(
            r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
            when_and_where
        ).groupdict()
        when_and_where = info['when']
        location = info['where']

        year = datetime.datetime.now().year
        # We can only scrape current-year events in LA.
        when = parse_datetime(when_and_where, year)
        # when = self._tz.localize(when)

        bills = self.scrape_bills(when_and_where)

        description = 'Committee Meeting: %s' % committee

        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid

        self.save_event(event)
def scrape_page(self, url, session, chamber):
    """Scrape an IL hearing-notice page and save it as one Event.

    url     -- hearing-notice page URL
    session -- legislative session identifier
    chamber -- chamber hosting the committee
    """
    page = self.lxmlize(url)

    ctty_name = page.xpath(
        "//span[@class='heading']")[0].text_content().replace(
            "Hearing Notice For ", "")
    tables = page.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub("\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

    event = Event(session, datetime, 'committee:meeting',
                  description, location=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # BUGFIX: str.replace returns a new string; the original discarded
        # the result, leaving the prefix in the participant name.
        ctty_name = ctty_name.replace('Hearing Notice For', '')
    event.add_participant('host', ctty_name, 'committee', chamber=chamber)

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        event.add_related_bill(bill_id,
                               description=description,
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape WA committee agendas for the next 30 days for one chamber."""
    # Chamber id used by the agendas endpoint.
    cha = {"upper":"7","lower":"3","other":"4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now+timedelta(days=30)).strftime(print_format)

    url = event_page % (cha,start,end)
    page = self.lxmlize(url)

    committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading,content = meeting.xpath("./ul/li")
            # Heading text is "<committee> - <datetime>".
            who,when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            # Lines 6-9 of the agenda text hold the meeting location.
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(),
                                        "%m/%d/%Y %I:%M:%S %p")

            kwargs = {
                "location": (where or '').strip() or "unknown"
            }

            event = Event(session, when, 'committee:meeting',
                          meeting_title, **kwargs)
            event.add_participant(
                "host",
                who.strip(),
                'committee',
                chamber=chamber
            )
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li[contains(@class, 'visible-lg')]")
            for bill in bills:
                # Link text is "<bill id> - <description>".
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                event.add_related_bill(
                    bill_id.strip(),
                    description=descr.strip(),
                    type="consideration"
                )
            self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly hearing schedule into Events.

    Variant that reads the committee/time cells three levels up from
    the agenda link and takes the location from the text's last
    comma-separated chunk.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee = link.xpath("string(../../../td[1])").strip()
        when_and_where = link.xpath("string(../../../td[2])").strip()

        # The location is the last comma-separated chunk of the text.
        location = when_and_where.split(',')[-1]

        if when_and_where.strip() == "":
            continue

        year = datetime.datetime.now().year
        # We can only scrape current year's events in LA.
        when = parse_datetime(when_and_where, year)

        bills = self.scrape_bills(when_and_where)

        description = 'Committee Meeting: %s' % committee

        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape_meeting_notice(self, chamber, session, url):
    """Scrape a meeting-notice page into a committee:meeting Event."""
    page = self.lxmlize(url)
    bits = page.xpath("//td[@width='96%']/table/tr")
    metainf = {}
    for bit in bits:
        info = bit.xpath(".//td")
        key = info[0].text_content().strip()
        val = info[1].text_content().strip()
        if key[-1:] == ":":
            key = key[:-1]  # drop the trailing colon from the label
        metainf[key] = val
    date_time_lbl = "Date/Time"
    # 04/25/2012 03:00:00 PM
    fmt = "%m/%d/%Y %I:%M:%S %p"
    metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl],
                                                  fmt)
    event = Event(session, metainf[date_time_lbl],
                  "committee:meeting",
                  "Committee Meeting",
                  chamber=chambers[metainf['Chamber']],
                  location=metainf['Room'],
                  chairman=metainf['Chairman'])
    event.add_participant("host", metainf['Committee'], 'committee',
                          chamber=chambers[metainf['Chamber']])
    event.add_source(url)

    agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
    agenda = [a.text_content().strip() for a in agenda]
    if "" in agenda:
        agenda.remove("")
    for item in agenda:
        string = item.split()
        string = string[:2]
        fChar = string[0][0]
        watch = ["H", "S"]
        if fChar in watch:
            # Agenda lines whose first token starts with H/S and whose
            # second token is numeric reference a bill.
            try:
                bNo = int(string[1])
            except ValueError:
                continue
            except IndexError:
                continue
            bill_id = "%s %s" % (string[0], string[1])
            event.add_related_bill(bill_id,
                                   description=item,
                                   type="consideration")
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape NJ committee agendas for a session from the AGENDAS DBF."""
    year_abr = ((int(session) - 209) * 2) + 2000  # session -> start year

    self.initialize_committees(year_abr)

    url, db = self.get_dbf(year_abr, "AGENDAS")
    records = [ x.asDict() for x in db ]
    for record in records:
        if record['STATUS'] != "Scheduled":
            continue
        description = record['COMMENTS']
        related_bills = []

        # Pull bill references like "A-1234" / "S1234" from the comments.
        for bill in re.findall("(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                "descr": description
            })

        date_time = "%s %s" % ( record['DATE'], record['TIME'] )
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        hr_name = self._committees[record['COMMHOUSE']]

        event = Event(
            session,
            date_time,
            'committee:meeting',
            "Meeting of the %s" % ( hr_name ),
            location=record['LOCATION'] or "Statehouse",
        )
        for bill in related_bills:
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type='consideration')
        try:
            # The first letter of the committee code gives the chamber.
            chamber = {
                "a" : "lower",
                "s" : "upper",
                "j" : "joint"
            }[record['COMMHOUSE'][0].lower()]
        except KeyError:
            chamber = "joint"

        event.add_participant("host",
                              hr_name,
                              'committee',
                              committee_code=record['COMMHOUSE'],
                              chamber=chamber)
        # `agenda_dbf` is the module-level source location for the DBF.
        event.add_source(agenda_dbf)
        self.save_event(event)
def scrape_meeting_notice(self, chamber, session, url):
    """Scrape a meeting-notice page into a committee:meeting Event.

    Parses the label/value rows for committee, room, chairman, and
    date/time, then links any H/S-prefixed bill numbers found in the
    agenda text.
    """
    page = self.lxmlize(url)
    bits = page.xpath("//td[@width='96%']/table/tr")
    metainf = {}
    for bit in bits:
        info = bit.xpath(".//td")
        key = info[0].text_content().strip()
        val = info[1].text_content().strip()
        if key[-1:] == ":":
            key = key[:-1]  # drop the trailing colon from the label
        metainf[key] = val
    date_time_lbl = "Date/Time"
    # 04/25/2012 03:00:00 PM
    fmt = "%m/%d/%Y %I:%M:%S %p"
    metainf[date_time_lbl] = dt.datetime.strptime(metainf[date_time_lbl],
                                                  fmt)
    event = Event(session, metainf[date_time_lbl],
                  "committee:meeting",
                  "Committee Meeting",
                  chamber=chambers[metainf['Chamber']],
                  location=metainf['Room'],
                  chairman=metainf['Chairman'])
    # Pass the 'committee' participant type explicitly, matching the
    # sibling scrape_meeting_notice implementation.
    event.add_participant("host", metainf['Committee'], 'committee',
                          chamber=chambers[metainf['Chamber']])
    event.add_source(url)

    agenda = page.xpath("//td[@width='96%']//font[@face='Arial']")
    agenda = [ a.text_content().strip() for a in agenda ]
    if "" in agenda:
        agenda.remove("")
    for item in agenda:
        string = item.split()
        string = string[:2]
        fChar = string[0][0]
        watch = [ "H", "S" ]
        if fChar in watch:
            # Agenda lines whose first token starts with H/S and whose
            # second token is numeric reference a bill.
            try:
                bNo = int(string[1])
            except ValueError:
                continue
            except IndexError:
                continue
            bill_id = "%s %s" % ( string[0], string[1] )
            event.add_related_bill(
                bill_id,
                description=item,
                type="consideration"
            )
    self.save_event(event)
def test_event():
    """Event should record added documents and related bills verbatim."""
    e = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
              'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')
    # Documents default to type 'other' when no type is supplied.
    assert_equal(e['documents'], [{
        'name': 'agenda',
        'url': 'http://example.com/event/agenda.txt',
        'type': 'other'
    }])
    assert_equal(e['related_bills'], [{
        'bill_id': 'HB 1',
        'relation': 'considered'
    }])
def scrape(self, chamber, session):
    """Scrape WA committee agendas for the next 30 days for one chamber."""
    # Chamber id used by the agendas endpoint.
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now + timedelta(days=30)).strftime(print_format)

    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath(
        "//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath(
            "//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            # Heading text is "<committee> - <datetime>".
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            # Lines 6-9 of the agenda text hold the meeting location.
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(),
                                        "%m/%d/%Y %I:%M:%S %p")

            kwargs = {"location": (where or '').strip() or "unknown"}

            event = Event(session, when, 'committee:meeting',
                          meeting_title, **kwargs)
            event.add_participant("host", who.strip(), 'committee',
                                  chamber=chamber)
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(
                ".//div[text() = 'Public Hearing']/following-sibling::li[contains(@class, 'visible-lg')]"
            )
            for bill in bills:
                # Link text is "<bill id> - <description>".
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                event.add_related_bill(bill_id.strip(),
                                       description=descr.strip(),
                                       type="consideration")
            self.save_event(event)
def scrape_event_page(self, url, chamber, session):
    """Scrape a MI committee-meeting page into an Event."""
    page = self.lxmlize(url)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        # Keep both the text and the element: the Agenda cell's element
        # is needed later for link extraction.
        metainf[key] = {
            "txt": val.text_content().strip(),
            "obj": val
        }

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf['Date']['txt'],
        metainf['Time']['txt']
    )
    if "Cancelled" in datetime:
        return

    datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    event = Event(session, datetime, 'committee:meeting',
                  title, location=where)
    event.add_source(url)
    event.add_source(mi_events)
    event.add_participant('host', metainf['Committee']['txt'],
                          chamber=chamber)

    agenda = metainf['Agenda']['obj']
    # NOTE(review): "//a[...]" is an absolute XPath, so this searches the
    # whole document rather than just the agenda cell — presumably the
    # getObject links only occur inside the agenda; verify.
    related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        event.add_related_bill(
            bill.text_content(),
            description=agenda.text_content(),
            type='consideration'
        )
    self.save_event(event)
def scrape_event_page(self, session, chamber, url, datetime):
    """Scrape a TX hearing-notice page into a committee:meeting Event."""
    page = self.lxmlize(url)

    # Build a KEY -> value map from the notice paragraphs, and keep the
    # whole notice as flattened plain text for the agenda field.
    metainf = {}
    plaintext = ""
    for paragraph in page.xpath("//p"):
        content = re.sub("\s+", " ", paragraph.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainf[key.strip()] = val.strip()

    ctty = metainf['COMMITTEE']
    where = metainf['PLACE']

    # The chair is sometimes appended to the PLACE line after "CHAIR:".
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainf['PLACE'] = where.strip()
        metainf['CHAIR'] = chair.strip()

    chair = metainf['CHAIR'] if "CHAIR" in metainf else None

    plaintext = re.sub("\s+", " ", plaintext).strip()
    bills = re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext)

    event = Event(session, datetime, 'committee:meeting', ctty,
                  chamber=chamber, location=where, agenda=plaintext)
    event.add_source(url)
    event.add_participant('host', ctty, 'committee', chamber=chamber)
    if chair is not None:
        event.add_participant('chair', chair, 'legislator',
                              chamber=chamber)

    for chamber, type, number in bills:
        event.add_related_bill("%s%s %s" % (chamber, type, number),
                               type='consideration',
                               description='Bill up for discussion')
    self.save_event(event)
def scrape_page(self, url, session, chamber):
    """Scrape an IL hearing-notice page and save it as one Event.

    url     -- hearing-notice page URL
    session -- legislative session identifier
    chamber -- chamber hosting the committee
    """
    page = self.lxmlize(url)

    ctty_name = page.xpath("//span[@class='heading']")[0].text_content().replace(
        "Hearing Notice For ", "")
    tables = page.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub("\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")

    event = Event(session, datetime, 'committee:meeting',
                  description, location=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # BUGFIX: str.replace returns a new string; the original discarded
        # the result, leaving the prefix in the participant name.
        ctty_name = ctty_name.replace('Hearing Notice For', '')
    event.add_participant('host', ctty_name, 'committee', chamber=chamber)

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        event.add_related_bill(bill_id,
                               description=description,
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape NJ committee agendas for a session from the Access DB."""
    year_abr = ((int(session) - 209) * 2) + 2000  # session -> start year
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")
    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        # Pull bill references like "A-1234" / "S1234" from the comments.
        for bill in re.findall("(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id" : "%s %s" % ( bill[0], bill[2] ),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")
        hr_name = self._committees[record['CommHouse']]

        event = Event(
            session,
            date_time,
            'committee:meeting',
            "Meeting of the %s" % ( hr_name ),
            location=record['Location'] or "Statehouse",
        )
        for bill in related_bills:
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type='consideration')
        try:
            # The first letter of the committee code gives the chamber.
            chamber = {
                "a" : "lower",
                "s" : "upper",
                "j" : "joint"
            }[record['CommHouse'][0].lower()]
        except KeyError:
            chamber = "joint"

        event.add_participant("host",
                              hr_name,
                              'committee',
                              committee_code=record['CommHouse'],
                              chamber=chamber)
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')
        self.save_event(event)
def scrape(self, chamber, session):
    """Scrape CA committee hearings for one chamber from the LegInfo DB.

    Hearings are grouped by (location, date); each group becomes one
    Event carrying every bill heard there.
    """
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)

        # The location string starts with "Asm"/"Sen" — that encodes the
        # chamber hosting the hearing.
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]
        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    # NOTE: .iteritems() — this module targets Python 2.
    for ((location, date), hearings) in grouped_hearings.iteritems():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        # Convert raw DB IDs into "<type> <number>" form.
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr for hearing in hearings)
                   ) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(session, date, 'committee:meeting', desc,
                      location=committee_name)
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            event.add_related_bill(bill_id, type=type_,
                                   description='consideration')
        event.add_participant('host', committee_name + ' Committee',
                              'committee', chamber=chamber)
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        self.save_event(event)
def scrape_event_page(self, session, chamber, url, datetime):
    """Scrape a TX hearing-notice page into a committee:meeting Event."""
    page = self.lxmlize(url)
    info = page.xpath("//p")
    # Build a KEY -> value map from the notice paragraphs, and keep the
    # whole notice as flattened plain text for the agenda field.
    metainf = {}
    plaintext = ""
    for p in info:
        content = re.sub("\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainf[key.strip()] = val.strip()
    ctty = metainf['COMMITTEE']
    where = metainf['PLACE']
    # The chair is sometimes appended to the PLACE line after "CHAIR:".
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainf['PLACE'] = where.strip()
        metainf['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainf:
        chair = metainf['CHAIR']

    plaintext = re.sub("\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(session, datetime, 'committee:meeting', ctty,
                  chamber=chamber, location=where, agenda=plaintext)
    event.add_source(url)
    event.add_participant('host', ctty, 'committee', chamber=chamber)
    if not chair is None:
        event.add_participant(
            'chair', chair, 'legislator', chamber=chamber)

    for bill in bills:
        # Each match is a (chamber-letter, bill-type, number) tuple.
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        event.add_related_bill(bill_id, type='consideration',
                               description='Bill up for discussion')
    self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly hearing schedule into Events.

    Variant that takes the location from the time/place text's last
    comma-separated chunk.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()

        # The location is the last comma-separated chunk of the text.
        location = when_and_where.split(',')[-1]

        if when_and_where.strip() == "":
            continue

        year = datetime.datetime.now().year
        # We can only scrape current year's events in LA.
        when = parse_datetime(when_and_where, year)

        bills = self.scrape_bills(when_and_where)

        description = 'Committee Meeting: %s' % committee

        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill,
                                   description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape_page(self, url, session, chamber):
    """Scrape one committee hearing page (metadata table + bills table)."""
    page = self.lxmlize(url)
    ctty_name = page.xpath("//span[@class='heading']")[0].text_content()
    tables = page.xpath("//table[@cellpadding='3']")
    # First table is key/value metadata; second is the bill list.
    info = tables[0]
    metainf = {}
    for row in info.xpath(".//tr"):
        tds = row.xpath(".//td")
        metainf[tds[0].text_content().strip()] = \
            tds[1].text_content().strip()
    where = metainf["Location:"]
    description = ctty_name
    when = metainf["Scheduled Date:"]
    when = re.sub(r"\s+", " ", when)
    repl = {"AM": " AM", "PM": " PM"}  # Space shim so strptime can parse.
    for r in repl:
        when = when.replace(r, repl[r])
    when = dt.datetime.strptime(when, "%b %d, %Y %I:%M %p")
    event = Event(session, when, "committee:meeting", description,
                  location=where)
    event.add_source(url)
    # FIX: pass the 'committee' participant type — every sibling scraper
    # does; it was missing here.
    event.add_participant("host", ctty_name, "committee", chamber=chamber)
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        bill_id = tds[0].text_content()
        descr = tds[2].text_content()
        # FIX: use the bill's own scraped description; it was computed but
        # the committee name was passed instead.
        event.add_related_bill(bill_id, description=descr,
                               type="consideration")
    self.save_event(event)
def upper_scrape_event(self, chamber, session, obj):
    """Build and return an Event from an upper-chamber meeting JSON blob."""
    meeting = obj['data']['meeting']
    # Timestamp is in milliseconds since the epoch.
    when = dt.datetime.fromtimestamp(int(meeting['meetingDateTime']) / 1000)
    if str(when.year) not in session:
        return
    description = 'Committee Meeting: ' + meeting['committeeName']
    event = Event(session, when, 'committee:meeting',
                  description=description,
                  location=meeting['location'] or 'No location given.')
    event.add_source(obj['url'])
    event.add_participant('chair', meeting['committeeChair'],
                          'legislator', chamber='upper')
    event.add_participant('host', meeting['committeeName'],
                          'committee', chamber='upper')
    bill_rgx = re.compile(r'([a-z]+)(\d+)', re.I)
    for item in meeting['bills']:
        # e.g. "S1234" -> "S 1234"
        bill_id = ' '.join(bill_rgx.search(item['senateBillNo']).groups())
        event.add_related_bill(
            bill_id, type='bill',
            description=item['summary'] or 'No description given.')
    return event
def scrape_agenda(self, url, session):
    """Scrape one RI agenda page: date/time table, notice, and bill list."""
    page = self.lxmlize(url)
    # Date/time/place live in a small metadata table.
    tables = page.xpath("//table[@class='time_place']")
    if tables == []:
        return
    metainf = {}
    for line in tables[0].xpath("./tr"):
        cells = line.xpath("./td")
        metainf[cells[0].text_content()] = cells[1].text_content()
    date = metainf['DATE:']
    time = metainf['TIME:']
    where = metainf['PLACE:']
    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M",
    ]
    # ``all_day`` is a module-level list of time strings meaning no clock time.
    when = date if time in all_day else "%s %s" % (date, time)
    if "CANCELLED" in when or "Rise of the House" in when:
        # XXX: Do something more advanced.
        return
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for old in transtable:
        when = when.replace(old, transtable[old])
    when = when.strip()
    for fmt in fmts:
        try:
            when = dt.datetime.strptime(when, fmt)
            break
        except ValueError:
            continue
    event = Event(session, when, 'committee:meeting', 'Meeting Notice',
                  location=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    for bill in page.xpath("//b/a"):
        bill_ft = bill.attrib['href']
        event.add_document(bill.text_content(), bill_ft, type="full-text",
                           mimetype="application/pdf")
        bill_id = "".join(x.text_content() for x in bill.xpath('../../*'))
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = bill.getparent().getparent().getparent().getnext() \
            .getnext().text_content()
        for old in replace:
            bill_id = bill_id.replace(old, replace[old])
        event.add_related_bill(bill_id, description=descr,
                               type='consideration')
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
    chamber = "other"
    for key in chambers:
        if key in committee.lower():
            chamber = chambers[key]
    event.add_participant("host", committee, 'committee', chamber=chamber)
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape WA committee agendas for the next four weeks."""
    cha = {"upper": "senate", "lower": "house", "other": "joint"}[chamber]
    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now + timedelta(weeks=4)).strftime(print_format)
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    def _split_tr(trs):
        # Group table rows into per-event chunks; an <hr> row is a separator.
        ret = []
        cur = []
        for tr in trs:
            if len(tr.xpath(".//hr")) > 0:
                ret.append(cur)
                cur = []
                continue
            cur.append(tr)
        if cur != []:
            ret.append(cur)
        return ret

    for table in page.xpath("//table[@class='AgendaCommittee']"):
        # grab agenda, etc
        events = _split_tr(table.xpath(".//tr"))
        # FIX: the chunk variable no longer shares the name ``event`` with
        # the Event object built below; also dropped the unused ``flush``
        # local and a dead ``descr = body.xpath(".//*")`` assignment.
        for chunk in events:
            assert len(chunk) == 2
            header, body = chunk
            whowhen = header.xpath(".//h2")[0].text_content()
            blocks = [x.strip() for x in whowhen.rsplit("-", 1)]
            who = blocks[0]
            when = blocks[1].replace(u'\xa0', ' ')
            if "TBA" in when:
                continue  # XXX: Fixme
            cancel = bool(
                body.xpath(".//span[@style='color:red;font-weight:bold']"))
            where = body.xpath(".//br")[1].tail
            where = where.strip() if where is not None else "unknown"
            kwargs = {"location": where}
            if cancel:
                kwargs['cancelled'] = cancel
            when = dt.datetime.strptime(when, "%m/%d/%y %I:%M %p")
            meeting_title = "Scheduled Meeting"  # XXX: Fixme
            agenda = self.scrape_agenda(body.xpath(".//ol"))
            event = Event(session, when, 'committee:meeting',
                          meeting_title, **kwargs)
            event.add_participant("host", who, 'committee', chamber=chamber)
            event.add_source(url)
            for item in agenda:
                event.add_related_bill(item['bill'],
                                       description=item['descr'],
                                       type="consideration")
            self.save_event(event)
def scrape_event_page(self, url, chamber, session):
    """Scrape one MI committee-meeting page into an Event."""
    page = self.lxmlize(url)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    # Map row label -> {text, element} so we can pull both later.
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        metainf[tds[0].text_content().strip()] = {
            "txt": tds[1].text_content().strip(),
            "obj": tds[1],
        }
    if metainf == {}:
        return
    # e.g. "Wednesday, 5/16/2012 3:00 pm"
    when = "%s %s" % (metainf['Date']['txt'], metainf['Time']['txt'])
    if "Cancelled" in when:
        return
    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
    }
    for t in translate:
        if t in when:
            when = when.replace(t, translate[t])
    when = re.sub(r"\s+", " ", when)
    flag = "or after committees are given leave"
    if flag in when:
        when = when[:when.find(flag)].strip()
    when = dt.datetime.strptime(when, "%A, %m/%d/%Y %I:%M %p")
    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title
    event = Event(session, when, 'committee:meeting', title,
                  location=where)
    event.add_source(url)
    event.add_source(mi_events)
    # FIX: pass the 'committee' participant type for consistency with the
    # other scrapers in this file; it was missing here.
    event.add_participant('host', metainf['Committee']['txt'], 'committee',
                          chamber=chamber)
    agenda = metainf['Agenda']['obj']
    for bill in agenda.xpath("//a[contains(@href, 'getObject')]"):
        event.add_related_bill(bill.text_content(),
                               description=agenda.text_content(),
                               type='consideration')
    self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """Scrape upper or lower committee agendas (AZ).

    Walks the committee-agendas table for ``chamber``, parses each row's
    date/time/room/link, fetches the agenda detail via ``self.parse_agenda``,
    and saves one Event per scheduled (non-cancelled) meeting.
    """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        # The two chambers nest the agenda table at different depths.
        if chamber == 'upper':
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table')[0]
        else:
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
            # HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            # Column offsets differ between the chambers (extra Addendum
            # column in the house layout).
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            time = re.match('(\d+:\d+ (A|P))', time)
            if time:
                # Reassemble "date H:MM AM/PM" ("M" appended to the A/P
                # captured above) for strptime.
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when,
                                                  '%m/%d/%Y %I:%M %p')
            else:
                # No recognizable clock time; date only.
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')
            when = self._tz.localize(when)
            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)
            agenda_info = self.parse_agenda(chamber, link)
            description = agenda_info['description']
            member_list = agenda_info['member_list']
            meeting_type = agenda_info['meeting_type']
            agenda_items = agenda_info['agenda_items']
            related_bills = agenda_info['related_bills']
            other = agenda_info['other']
            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link,
                          details=description)  # , agenda=agenda_items
            event.add_participant('committee', committee, 'committee',
                                  chamber=chamber)
            for i in range(0, len(related_bills)):
                bill = related_bills[i]
                # NOTE(review): ``description[i]`` indexes the description
                # by bill position — assumes parse_agenda returns a
                # per-bill sequence here; confirm its return shape.
                desc = description[i]
                event.add_related_bill(bill, description=desc,
                                       type="consideration")
            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
def scrape_page(self, url, session, chamber):
    """Scrape a UT committee agenda page into an Event."""
    try:
        page = self.lxmlize(url)
    except lxml.etree.XMLSyntaxError:
        self.warning("Ugh. Invalid HTML")
        return  # Ugh, invalid HTML.
    agendas = page.xpath("//td[@class='numberspace']")
    spans = page.xpath("//center/span")
    ctty = None
    date = None
    time = None
    if len(spans) >= 4:
        ctty = spans[0].text_content().strip()
        date = spans[2].text_content().strip()
        time = spans[3].text_content().strip()
    bills = []
    for agenda in agendas:
        string = agenda.getnext().text_content().strip()
        re_bills = re.findall(r"(S|H)\.?(B|R|M)\. (\d+)", string)
        for bill in re_bills:
            bill_id = '%s%s %s' % bill
            bills.append({'name': bill_id, 'desc': string})
    if ctty is None or date is None or time is None:
        return
    when = "%s %s" % (date.strip(), time.strip())
    when = re.sub("AGENDA", "", when).strip()
    when = [x.strip() for x in when.split("\r\n")]
    if "" in when:
        when.remove("")
    if len(when) == 1:
        when.append("state house")  # default location
    where = when[1]
    translate = {"a.m.": "AM", "p.m.": "PM"}
    for t in translate:
        when[0] = when[0].replace(t, translate[t])
    when = dt.datetime.strptime(when[0], "%A, %B %d, %Y %I:%M %p")
    chamber = 'other'
    cLow = ctty.lower()
    # FIX: was "seante" (typo), so senate committees never mapped to 'upper'.
    if "senate" in cLow:
        chamber = 'upper'
    elif "house" in cLow:
        chamber = 'lower'
    elif "joint" in cLow:
        chamber = 'joint'
    event = Event(session, when, 'committee:meeting', ctty, location=where)
    event.add_source(url)
    event.add_participant('host', ctty, 'committee', chamber=chamber)
    for bill in bills:
        event.add_related_bill(bill['name'], description=bill['desc'],
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape CO committee hearings: committee list -> hearings -> Events."""
    url = 'http://leg.colorado.gov/content/committees'
    if chamber == 'lower':
        xpath = '//div/h3[text()="House Committees of Reference"]/../' \
                'following-sibling::div[contains(@class,"view-content")]/' \
                'table//td//span[contains(@class,"field-content")]/a/@href'
    elif chamber == 'upper':
        xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
                'following-sibling::div[contains(@class,"view-content")]/' \
                'table//td//span[contains(@class,"field-content")]/a/@href'
    elif chamber == 'other':
        # All the links under the headers that don't contain "House" or "Senate"
        xpath = '//div/h3[not(contains(text(),"House")) and ' \
                'not(contains(text(),"Senate"))]/../' \
                'following-sibling::div[contains(@class,"view-content")]/' \
                'table//td//span[contains(@class,"field-content")]/a/@href'
    listing = self.lxmlize(url)
    for com_url in listing.xpath(xpath):
        com_page = self.lxmlize(com_url)
        hearing_urls = com_page.xpath(
            '//div[contains(@class,"schedule-item-content")]/h4/a/@href')
        for hearing_url in hearing_urls:
            hearing = self.lxmlize(hearing_url)
            title = hearing.xpath(
                '//header/h1[contains(@class,"node-title")]')[0]
            title = title.text_content().strip()
            day_text = hearing.xpath(
                '//div[contains(@class,"calendar-date")]')[0]
            day_text = day_text.text_content().strip()
            details = hearing.xpath(
                '//span[contains(@class, "calendar-details")]')[0]
            details = details.text_content().split('|')
            time_text = details[0].strip()
            location = details[1].strip()
            if 'Upon Adjournment' in time_text:
                when = dt.datetime.strptime(day_text, '%A %B %d, %Y')
            else:
                when = dt.datetime.strptime(
                    '{} {}'.format(day_text, time_text),
                    '%A %B %d, %Y %I:%M %p')
            # They overload the bills table with other agenda items;
            # rows carrying colspan=2 are agenda-only entries.
            agenda_rows = hearing.xpath(
                '//td[@data-label="Hearing Item" and @colspan="2"]')
            agendas = [row.text_content().strip() for row in agenda_rows]
            agenda = "\n".join(agendas) if agendas else ''
            event = Event(session, when, "committee:meeting", title,
                          location, agenda=agenda)
            event.add_source(hearing_url)
            for bill in hearing.xpath('//td[@data-label="Hearing Item"]/a'):
                event.add_related_bill(bill.text_content().strip(),
                                       description="hearing item",
                                       type="consideration")
            self.save_event(event)
def scrape_upper(self, session):
    """Scrape the OH Senate committee-schedule PDF into Events."""
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text')
    os.remove(path)
    # The PDF alternates "Xday, Month N" headers with that day's body text.
    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for i, day_chunk in enumerate(days[1:]):
        if i % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day_chunk + ", " + str(datetime.datetime.now().year)
            continue
        # Within a day, committee names alternate with their event text.
        events = re.split(r'\n\n((?:\w+\s?)+),\s', day_chunk)
        comm = ''
        for j, piece in enumerate(events[1:]):
            if j % 2 == 0:
                comm = piece.strip()
                continue
            match = re.search(
                r'''(?mxs)
                (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                .*?,\s                  # Potential extra text for meeting time
                (.*?)\n                 # Location, usually a room
                .*?\n                   # Chairman of committee holding event
                (.*)                    # Description of event
                ''', piece)
            if match is None:
                continue
            (time, location, description) = match.groups()
            time = datetime.datetime.strptime(
                time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
            time = self._tz.localize(time)
            location = location.strip()
            kept_lines = []
            for line in description.split('\n'):
                line = line.strip()
                if (line and not line.startswith("Page ")
                        and not line.startswith("*Possible Vote")
                        and line != "NO OTHER COMMITTEES WILL MEET"):
                    kept_lines.append(line)
            description = '\n'.join(kept_lines).decode('ascii', 'ignore')
            if not description:
                description = '[No description provided by state]'
            event = Event(session=session, when=time,
                          type='committee:meeting',
                          description=description, location=location)
            event.add_source(PDF_URL)
            event.add_participant(type='host', participant=comm,
                                  participant_type='committee',
                                  chamber='upper')
            # Pick bill references ("S.B. 123 ...") out of the description.
            for line in description.split('\n'):
                hit = re.search(
                    r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                if hit:
                    (related_bill, relation) = hit.groups()
                    event.add_related_bill(
                        bill_id=related_bill.replace(".", ""),
                        type='consideration',
                        description=relation.strip())
            self.save_event(event)
def scrape(self, chamber, session):
    """Scrape AK committee hearings for the full calendar year."""
    if session != '28':
        raise NoDataForPeriod(session)
    if chamber == 'other':
        return
    # ``now`` is a module-level datetime — assumed set at import; confirm.
    year = now.year
    # Full calendar year
    date1 = '0101' + str(year)[2:]
    date2 = '1231' + str(year)[2:]
    url = ("http://www.legis.state.ak.us/basis/"
           "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
           "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
    for font in page.xpath(path):
        match = re.match(r'^\((H|S)\)(.+)$', font.text)
        chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
        comm = match.group(2).strip().title()
        next_row = font.xpath("../../following-sibling::tr[1]")[0]
        when = next_row.xpath("string(td[1]/font)").strip()
        when = re.sub(r"\s+", " ", when)
        when = "%s %s" % (when, year)
        skip = False
        for slug in exclude_slugs:
            if slug in when:
                skip = True
        for repl in replacements:
            if repl in when:
                when = when.replace(repl, replacements[repl])
        if skip:
            continue
        parsed_when = None
        for fmt in formats:
            try:
                parsed_when = datetime.datetime.strptime(when, fmt)
                break
            except ValueError:
                pass
        if not parsed_when:
            # FIX: was a bare ``raise`` outside any except block, which
            # itself raises "no active exception"; raise something useful.
            raise ValueError("Unparseable event time: %r" % when)
        when = parsed_when
        if when < now:
            self.warning("Dropping an event at %s. Be careful!" % (when))
            continue
        when = self._tz.localize(when)
        where = next_row.xpath("string(td[2]/font)").strip()
        description = "Committee Meeting\n"
        description += comm
        # Collect related bills up to the next <hr> separator.
        cur_node = font.getparent().getparent()
        bills = []
        while cur_node is not None and cur_node.xpath(".//hr") == []:
            bills += cur_node.xpath(
                ".//a[contains(@href, 'get_complete_bill')]/text()")
            cur_node = cur_node.getnext()
        event = Event(session, when, 'committee:meeting', description,
                      location=where)
        event.add_source(url)
        # FIX: the agenda link was previously written to ``event`` BEFORE
        # the Event was constructed — a NameError on the first hit and a
        # clobber of the previous event on later iterations.
        links = font.xpath(
            "../../td/font/a[contains(@href, 'get_documents')]")
        if links:
            event['link'] = links[0].attrib['href']
        for bill in bills:
            event.add_related_bill(bill, description='Related Bill',
                                   type='consideration')
        event.add_participant('host', comm, participant_type='committee',
                              chamber=chamber)
        self.save_event(event)
def scrape_meeting(self, session, url):
    """Scrape one committee meeting page; bail out if no chamber matches."""
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    title, = page.xpath("//a[@id='linkTitle']//text()")
    date, = page.xpath("//span[@id='lDate']/text()")
    time, = page.xpath("//span[@id='lTime']/text()")
    location, = page.xpath("//span[@id='lLocation']/text()")
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m.", "Noon"],
    }
    for canonical, variants in substs.items():
        for variant in variants:
            time = time.replace(variant, canonical)
    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r'(?i)\d[AP]M$', time):
        time = time[:-2] + " " + time[-2:]
    if re.search("UPON ADJ|TBA", ' '.join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime(
            "%s %s" % (date, time), "%B %d, %Y %I:%M %p")
    # when = self._tz.localize(when)
    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower", "senate": "upper", "joint": "joint"}
    for needle, normalized in chambers.items():
        if needle in title.lower():
            chamber = normalized
            break
    else:
        return
    event = Event(session, when, 'committee:meeting', description,
                  location=location, all_day=all_day)
    event.add_source(url)
    event.add_participant('host', title, 'committee', chamber=chamber)
    rows = iter(page.xpath("//tr[@valign='top']"))
    next(rows)  # skip the header row
    for row in rows:
        cells = row.xpath("./td")
        if len(cells) != 5:
            continue
        bill_cell = cells[2]
        descr_cell = cells[4]
        bill_title = bill_cell.text_content()
        if "S" in bill_title:
            bill_chamber = "upper"
        elif "H" in bill_title:
            bill_chamber = "lower"
        else:
            continue
        event.add_related_bill(bill_id=bill_title,
                               description=descr_cell.text_content(),
                               chamber=bill_chamber,
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape CO hearings for one chamber from the committee listing."""
    url = 'http://leg.colorado.gov/content/committees'
    if chamber == 'lower':
        xpath = '//div/h3[text()="House Committees of Reference"]/../' \
            'following-sibling::div[contains(@class,"view-content")]/' \
            'table//td//span[contains(@class,"field-content")]/a/@href'
    elif chamber == 'upper':
        xpath = '//div/h3[text()="Senate Committees of Reference"]/../' \
            'following-sibling::div[contains(@class,"view-content")]/' \
            'table//td//span[contains(@class,"field-content")]/a/@href'
    elif chamber == 'other':
        # All the links under the headers that don't contain "House" or "Senate"
        xpath = '//div/h3[not(contains(text(),"House")) and ' \
            'not(contains(text(),"Senate"))]/../' \
            'following-sibling::div[contains(@class,"view-content")]/' \
            'table//td//span[contains(@class,"field-content")]/a/@href'
    index = self.lxmlize(url)
    committee_links = index.xpath(xpath)
    for committee_link in committee_links:
        committee_page = self.lxmlize(committee_link)
        for hearing_link in committee_page.xpath(
                '//div[contains(@class,"schedule-item-content")]'
                '/h4/a/@href'):
            hearing_page = self.lxmlize(hearing_link)
            title = hearing_page.xpath(
                '//header/h1[contains(@class,"node-title")]'
            )[0].text_content().strip()
            date_day = hearing_page.xpath(
                '//div[contains(@class,"calendar-date")]'
            )[0].text_content().strip()
            details = hearing_page.xpath(
                '//span[contains(@class, "calendar-details")]'
            )[0].text_content().split('|')
            date_time = details[0].strip()
            location = details[1].strip()
            # No clock time when the hearing starts "Upon Adjournment".
            if 'Upon Adjournment' in date_time:
                date = dt.datetime.strptime(date_day, '%A %B %d, %Y')
            else:
                date = dt.datetime.strptime(
                    '{} {}'.format(date_day, date_time),
                    '%A %B %d, %Y %I:%M %p')
            # The bills table is overloaded with other agenda items;
            # colspan=2 cells hold those agenda-only rows.
            agendas = []
            for cell in hearing_page.xpath(
                    '//td[@data-label="Hearing Item" and @colspan="2"]'):
                agendas.append(cell.text_content().strip())
            agenda = "\n".join(agendas) if agendas else ''
            event = Event(session, date, "committee:meeting", title,
                          location, agenda=agenda)
            event.add_source(hearing_link)
            for anchor in hearing_page.xpath(
                    '//td[@data-label="Hearing Item"]/a'):
                event.add_related_bill(anchor.text_content().strip(),
                                       description="hearing item",
                                       type="consideration")
            self.save_event(event)
def scrape(self, chamber, session):
    """Scrape WY committee meetings from the calendar grid.

    Each calendar row links to a meeting page whose table rows are grouped
    into per-meeting chunks before parsing time/committee/location/bills.
    """
    if chamber == "other":
        return
    calendar_url = "http://legisweb.state.wy.us/%s/Calendar/" \
        "CalendarMenu/CommitteeMenu.aspx" % str(session)
    page = self.lxmlize(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET control IDs are 1-based and skip the header, hence +2.
        row_ident = "%02d" % (i + 2)
        date_xpath = './/span[@id="ctl00_cphContent_gv' \
            'Calendars_ctl%s_lblDate"]' % str(row_ident)
        date_string = row.xpath(date_xpath)[0].text_content()
        chamber_char = self.metadata["chambers"][chamber]["name"][0].upper()
        meeting_xpath = './/a[@id="ctl00_cphContent_gv' \
            'Calendars_ctl%s_hl%scallink"]' % (
                str(row_ident),
                chamber_char,
            )
        meeting_url = row.xpath(meeting_xpath)
        if len(meeting_url) == 1 and \
                meeting_url[0].text_content().strip() != "":
            try:
                meeting_url = meeting_url[0].attrib["href"]
            except KeyError:
                self.warning("Alleged meeting date has no URL: " +
                             meeting_url[0].text_content().strip())
                continue
            meeting_page = self.lxmlize(meeting_url)
            meetings = meeting_page.xpath(
                './/table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing. meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                # Last (or only) meeting runs to the end of the row list;
                # otherwise slice up to the next meeting's start row.
                if len(meeting_idents) == 1 or \
                        i + 1 == len(meeting_idents):
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [
                        meeting_ident,
                        meeting_idents[i + 1] - 1,
                    ]
                    # Guarantee at least two rows per meeting chunk.
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + " " + meeting_time,
                    "%m/%d/%Y %I:%M %p")
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == "":
                    description = committee
                event = Event(session, meeting_date_time,
                              "committee:meeting", description, location)
                event.add_source(meeting_url)
                for bill in bills:
                    if bill["bill_description"] == "":
                        bill["bill_description"] = committee
                    event.add_related_bill(
                        bill_id=bill["bill_id"],
                        description=bill["bill_description"],
                        type="consideration")
                    event.add_document(
                        name=bill["bill_id"],
                        url=bill["bill_url"],
                        type="bill",
                        mimetype="application/pdf")
                event.add_participant(
                    type="host",
                    participant=committee,
                    participant_type="committee",
                    chamber=chamber)
                self.save_event(event)
def scrape(self, chamber, session):
    """Scrape SC weekly meeting listings for one chamber."""
    if chamber == 'other':
        return
    events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % (
        self.metadata['chambers'][chamber]['name'].upper()[0])
    page = self.get_page_from_url(events_url)
    meeting_year = page.xpath(
        '//h2[@class="barheader"]/span')[0].text_content()
    meeting_year = re.search(
        r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})',
        meeting_year).group(1)
    dates = page.xpath("//div[@id='contentsection']/ul")
    for date in dates:
        date_string = date.xpath('span')
        if len(date_string) == 1:
            date_string = date_string[0].text_content()
        else:
            continue
        # If an event is in the next calendar year, the date_string
        # will have a year in it
        if date_string.count(",") == 2:
            event_year = date_string[-4:]
            date_string = date_string[:-6]
        elif date_string.count(",") == 1:
            event_year = meeting_year
        else:
            # FIX: .format() was previously called on the *exception
            # object* after ``raise`` (an AttributeError, and the message
            # never interpolated); format the message first.
            raise AssertionError(
                "This is not a valid date: '{}'".format(date_string))
        for meeting in date.xpath('li'):
            time_string = meeting.xpath('span')[0].text_content()
            if time_string == 'CANCELED' or len(meeting.xpath(
                    './/span[contains(text(), "CANCELED")]')) > 0:
                continue
            time_string = self.normalize_time(time_string)
            date_time = datetime.datetime.strptime(
                event_year + ' ' + date_string + ' ' + time_string,
                "%Y %A, %B %d %I:%M %p")
            date_time = self._tz.localize(date_time)
            meeting_info = meeting.xpath(
                'br[1]/preceding-sibling::node()')[1]
            location, description = re.search(
                r'-- (.*?) -- (.*)', meeting_info).groups()
            if re.search(r'committee', description, re.I):
                meeting_type = 'committee:meeting'
            else:
                meeting_type = 'other:meeting'
            event = Event(session, date_time, meeting_type, description,
                          location)
            event.add_source(events_url)
            agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")
            if agenda_url:
                agenda_url = agenda_url[0].attrib['href']
                event.add_source(agenda_url)
                agenda_page = self.get_page_from_url(agenda_url)
                for bill in agenda_page.xpath(
                        ".//a[contains(@href,'billsearch.php')]"):
                    bill_url = bill.attrib['href']
                    bill_id = bill.text_content() \
                        .replace('.', '').replace(' ', '')
                    bill_description = self.get_bill_description(bill_url)
                    event.add_related_bill(
                        bill_id=bill_id,
                        type='consideration',
                        description=bill_description)
            self.save_event(event)
def scrape(self, chamber, session):
    """Scrape PA committee meeting schedules for one chamber."""
    urls = {
        'upper': "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM",
        'lower': "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM",
    }
    if chamber not in urls:
        return
    url = urls[chamber]
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    for date_td in page.xpath("//td[@valign='middle']"):
        date = date_td.text_content().strip()
        # Validate the header date; raises ValueError if malformed.
        datetime.datetime.strptime(date, "%A, %B %d, %Y").date()
        next_tr = date_td.getparent().getnext()
        while next_tr is not None:
            if next_tr.xpath("td[@valign='middle']"):
                break  # reached the next date header
            time = next_tr.xpath("string(td[1])").strip()
            dt = "%s %s" % (date, time)
            try:
                dt = datetime.datetime.strptime(
                    dt, "%A, %B %d, %Y %I:%M %p")
                dt = self._tz.localize(dt)
            except ValueError:
                break
            desc = re.sub(r'\s+', ' ',
                          next_tr.xpath("string(td[2])").strip())
            desc_el = next_tr.xpath("td[2]")[0]
            # The committee name is the text before "COMMITTEE", if any.
            parts = desc.split("COMMITTEE", 1)
            ctty = parts[0] if len(parts) > 1 else None
            bills = []
            for anchor in desc_el.xpath(
                    ".//a[contains(@href, 'billinfo')]"):
                parsed = urlparse.urlparse(anchor.attrib['href'])
                qs = urlparse.parse_qs(parsed.query)
                bills.append({
                    "bill_id": "%sB %s" % (qs['body'][0], qs['bn'][0]),
                    "bill_num": qs['bn'][0],
                    "bill_chamber": qs['body'][0],
                    "session": qs['syear'][0],
                    "descr": desc,
                })
            location = re.sub(r'\s+', ' ',
                              next_tr.xpath("string(td[3])").strip())
            event = Event(session, dt, 'committee:meeting', desc, location)
            event.add_source(url)
            if ctty is not None:
                event.add_participant('host', ctty, 'committee',
                                      chamber=chamber)
            for bill in bills:
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type='consideration')
            self.save_event(event)
            next_tr = next_tr.getnext()
def scrape(self, chamber, session):
    """Scrape MD text-only hearing notices (all chambers in one pass)."""
    if chamber != 'other':
        return None  # We're going to do it all on one shot.
    if session[-2:] == "s1":
        return None  # Special sessions 404
    url = "http://mlis.state.md.us/%s/hearsch/alladd.htm" % (session)
    page = self.lxmlize(url)
    events = page.xpath("//pre")
    for event in events:
        ctty_name = [
            x.strip() for x in
            event.getparent().getprevious().text_content().split("-", 1)
        ]
        ctty_name = ctty_name[0]
        event_text = event.text_content()
        if "This meeting has been cancelled." in event_text:
            continue
        # OK. In order to process this text-only notice, we have to resort
        # to some major hackage. Just roll with it.
        lines = event_text.split("\n")
        # In order to get the key stuff, we need to figure out where the
        # address "block" starts.
        address_block = last_space(lines[4])
        assert address_block is not None
        # OK. Given the offset, we can "split" the time off the date block.
        time_room = lines[3]
        time = time_room[:address_block].strip()
        if "TBD" in time:
            continue  # Nothing's set yet.
        time = "%s %s" % (lines[1], time)
        time = re.sub(r"\s+", " ", time).strip()
        trans = {"P.M.": "PM", "A.M.": "AM"}
        for transition in trans:
            time = time.replace(transition, trans[transition])
        when = dt.datetime.strptime(time, "%A %B %d, %Y %I:%M %p")
        room = time_room[address_block:].strip()
        place_block = lines[4:]
        where = room + "\n"
        done = False
        offset = 4
        for place in place_block:
            if place.strip() == "":
                done = True
            if done:
                continue
            offset += 1
            where += place.strip() + "\n"
        where = where.strip()
        # Now that the date's processed, we can move on.
        moreinfo = lines[offset + 1:]
        info = {}
        key = "unattached_header"
        for inf in moreinfo:
            if ":" in inf:
                key, value = inf.split(":", 1)
                key = key.strip()
                info[key] = value.strip()
            else:
                # FIX: guard against a continuation line arriving before
                # any "key:" line (previously a KeyError).
                info[key] = info.get(key, "") + " " + inf.strip()
        # Alright. We should have enough now.
        subject = info['Subject']
        event = Event(session, when, 'committee:meeting', subject,
                      location=where)
        event.add_source(url)
        flags = {"joint": "joint", "house": "lower", "senate": "upper"}
        chamber = "other"
        for flag in flags:
            if flag in ctty_name.lower():
                chamber = flags[flag]
        # Let's try and hack out some bill names.
        trans = {
            "SENATE": "S",
            "HOUSE": "H",
            "JOINT": "J",
            "BILL": "B",
            "RESOLUTION": "R",
        }
        _t_subject = subject.upper()
        for t in trans:
            _t_subject = re.sub(r"%s(\s+)?" % t, trans[t], _t_subject)
        # FIX: removed a leftover Python-2 debug ``print _t_subject``.
        bills = re.findall(r"(S|H)(J)?(B|R|M)\s*(\d{4})", _t_subject)
        for bill in bills:
            name = bill[:3]
            bid = bill[3]
            bill_id = "%s %s" % (''.join(name), bid)
            event.add_related_bill(bill_id, description=subject,
                                   type='consideration')
        # FIX: pass the 'committee' participant type, consistent with the
        # other scrapers in this file.
        event.add_participant("host", ctty_name, "committee",
                              chamber=chamber)
        self.save_event(event)
def scrape_meeting(self, session, chamber, url):
    """Scrape one meeting notice page; skip "UPON ADJOURNMENT" meetings."""
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    title, = page.xpath("//a[@id='linkTitle']//text()")
    date, = page.xpath("//span[@id='lDate']/text()")
    time, = page.xpath("//span[@id='lTime']/text()")
    location, = page.xpath("//span[@id='lLocation']/text()")
    # (The original tested this same condition twice with ``or``.)
    if "UPON ADJOURNMENT" in time.upper():
        return
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m."],
    }
    for canonical, variants in substs.items():
        for variant in variants:
            time = time.replace(variant, canonical)
    # Some notices omit AM/PM entirely; fall back to a 24-hour-less parse.
    try:
        when = datetime.datetime.strptime(
            "%s %s" % (date, time), "%B %d, %Y %I:%M %p")
    except ValueError:
        when = datetime.datetime.strptime(
            "%s %s" % (date, time), "%B %d, %Y %I:%M")
    # when = self._tz.localize(when)
    description = "Meeting on %s of the %s" % (date, title)
    chambers = {
        "house": "lower",
        "senate": "upper",
        "joint": "joint",
    }
    for needle, normalized in chambers.items():
        if needle in title.lower():
            chamber = normalized
            break
    else:
        return
    event = Event(session, when, 'committee:meeting', description,
                  location=location)
    event.add_source(url)
    event.add_participant('host', title, 'committee', chamber=chamber)
    rows = iter(page.xpath("//tr[@valign='top']"))
    next(rows)  # header row
    for row in rows:
        cells = row.xpath("./td")
        if len(cells) != 5:
            continue
        bill_cell, descr_cell = cells[2], cells[4]
        bill_title = bill_cell.text_content()
        if "S" in bill_title:
            bill_chamber = "upper"
        elif "H" in bill_title:
            bill_chamber = "lower"
        else:
            continue
        event.add_related_bill(bill_id=bill_title,
                               description=descr_cell.text_content(),
                               chamber=bill_chamber,
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    # Scrape the PA committee-meeting schedule page for one chamber and
    # save an Event for each meeting row found under each date heading.
    if chamber == 'upper':
        url = "http://www.legis.state.pa.us/WU01/LI/CO/SM/COSM.HTM"
    elif chamber == 'lower':
        url = "http://www.legis.state.pa.us/WU01/LI/CO/HM/COHM.HTM"
    else:
        # No schedule page for joint/other; nothing to scrape.
        return
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # Each vertically-centered cell is a date heading; the meeting rows
    # for that date follow it as siblings until the next date heading.
    for date_td in page.xpath("//td[@valign='middle']"):
        date = date_td.text_content().strip()
        # Validation only: raises ValueError if the cell isn't a date in
        # the expected "%A, %B %d, %Y" form (result is discarded).
        datetime.datetime.strptime(date, "%A, %B %d, %Y").date()
        next_tr = date_td.getparent().getnext()
        while next_tr is not None:
            if next_tr.xpath("td[@valign='middle']"):
                # Reached the next date heading; stop this block.
                break
            time = next_tr.xpath("string(td[1])").strip()
            # NOTE: local `dt` shadows nothing here — this method uses the
            # `datetime` module directly, not a `dt` alias.
            dt = "%s %s" % (date, time)
            try:
                dt = datetime.datetime.strptime(dt, "%A, %B %d, %Y %I:%M %p")
                dt = self._tz.localize(dt)
            except ValueError:
                # Row has no parseable time; treat the block as finished.
                break
            desc = next_tr.xpath("string(td[2])").strip()
            desc_el = next_tr.xpath("td[2]")[0]
            desc = re.sub(r'\s+', ' ', desc)
            # Committee name is the text preceding the word "COMMITTEE",
            # when present.
            ctty = None
            cttyraw = desc.split("COMMITTEE", 1)
            if len(cttyraw) > 1:
                ctty = cttyraw[0]
            # Bill references are links to billinfo pages; pull
            # chamber/number/session out of each link's query string.
            related_bills = desc_el.xpath(
                ".//a[contains(@href, 'billinfo')]")
            bills = []
            urls = [x.attrib['href'] for x in related_bills]
            for u in urls:
                o = urlparse.urlparse(u)
                qs = urlparse.parse_qs(o.query)
                bills.append({
                    # e.g. body="H", bn="123" -> "HB 123"
                    "bill_id": "%sB %s" % (qs['body'][0], qs['bn'][0]),
                    "bill_num": qs['bn'][0],
                    "bill_chamber": qs['body'][0],
                    "session": qs['syear'][0],
                    "descr": desc
                })
            location = next_tr.xpath("string(td[3])").strip()
            location = re.sub(r'\s+', ' ', location)
            event = Event(session, dt, 'committee:meeting', desc, location)
            event.add_source(url)
            if not ctty is None:
                event.add_participant('host', ctty, 'committee',
                                      chamber=chamber)
            for bill in bills:
                event.add_related_bill(bill['bill_id'],
                                       description=bill['descr'],
                                       type='consideration')
            self.save_event(event)
            next_tr = next_tr.getnext()
def scrape_agenda(self, url, session): page = self.lxmlize(url) # Get the date/time info: date_time = page.xpath("//table[@class='time_place']") if date_time == []: return date_time = date_time[0] lines = date_time.xpath("./tr") metainf = {} for line in lines: tds = line.xpath("./td") metainf[tds[0].text_content()] = tds[1].text_content() date = metainf['DATE:'] time = metainf['TIME:'] where = metainf['PLACE:'] fmts = [ "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M", ] if time in all_day: datetime = date else: datetime = "%s %s" % (date, time) if "CANCELLED" in datetime: # XXX: Do something more advanced. return transtable = { "P.M": "PM", "PM.": "PM", "P.M.": "PM", "A.M.": "AM", "POSTPONED": "", "RESCHEDULED": "", "and Rise of the Senate": "", } for trans in transtable: datetime = datetime.replace(trans, transtable[trans]) datetime = datetime.strip() for fmt in fmts: try: datetime = dt.datetime.strptime(datetime, fmt) break except ValueError: continue event = Event(session, datetime, 'committee:meeting', 'Meeting Notice', location=where) event.add_source(url) # aight. Let's get us some bills! bills = page.xpath("//b/a") for bill in bills: bill_ft = bill.attrib['href'] event.add_document(bill.text_content(), bill_ft, type="full-text", mimetype="application/pdf") root = bill.xpath('../../*') root = [x.text_content() for x in root] bill_id = "".join(root) if "SCHEDULED FOR" in bill_id: continue descr = bill.getparent().getparent().getparent().getnext().getnext( ).text_content() for thing in replace: bill_id = bill_id.replace(thing, replace[thing]) event.add_related_bill(bill_id, description=descr, type='consideration') committee = page.xpath("//span[@id='lblSession']")[0].text_content() chambers = {"house": "lower", "joint": "joint", "senate": "upper"} chamber = "other" for key in chambers: if key in committee.lower(): chamber = chambers[key] event.add_participant("host", committee, 'committee', chamber=chamber) self.save_event(event)
def scrape_meeting(self, session, chamber, url):
    """Scrape a single committee-meeting notice page and save an Event.

    The `chamber` argument is only a fallback: the actual chamber is
    re-derived from the committee title ("house"/"senate"/"joint"); if
    none of those words appear, the meeting is skipped entirely.
    """
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # Each of these is expected to match exactly one node.
    title, = page.xpath("//a[@id='linkTitle']//text()")
    date, = page.xpath("//span[@id='lDate']/text()")
    time, = page.xpath("//span[@id='lTime']/text()")
    location, = page.xpath("//span[@id='lLocation']/text()")

    if time == "UPON ADJOURNMENT":
        # No concrete start time; skip the meeting.
        return

    # BUGFIX: the original normalized only "A.M." — a "P.M." (or
    # lowercase) time failed BOTH strptime attempts below with an
    # unhandled ValueError.  Normalize all four variants, matching the
    # sibling scrape_meeting implementations in this file.
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m."],
    }
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)

    # Some notices omit the AM/PM marker; fall back to a format
    # without %p.
    try:
        when = datetime.datetime.strptime("%s %s" % (date, time),
                                          "%B %d, %Y %I:%M %p")
    except ValueError:
        when = datetime.datetime.strptime("%s %s" % (date, time),
                                          "%B %d, %Y %I:%M")

    description = "Meeting on %s of the %s" % (date, title)
    chambers = {
        "house": "lower",
        "senate": "upper",
        "joint": "joint",
    }
    for chamber_, normalized in chambers.items():
        if chamber_ in title.lower():
            chamber = normalized
            break
    else:
        # Committee title names no recognizable chamber; skip.
        return

    event = Event(session, when, 'committee:meeting', description,
                  location=location)
    event.add_source(url)
    event.add_participant('host', title, 'committee', chamber=chamber)

    # First <tr valign="top"> is the header row; skip it.
    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            # Row doesn't have the expected five cells; not a bill row.
            continue
        bill_title = bill.text_content()
        # Infer the bill's chamber from its designation letter.
        if "S" in bill_title:
            bill_chamber = "upper"
        elif "H" in bill_title:
            bill_chamber = "lower"
        else:
            continue
        event.add_related_bill(bill_id=bill_title,
                               description=descr.text_content(),
                               chamber=bill_chamber,
                               type='consideration')
    self.save_event(event)
def scrape_meeting(self, session, url):
    # Scrape a single committee-meeting notice page and save an Event.
    # Chamber is derived from the committee title; unrecognized titles
    # are skipped.  "UPON ADJ..." meetings become all-day events.
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # Each of these is expected to match exactly one node.
    title, = page.xpath("//a[@id='linkTitle']//text()")
    date, = page.xpath("//span[@id='lDate']/text()")
    time, = page.xpath("//span[@id='lTime']/text()")
    location, = page.xpath("//span[@id='lLocation']/text()")

    # Normalize AM/PM markers ("Noon" counts as PM) so %p can parse them.
    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m.", "Noon"],
    }
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)

    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r'(?i)\d[AP]M$', time):
        time = time[:-2] + " " + time[-2:]

    if "UPON ADJ" in ' '.join(time.split()).upper():
        # "Upon adjournment": no fixed start time — record as all-day.
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime("%s %s" % (date, time),
                                          "%B %d, %Y %I:%M %p")

    # when = self._tz.localize(when)

    description = "Meeting on %s of the %s" % (date, title)
    chambers = {
        "house": "lower",
        "senate": "upper",
        "joint": "joint",
    }
    for chamber_, normalized in chambers.items():
        if chamber_ in title.lower():
            chamber = normalized
            break
    else:
        # Committee title names no recognizable chamber; skip.
        return

    event = Event(session, when, 'committee:meeting', description,
                  location=location, all_day=all_day)
    event.add_source(url)
    event.add_participant('host', title, 'committee', chamber=chamber)

    # First <tr valign="top"> is the header row; skip it.
    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            # Row doesn't have the expected five cells; not a bill row.
            continue
        bill_title = bill.text_content()
        # Infer the bill's chamber from its designation letter.
        if "S" in bill_title:
            bill_chamber = "upper"
        elif "H" in bill_title:
            bill_chamber = "lower"
        else:
            continue
        event.add_related_bill(bill_id=bill_title,
                               description=descr.text_content(),
                               chamber=bill_chamber,
                               type='consideration')
    self.save_event(event)
def scrape_page(self, chamber, session):
    """Scrape one chamber's weekly meeting grid and save an Event per slot.

    The grid is a table whose header row holds one date per column and
    whose body rows hold one committee per row, with a time/room/bills
    block in each cell.  `pages` is a module-level chamber->URL map.
    """
    url = pages[chamber]
    page = self.lxmlize(url)
    rows = page.xpath("//table[@class='MsoNormalTable']/tr")
    header = rows[0]
    rows = rows[1:]

    # Column index -> date string, taken from the header row.
    dates = {}
    dIdex = 0
    for row in header.xpath(".//td")[1:]:
        date = row.text_content()
        date = re.sub("\s+", " ", date).strip()
        dates[dIdex] = date
        dIdex += 1

    def _parse_time_block(block):
        # Parse one grid cell into (hours, room, bills).
        # hours: list of time strings (a cell may say "X and Y");
        # room:  room number if the cell contains "Rm", else None;
        # bills: list of {"bill_id", "description"} dicts.
        # Returns (None, None, []) for empty/unparseable cells.
        if block.strip() == "No Meeting":
            return None, None, []
        bills = []
        room = None
        blocks = [x.strip() for x in block.split("\n")]
        # Line 1 is the time (parenthesized notes stripped), line 2 the bills.
        hour = re.sub("\(.*\)", "", blocks[1])
        bills = blocks[2]
        if "after" in hour or "after" in bills:
            # Relative times ("after session") can't be scheduled; skip.
            return None, None, []
        # Extra time cleanup
        # "Rm"
        if "Rm" in hour:
            inf = hour.split("Rm")
            assert len(inf) == 2
            room = inf[1]
            hour = inf[0]
        # "and" — a cell may list two meeting times.
        hour = [x.strip() for x in hour.split('and')]
        # We'll pass over this twice: first a single bill, then a list.
        single_bill = re.search("(H|S)(B|R) \d{3}", bills)
        if single_bill is not None:
            # .span() is the documented equivalent of the internal .regs[0].
            start, end = single_bill.span()
            description = bills
            bills = bills[start:end]
            bills = [{"bill_id": bills, "description": description}]
        else:
            multi_bills = re.search("(H|S)(B|R|M)s (\d{3}(; )?)+", bills)
            if multi_bills is not None:
                # parse away.
                bill_array = bills.split()
                type = bill_array[0]
                bill_array = bill_array[1:]
                bill_array = [
                    x.replace(";", "").strip() for x in bill_array
                ]
                # "HBs" -> "HB" etc.
                type = type.replace("s", "")
                bill_array = [{
                    "bill_id": "%s %s" % (type, x),
                    "description": bills
                } for x in bill_array]
                bills = bill_array
            else:
                # BUGFIX: previously `bills` was left as the raw cell
                # string here, and the caller would iterate over its
                # characters and crash on bill['bill_id'].  Match the
                # newer sibling implementation: warn and skip.
                self.warning("Unknown bill thing: %s" % (bills))
                bills = []
        return hour, room, bills

    for row in rows:
        tds = row.xpath(".//td")
        ctty = re.sub("\s+", " ", tds[0].text_content().strip())
        times = tds[1:]
        for i in range(0, len(times)):
            hours, room, bills = _parse_time_block(times[i].text_content())
            if hours is None or bills == []:
                continue
            # One Event per listed meeting time.
            for hour in hours:
                datetime = "%s %s" % (dates[i], hour)
                datetime = datetime.encode("ascii", "ignore")
                # DAY_OF_WEEK MONTH/DAY/YY %I:%M %p"
                datetime = dt.datetime.strptime(datetime,
                                                "%A %m/%d/%y %I:%M %p")
                event = Event(session, datetime, 'committee:meeting',
                              'Meeting Notice', "Room %s" % (room))
                event.add_source(url)
                for bill in bills:
                    event.add_related_bill(bill['bill_id'],
                                           description=bill['description'],
                                           type='consideration')
                event.add_participant("host", ctty, chamber=chamber)
                self.save_event(event)
def scrape_event_page(self, url, chamber, session):
    """Scrape one MI committee-meeting detail page and save an Event.

    The page is a key/value table (Date, Time, Location, Committee,
    Chair, Agenda); agenda links become related bills.
    """
    page = self.lxmlize(url)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    # Label -> {"txt": cell text, "obj": cell element}.
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {"txt": val.text_content().strip(), "obj": val}
    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (metainf['Date']['txt'], metainf['Time']['txt'])
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",   # This is due to a nasty line they had.
        "a.m": "AM"    # another weird one
    }
    # BUGFIX: iterate longest keys first.  Plain dict iteration order is
    # arbitrary here, and if "a.m" ran before "a.m.", "3:00 a.m." became
    # "3:00 AM." whose trailing dot broke strptime below.
    for t in sorted(translate, key=len, reverse=True):
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub("\s+", " ", datetime)
    # Strip free-text caveats appended after the actual time.
    for text_to_remove in [
            "or after committees are given leave",
            "or later immediately after committees are given leave",
            "or later after committees are given leave by the House to meet",
            "**Please note time**"]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace('p.m.', 'pm')
    datetime = datetime.replace('Noon', "pm")
    datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")

    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    if chamber == 'other':
        chamber = 'joint'

    event = Event(session, datetime, 'committee:meeting',
                  title, location=where)
    event.add_source(url)
    event.add_source(mi_events)
    event.add_participant('chair', metainf['Chair']['txt'],
                          'legislator', chamber=chamber)
    event.add_participant('host', metainf['Committee']['txt'],
                          'committee', chamber=chamber)

    agenda = metainf['Agenda']['obj']
    agendas = agenda.text_content().split("\r")
    related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # Use the agenda line mentioning this bill as its description.
        # NOTE(review): if no line matches, `description` stays the lxml
        # element rather than a string — confirm downstream handling.
        description = agenda
        for a in agendas:
            if bill.text_content() in a:
                description = a
        event.add_related_bill(bill.text_content(),
                               description=description,
                               type='consideration')
    self.save_event(event)
def scrape(self, chamber, session):
    # Scrape the WY committee-calendar menu for one chamber, follow each
    # dated calendar link, split its table into per-meeting row groups,
    # and save an Event (with bills/documents) for each meeting.
    if chamber == 'other':
        return
    calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                    "CalendarMenu/CommitteeMenu.aspx" % str(session))
    page = self.lxmlize(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET control ids are numbered from 02 for the first data row.
        row_ident = '%02d' % (i + 2)
        date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                      'Calendars_ctl%s_lblDate"]' % str(row_ident))
        date_string = row.xpath(date_xpath)[0].text_content()
        # First letter of the chamber name ("H"/"S") selects the link id.
        chamber_char = self.metadata['chambers'][chamber]['name'][0].upper(
        )
        meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                         'Calendars_ctl%s_hl%scallink"]' %
                         (str(row_ident), chamber_char))
        meeting_url = row.xpath(meeting_xpath)
        if (len(meeting_url) == 1 and
                meeting_url[0].text_content().strip() != ''):
            try:
                meeting_url = meeting_url[0].attrib['href']
            except KeyError:
                self.warning("Alleged meeting date has no URL: " +
                             meeting_url[0].text_content().strip())
                continue
            meeting_page = self.lxmlize(meeting_url)
            meetings = meeting_page.xpath(
                './/table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing. meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                if len(meeting_idents) == 1 or i + 1 == len(
                        meeting_idents):
                    # Last (or only) meeting: take all remaining rows.
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [
                        meeting_ident, meeting_idents[i + 1] - 1
                    ]
                    # Ensure at least two rows per meeting slice.
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == '':
                    # Fall back to the committee name as description.
                    description = committee
                event = Event(session, meeting_date_time,
                              'committee:meeting', description, location)
                event.add_source(meeting_url)
                for bill in bills:
                    if bill['bill_description'] == '':
                        bill['bill_description'] = committee
                    event.add_related_bill(
                        bill_id=bill['bill_id'],
                        description=bill['bill_description'],
                        type='consideration')
                    event.add_document(name=bill['bill_id'],
                                       url=bill['bill_url'],
                                       type='bill',
                                       mimetype='application/pdf')
                event.add_participant(type='host',
                                      participant=committee,
                                      participant_type='committee',
                                      chamber=chamber)
                self.save_event(event)
def scrape_lower(self, session):
    # Scrape the OH House committee-schedule PDF: download it, convert to
    # text, split into day/committee/event sections, and save an Event
    # for each parsed meeting (with any bills found in its description).
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text')
    os.remove(path)

    # Split on date headers like "Wednesday, May 16, 2012"; the capture
    # group makes re.split alternate header/body in the result list.
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Even positions are the captured date headers.
            date = day[1]
        else:
            # Odd positions are that day's body; split it again into
            # alternating committee-name / event-text chunks.
            events = re.split(r'\n((?:\w+\s?)+)\n', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        # Chunk doesn't match the meeting pattern; skip.
                        continue

                    # "3:00 p.m." -> "3:00 PM" for strptime's %p.
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)

                    location = location.strip()

                    # Drop blank lines and page-number-ish lines starting
                    # with a digit.  (.decode is Python 2 str->unicode.)
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ]).decode('ascii', 'ignore')
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(session=session,
                                  when=time,
                                  type='committee:meeting',
                                  description=description,
                                  location=location)
                    event.add_source(PDF_URL)
                    event.add_participant(type='host',
                                          participant=comm,
                                          participant_type='committee',
                                          chamber='lower')
                    # Bill references like "H. B. 123 <relation>".
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            event.add_related_bill(bill_id=related_bill,
                                                   type='consideration',
                                                   description=relation)
                    self.save_event(event)
def scrape(self, chamber, session):
    # Scrape the WY committee-calendar menu for one chamber, follow each
    # dated calendar link, split its table into per-meeting row groups,
    # and save an Event (with bills/documents) for each meeting.
    if chamber == 'other':
        return
    calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                    "CalendarMenu/CommitteeMenu.aspx" % str(session))
    page = self.get_page_from_url(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET control ids are numbered from 02 for the first data row.
        row_ident = '%02d' % (i + 2)
        date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                      'Calendars_ctl%s_lblDate"]' % str(row_ident))
        date_string = row.xpath(date_xpath)[0].text_content()
        # First letter of the chamber name ("H"/"S") selects the link id.
        chamber_char = self.metadata['chambers'][
            chamber]['name'][0].upper()
        meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                         'Calendars_ctl%s_hl%scallink"]' % (
                             str(row_ident), chamber_char
                         ))
        meeting_url = row.xpath(meeting_xpath)
        if (len(meeting_url) == 1 and
                meeting_url[0].text_content().strip() != ''):
            meeting_url = meeting_url[0].attrib['href']
            meeting_page = self.get_page_from_url(meeting_url)
            meetings = meeting_page.xpath(
                './/table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing. meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                    # Last (or only) meeting: take all remaining rows.
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [
                        meeting_ident, meeting_idents[i + 1] - 1
                    ]
                    # Ensure at least two rows per meeting slice.
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == '':
                    # Fall back to the committee name as description.
                    description = committee
                event = Event(
                    session,
                    meeting_date_time,
                    'committee:meeting',
                    description,
                    location
                )
                event.add_source(meeting_url)
                for bill in bills:
                    if bill['bill_description'] == '':
                        bill['bill_description'] = committee
                    event.add_related_bill(
                        bill_id=bill['bill_id'],
                        description=bill['bill_description'],
                        type='consideration'
                    )
                    event.add_document(
                        name=bill['bill_id'],
                        url=bill['bill_url'],
                        type='bill',
                        mimetype='application/pdf'
                    )
                event.add_participant(
                    type='host',
                    participant=committee,
                    participant_type='committee',
                    chamber=chamber
                )
                self.save_event(event)
def scrape(self, chamber, session):
    """Scrape the agenda listing for one chamber over the next 4 weeks.

    Queries `event_page` (module-level URL template taking chamber name,
    start date, end date), splits each AgendaCommittee table into
    header/body pairs separated by <hr> rows, and saves one Event per
    scheduled meeting with its agenda bills attached.
    """
    cha = {
        "upper": "senate",
        "lower": "house",
        "other": "joint",
    }[chamber]

    # Window: today through four weeks out, formatted mm/dd/yyyy.
    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    then = now + timedelta(weeks=4)
    end = then.strftime(print_format)
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    def _split_tr(trs):
        # Group rows into sub-lists, splitting at rows containing <hr>.
        ret = []
        cur = []
        for tr in trs:
            if len(tr.xpath(".//hr")) > 0:
                ret.append(cur)
                cur = []
                continue
            cur.append(tr)
        if cur != []:
            ret.append(cur)
        return ret

    tables = page.xpath("//table[@class='AgendaCommittee']")
    for table in tables:
        # grab agenda, etc
        trs = table.xpath(".//tr")
        events = _split_tr(trs)
        for event in events:
            # Each group is exactly a header row plus a body row.
            assert len(event) == 2
            header = event[0]
            body = event[1]

            # Header <h2> is "<committee> - <mm/dd/yy hh:mm AM/PM>".
            whowhen = header.xpath(".//h2")[0].text_content()
            blocks = [x.strip() for x in whowhen.rsplit("-", 1)]
            who = blocks[0]
            when = blocks[1].replace(u'\xa0', ' ')
            if "TBA" in when:
                continue  # XXX: Fixme

            # (Removed dead locals from the original: `descr` was
            # assigned body.xpath(".//*") but always overwritten below,
            # and `flush = False` was never read.)

            # Location is the tail text after the second <br>.
            where = body.xpath(".//br")[1].tail
            if where is not None:
                where = where.strip()
            else:
                where = "unknown"

            when = dt.datetime.strptime(when, "%m/%d/%y %I:%M %p")
            meeting_title = "Scheduled Meeting"  # XXX: Fixme

            agenda = self.scrape_agenda(body.xpath(".//ol"))
            event = Event(session, when, 'committee:meeting',
                          meeting_title, location=where)
            event.add_participant("host", who, chamber=chamber)
            event.add_source(url)
            for item in agenda:
                bill = item['bill']
                descr = item['descr']
                event.add_related_bill(bill,
                                       description=descr,
                                       type="consideration")
            self.save_event(event)
def scrape(self, chamber, session):
    """Scrape the MD joint hearing-schedule page (text-only <pre> notices).

    Only runs for chamber == 'other' — the page covers all hearings in
    one shot.  Each <pre> block is parsed positionally: line 1 holds the
    date, line 3 time+room, line 4 onward the address block, and the
    remainder "Key: value" metadata including the Subject.
    """
    if chamber != 'other':
        return None  # We're going to do it all on one shot.

    if session[-2:] == "s1":
        return None  # Special sessions 404

    url = "http://mlis.state.md.us/%s/hearsch/alladd.htm" % (session)
    page = self.lxmlize(url)
    events = page.xpath("//pre")
    for event in events:
        # Committee name is the text before the first "-" in the heading
        # element preceding this <pre>.
        ctty_name = [
            x.strip() for x in
            event.getparent().getprevious().text_content().split("-", 1)
        ]
        ctty_name = ctty_name[0]

        event_text = event.text_content()
        if "This meeting has been cancelled." in event_text:
            continue

        # OK. In order to process this text-only notice, we have to resort
        # to some major hackage. Just roll with it.
        lines = event_text.split("\n")
        # In order to get the key stuff, we need to figure out where the
        # address "block" starts.
        address_block = last_space(lines[4])
        assert address_block is not None
        # OK. Given the offset, we can "split" the time off the date block.
        time_room = lines[3]
        time = time_room[:address_block].strip()
        if "TBD" in time:
            continue  # Nothing's set yet.

        time = "%s %s" % (lines[1], time)
        time = re.sub("\s+", " ", time).strip()
        trans = {"P.M.": "PM", "A.M.": "AM"}
        for transition in trans:
            time = time.replace(transition, trans[transition])
        when = dt.datetime.strptime(time, "%A %B %d, %Y %I:%M %p")

        room = time_room[address_block:].strip()
        # Collect address lines (up to the first blank line) into `where`.
        place_block = lines[4:]
        where = room + "\n"
        done = False
        offset = 4
        for place in place_block:
            if place.strip() == "":
                done = True
            if done:
                continue
            offset += 1
            where += place.strip() + "\n"
        where = where.strip()

        # Now that the date's processed, we can move on.
        # Remaining lines are "Key: value" pairs; continuation lines
        # (no colon) are appended to the previous key's value.
        # NOTE(review): a continuation line before any keyed line would
        # raise KeyError on "unattached_header" — confirm page format.
        moreinfo = lines[offset + 1:]
        info = {}
        key = "unattached_header"
        for inf in moreinfo:
            if ":" in inf:
                key, value = inf.split(":", 1)
                key = key.strip()
                info[key] = value.strip()
            else:
                info[key] += " " + inf.strip()

        # Alright. We should have enough now.
        subject = info['Subject']
        event = Event(session, when, 'committee:meeting',
                      subject, location=where)
        event.add_source(url)

        # Derive the hosting chamber from the committee name.
        flags = {"joint": "joint", "house": "lower", "senate": "upper"}
        chamber = "other"
        for flag in flags:
            if flag in ctty_name.lower():
                chamber = flags[flag]

        # Let's try and hack out some bill names.
        trans = {
            "SENATE": "S",
            "HOUSE": "H",
            "JOINT": "J",
            "BILL": "B",
            "RESOLUTION": "R",
        }
        _t_subject = subject.upper()
        for t in trans:
            regex = "%s(\s+)?" % t
            _t_subject = re.sub(regex, trans[t], _t_subject)
        # BUGFIX: removed stray debug statement `print _t_subject` that
        # was left in production code (and is a syntax error on py3).

        bills = re.findall("(S|H)(J)?(B|R|M)\s*(\d{4})", _t_subject)
        for bill in bills:
            name = bill[:3]
            bid = bill[3]
            bill_id = "%s %s" % (''.join(name), bid)
            event.add_related_bill(bill_id,
                                   description=subject,
                                   type='consideration')

        event.add_participant("host", ctty_name, chamber=chamber)
        self.save_event(event)
def scrape_committee_agendas(self, chamber, session):
    """ Scrape upper or lower committee agendas """
    # could use &ShowAll=ON doesn't seem to work though
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % \
        self._chamber_short[chamber]
    with self.urlopen(url) as agendas:
        root = html.fromstring(agendas)
        # The agenda table sits at a different nesting depth per chamber.
        if chamber == 'upper':
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table')[0]
        else:
            event_table = root.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                     '/td/table/tr/td/table/tr/td/table')[0]
        # First two rows are headers.
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
            # HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            # Column offsets differ per chamber (see layouts above).
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            # Times look like "1:30 P" / "1:30 A"; append "M" for %p.
            time = re.match('(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when,
                                                  '%m/%d/%Y %I:%M %p')
            else:
                # No parseable time; date-only event.
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')

            when = self._tz.localize(when)

            title = "Committee Meeting:\n%s %s %s\n" % (
                self._chamber_long[chamber], committee, room)

            # parse_agenda pulls details out of the linked agenda page.
            agenda_info = self.parse_agenda(chamber, link)
            description = agenda_info['description']
            member_list = agenda_info['member_list']
            # NOTE(review): meeting_type, agenda_items and other are
            # extracted but unused below.
            meeting_type = agenda_info['meeting_type']
            agenda_items = agenda_info['agenda_items']
            related_bills = agenda_info['related_bills']
            other = agenda_info['other']

            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link,
                          details=description)  # , agenda=agenda_items)
            event.add_participant('committee', committee, chamber=chamber)

            for bill in related_bills:
                event.add_related_bill(bill, type="consideration")

            event['participants'].extend(member_list)
            event.add_source(url)
            event.add_source(link)
            self.save_event(event)
def scrape_page(self, chamber, session):
    # Scrape one chamber's weekly meeting grid and save an Event per slot.
    # Header cells hold weekday names; the actual calendar date is
    # computed from the "Week of <Month> <day>, <year>" page heading.
    url = pages[chamber]
    page = self.lxmlize(url)
    rows = page.xpath("//table[@class='MsoNormalTable']/tr")
    header = rows[0]
    rows = rows[1:]

    # "Week of May 13, 2013" -> month/day/year of that week's Monday.
    week_of = page.xpath("//h3[@align='center']/b/text()")[0]
    match = re.match(
        "(?i)Week of (?P<month>.*) (?P<day>\d+), (?P<year>\d{4})",
        week_of)
    day_info = match.groupdict()
    monday_dom = int(day_info['day'])
    days = ["monday", "tuesday", "wednesday", "thursday", "friday",
            "saturday", "sunday"]

    # Column index -> weekday-name header text.
    dates = {}
    dIdex = 0
    for row in header.xpath(".//td")[1:]:
        date = row.text_content()
        date = re.sub("\s+", " ", date).strip()
        dates[dIdex] = date
        dIdex += 1

    def _parse_time_block(block):
        # Parse one grid cell into (hours, room, bills).
        # hours: list of time strings (a cell may say "X and Y");
        # room:  room number if the cell contains "Rm", else None;
        # bills: list of {"bill_id", "description"} dicts.
        # Returns (None, None, []) for empty/unparseable cells.
        if block.strip() == "No Meeting":
            return None, None, []
        bills = []
        room = None
        blocks = [x.strip() for x in block.split("\n")]
        # Line 1 is the time (parenthesized notes stripped), line 2 bills.
        hour = re.sub("\(.*\)", "", blocks[1])
        bills = blocks[2]
        bills = bills.encode('ascii', errors='ignore')
        if "after" in hour or "after" in bills:
            # Relative times ("after session") can't be scheduled; skip.
            return None, None, []
        # Extra time cleanup
        # "Rm"
        if "Rm" in hour:
            inf = hour.split("Rm")
            assert len(inf) == 2
            room = inf[1]
            hour = inf[0]
        # "and" — a cell may list two meeting times.
        hour = [x.strip() for x in hour.split('and')]
        # We'll pass over this twice: first a single bill, then a list.
        single_bill = re.search("(H|S)(C?)(B|R) \d+", bills)
        if single_bill is not None:
            start, end = single_bill.regs[0]
            description = bills
            bills = bills[start:end]
            bills = [{"bill_id": bills, "description": description}]
        else:
            multi_bills = re.search("(H|S)(B|R|M)s (\d+((;,) )?)+", bills)
            if multi_bills is not None:
                # parse away.
                bill_array = bills.split()
                type = bill_array[0]
                bill_array = bill_array[1:]

                def _c(f):
                    # Strip separator punctuation and squash whitespace.
                    for thing in [";", ",", "&", "*"]:
                        f = f.replace(thing, "")
                    return re.sub("\s+", " ", f).strip()
                bill_array = [_c(x) for x in bill_array]
                # "HBs" -> "HB" etc.
                type = type.replace("s", "")
                bill_array = [{
                    "bill_id": "%s %s" % (type, x),
                    "description": bills
                } for x in bill_array]
                bills = bill_array
            else:
                self.warning("Unknwon bill thing: %s" % (bills))
                bills = []
        return hour, room, bills

    for row in rows:
        tds = row.xpath(".//td")
        ctty = re.sub("\s+", " ", tds[0].text_content().strip())
        times = tds[1:]
        for i in range(0, len(times)):
            hours, room, bills = _parse_time_block(times[i].text_content())
            if hours is None or bills == []:
                continue
            # One Event per listed meeting time.
            for hour in hours:
                datetime = "%s %s" % (dates[i], hour)
                datetime = datetime.encode("ascii", "ignore")
                # DAY_OF_WEEK MONTH/DAY/YY %I:%M %p"
                # NOTE(review): comment above looks stale — dates[i]
                # appears to be a weekday name only; the full date is
                # rebuilt from the "Week of" heading below.
                dow, time = datetime.split()
                month = day_info['month']
                year = day_info['year']
                # Day-of-month = Monday's day + weekday offset.
                day = monday_dom + days.index(dow.lower())
                datetime = "%s %s %s, %s %s" % (dow, month, day, year, time)
                formats = [
                    "%A %B %d, %Y %I:%M %p",
                    "%A %B %d, %Y %I:%M%p",
                    "%A %B %d, %Y %I %p",
                    "%A %B %d, %Y %I%p",
                ]
                dtobj = None
                for fmt in formats:
                    try:
                        dtobj = dt.datetime.strptime(datetime, fmt)
                    except ValueError as e:
                        continue
                if dtobj is None:
                    self.warning("Unknown guy: %s" % (datetime))
                    raise Exception
                datetime = dtobj
                event = Event(session, datetime, 'committee:meeting',
                              'Meeting Notice', "Room %s" % (room))
                event.add_source(url)
                for bill in bills:
                    event.add_related_bill(bill['bill_id'],
                                           description=bill['description'],
                                           type='consideration')
                event.add_participant("host", ctty, 'committee',
                                      chamber=chamber)
                self.save_event(event)