def scrape_lower_event(self, url):
    """Scrape one NY Assembly hearing-agenda page and yield an Event."""
    doc = lxml.html.fromstring(self.get(url).content)
    doc.make_links_absolute(url)

    agenda_table = doc.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
    header_lines = agenda_table.xpath("tr[1]/td[1]/text()")

    # careful, the committee name in the page #committee_div
    # is getting inserted via JS
    # so use the one from the table, and strip the chair name
    committee = re.sub(r"\(.*\)", "", header_lines[0])
    committee = f"Assembly {committee}"

    start = self._tz.localize(dateutil.parser.parse(header_lines[1]))
    where = header_lines[2]

    event = Event(
        name=committee,
        start_date=start,
        location_name=where,
    )
    event.add_participant(committee, type="committee", note="host")
    event.add_source(url)

    # Any /leg/ links in the table are bills on the agenda.
    bill_links = agenda_table.xpath('.//a[contains(@href, "/leg/")]')
    if bill_links:
        agenda = event.add_agenda_item("Bills under Consideration")
        for link in bill_links:
            agenda.add_bill(link.text_content().strip())

    yield event
def upper_parse_agenda_item(self, item):
    # Fetch the full meeting detail for one NY Senate agenda item from the
    # OpenLegislation API, then yield an Event for the matching addendum.
    response = self.api_client.get(
        "meeting",
        year=item["agendaId"]["year"],
        agenda_id=item["agendaId"]["number"],
        committee=item["committeeId"]["name"],
    )
    data = response["result"]

    chamber = data["committee"]["committeeId"]["chamber"].title()
    com_code = data["committee"]["committeeId"]["name"]
    com_name = f"{chamber} {com_code}"

    # each "meeting" is actually a listing page of multiple meetings of the same committee
    # broken out by different addendumId
    for addendum in data["committee"]["addenda"]["items"]:
        # Only the addendum referenced by the incoming item is wanted.
        if addendum["addendumId"] != item["addendum"]:
            continue

        meeting = addendum["meeting"]

        when = dateutil.parser.parse(meeting["meetingDateTime"])
        when = self._tz.localize(when)

        location = meeting["location"]
        description = meeting["notes"]

        if location == "":
            location = "See Committee Site"

        # Skip canceled meetings entirely.
        if "canceled" in description.lower():
            continue

        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_participant(com_name, type="committee", note="host")

        # Build the committee's public-site slug from its name.
        # NOTE(review): this rebinds com_code in place, so a second matching
        # addendum would slugify an already-slugified value — presumably only
        # one addendum can match per item; confirm.
        com_code = (com_code.lower().replace("'", "").replace(" ", "-").replace(
            ",", ""))
        url = f"https://www.nysenate.gov/committees/{com_code}"
        event.add_source(url)

        bills = addendum["bills"]["items"]
        if len(bills) > 0:
            agenda = event.add_agenda_item("Bills under consideration")
            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])

        yield event
def scrape_events(self, session, start_date):
    """Scrape committee meetings from the legislative API and yield Events.

    session: session identifier used to look up the API session key.
    start_date: optional "YYYY-MM-DD" string; defaults to today.
    Raises EmptyScrape when the API returns no meetings.
    """
    session_key = SESSION_KEYS[session]

    if start_date is None:
        start_date = datetime.date.today()
    else:
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    # Map committee codes to display names for event naming below.
    committees_by_code = {}
    committees_response = self.api_client.get("committees", session=session_key)
    for committee in committees_response:
        committees_by_code[
            committee["CommitteeCode"]] = committee["CommitteeName"]

    meetings_response = self.api_client.get(
        "committee_meetings",
        start_date=start_date.strftime(self._DATE_FORMAT),
        session=session_key,
    )

    if len(meetings_response) == 0:
        raise EmptyScrape

    for meeting in meetings_response:
        event_date = self._TZ.localize(
            datetime.datetime.strptime(meeting["MeetingDate"],
                                       self._DATE_FORMAT))
        com_name = committees_by_code[meeting["CommitteeCode"]]

        event = Event(start_date=event_date,
                      name=com_name,
                      location_name=meeting["Location"])
        event.add_source(meeting["AgendaUrl"])
        event.extras["meeting_guid"] = meeting["MeetingGuid"]
        # BUGFIX: previously read committee["CommitteeCode"] — the stale loop
        # variable left over from the committees loop above — so every event
        # got the last committee's code. Use this meeting's code instead.
        event.extras["committee_code"] = meeting["CommitteeCode"]
        event.add_participant(com_name, type="committee", note="host")

        # A row with Comments starts a new agenda item; a row with a measure
        # attaches the bill to the current item.
        agenda = None
        for row in meeting["CommitteeAgendaItems"]:
            if row["Comments"] is not None:
                agenda = event.add_agenda_item(row["Comments"])
            if row["MeasureNumber"] is not None:
                bill_id = "{} {}".format(row["MeasurePrefix"],
                                         row["MeasureNumber"])
                # BUGFIX: a measure before any Comments row previously raised
                # NameError (or attached the bill to a *previous* meeting's
                # agenda item, since `agenda` leaked across iterations).
                if agenda is None:
                    agenda = event.add_agenda_item(bill_id)
                agenda.add_bill(bill_id)

        for row in meeting["CommitteeMeetingDocuments"]:
            event.add_document(
                note=row["ExhibitTitle"],
                url=row["DocumentUrl"],
                on_duplicate="ignore",
            )
        yield event
def scrape(self, start=None, end=None):
    """Scrape the calendar feed and yield one Event per committee hearing.

    start/end are optional parseable date strings; the window defaults to
    today through three months out.
    """
    first = dt.datetime.today() if start is None else dateutil.parser.parse(start)
    last = first + relativedelta(months=+3) if end is None else dateutil.parser.parse(end)

    first = first.strftime("%Y-%m-%d")
    last = last.strftime("%Y-%m-%d")

    url = f"{self.base_url}calendar-data?start={first}&end={last}"
    data = json.loads(self.scraper.get(url).content)

    for item in data:
        name = item["title"].strip()
        lowered = name.lower()

        # Skip cancellations and floor sessions.
        if "canceled" in lowered:
            continue
        if "house session" in lowered or "senate session" in lowered:
            continue

        detail_url = f"{self.base_url}{item['url']}"
        when = self._tz.localize(dateutil.parser.parse(item["start"]))

        detail = lxml.html.fromstring(self.scraper.get(detail_url).content)
        location = detail.xpath(
            '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
        )[0].strip()
        agenda_url = detail.xpath(
            '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
        )[0]

        event = Event(
            name=name,
            start_date=when,
            location_name=location,
        )
        event.add_participant(name, type="committee", note="host")
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        event.add_source(detail_url)
        yield event
def scrape_upper(self):
    # Scrape the MO Senate hearings-schedule page and yield one Event
    # per hearing.
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"

    html = self.get(listing_url).text

    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        location = self.row_content(page, "Room:")

        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101")

        # com = self.row_content(page, 'Committee:')
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        # Drop the trailing chair name, e.g. "Agriculture, Senator Smith".
        com = com.split(", Senator")[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time)))

        event = Event(start_date=start_date, name=com, location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")

        # Each agenda entry sits in its own fixed-width table; tables with a
        # bill link get the bill attached, others become bare agenda items.
        for bill_table in page.xpath(
                '//table[@width="85%" and @border="0"]'):
            bill_link = ""
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
                bill_link = bill_table.xpath(
                    self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)

        yield event
def scrape_cal_page(self, url):
    """Scrape one calendar listing page, yielding Events, then recurse into
    the next page whenever an 'Upcoming Events' link is present."""
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for row in page.xpath("//article[contains(@class,'accordion')]"):
        when = row.xpath(".//time/@datetime")[0]
        when = dateutil.parser.parse(when)

        title = row.xpath(
            ".//h3[contains(@class,'heading-link')]/text()")[0].strip()

        description = row.xpath(
            "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
        )[0].text_content()

        # fix special chars
        description = (description.replace("\n\u2013", " ").replace(
            "\n", " ").replace("\u203a", ""))
        description = description.replace("More about this event", "").strip()

        location = row.xpath(
            "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
        )[0].text_content()

        event = Event(
            name=title,
            description=description,
            start_date=when,
            location_name=location,
        )

        agenda_url = row.xpath(
            ".//a[contains(text(),'More about this event')]/@href")
        if agenda_url != []:
            event.add_document("Details and Agenda",
                               agenda_url[0],
                               media_type="text/html")

        if "committee meeting" in title.lower():
            com_name = title.replace("Committee Meeting", "").strip()
            # BUGFIX: participant type was misspelled "commitee", which
            # produced an invalid participant classification on every
            # committee-meeting event.
            event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)

        yield event

    # Paginate: follow the "Upcoming Events" link recursively.
    if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
        next_url = page.xpath(
            "//a[contains(text(), 'Upcoming Events')]/@href")[0]
        yield from self.scrape_cal_page(next_url)
def parse_event(self, row, chamber):
    # Parse one meeting row from the AK legislature XML API into an Event.
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath("string(Sponsor)").strip()

    if committee_code in self.COMMITTEES[chamber]:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]["name"],
        )
    else:
        # Unknown sponsor codes fall back to a generic label.
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            "MISCELLANEOUS",
        )

    name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                          row.xpath("string(Title)").strip())

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    if name == "":
        name = committee_name

    location = row.xpath("string(Location)").strip()

    # events with no location all seem to be committee hearings
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    # NOTE(review): start_date stays naive here, unlike the other scrapers in
    # this file — confirm whether the feed's times are already local.
    event = Event(start_date=start_date, name=name, location_name=location)

    event.add_source("http://w3.akleg.gov/index.php#tab4")

    # Only attach a host participant when the sponsor code is recognized.
    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(committee_name, type="committee", note="host")

    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc != "":
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                bill_id = item.xpath("string(BillRoot)")
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r"\s+", " ", bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_lower_item(self, page):
    """Build an Event from a single MO House hearing listing."""
    # print(lxml.etree.tostring(page, pretty_print=True))
    committee = self.table_row_content(page, "Committee:")
    raw_date = self.table_row_content(page, "Date:")
    raw_time = self.table_row_content(page, "Time:")
    where = self.table_row_content(page, "Location:")

    # Capitol hearing rooms get the full street address appended.
    if "house hearing room" in where.lower():
        where = "{}, {}".format(
            where, "201 W Capitol Ave, Jefferson City, MO 65101")

    # fix some broken times, e.g. '12 :00'
    # a.m. and p.m. seem to confuse dateutil.parser
    raw_time = raw_time.replace(" :", ":").replace("A.M.", "AM").replace("P.M.", "PM")

    # some times have extra info after the AM/PM
    if "upon" in raw_time:
        raw_time = raw_time.split("AM", 1)[0]
        raw_time = raw_time.split("PM", 1)[0]

    # fix '- Upcoming', '- In Progress' in dates
    raw_date = re.sub(r"- (.*)", "", raw_date).strip()

    try:
        start = dateutil.parser.parse(f"{raw_date} {raw_time}")
    except dateutil.parser._parser.ParserError:
        # Fall back to a date-only parse when the time is unusable.
        start = dateutil.parser.parse(raw_date)
    start = self._TZ.localize(start)

    event = Event(start_date=start, name=committee, location_name=where)
    event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")
    event.add_participant(committee, type="committee", note="host")

    # different from general MO link xpath due to the <b>
    house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                        'or contains(@href, "bill.aspx")]/b/text()')
    for linked_title in page.xpath(house_link_xpath):
        number = linked_title.split("--")[0].strip().replace("HCS", "").strip()
        item = event.add_agenda_item(description=linked_title)
        item.add_bill(number)

    yield event
def scrape_chamber(self, chamber, session, start, end):
    """Yield Events for one chamber from the WA committee-meeting XML feed."""
    page = self.get_xml(start, end)

    for row in xpath(page, "//wa:CommitteeMeeting"):
        # Skip cancellations outright.
        if xpath(row, "string(wa:Cancelled)") == "true":
            continue

        # Keep only meetings belonging to the requested chamber.
        agency = xpath(row, "string(wa:Agency)")
        if self.chambers[agency] != chamber:
            continue

        start_dt = datetime.datetime.strptime(
            xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"
        )
        start_dt = self._tz.localize(start_dt)

        committee = xpath(row, "string(wa:Committees/wa:Committee/wa:LongName)")
        agenda_id = xpath(row, "string(wa:AgendaId)")
        notes = xpath(row, "string(wa:Notes)")

        # XML has a wa:Address but it seems useless
        where = "{}, {}, {} {}".format(
            xpath(row, "string(wa:Room)"),
            xpath(row, "string(wa:Building)"),
            xpath(row, "string(wa:City)"),
            xpath(row, "string(wa:State)"),
        )

        event = Event(
            name=committee,
            start_date=start_dt,
            location_name=where,
            description=notes,
        )
        event.add_source(
            "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                agenda_id
            )
        )
        event.add_participant(committee, type="committee", note="host")
        event.extras["agendaId"] = agenda_id

        self.scrape_agenda_items(agenda_id, event)

        yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event(start_date=start_date, end_date=end_date, name=title, location_name=location) event.add_source( "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx") for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) event.add_document( description, item.xpath("@href")[0], media_type="application/pdf", on_duplicate="ignore", ) for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath("string(.)").strip() agenda = event.add_agenda_item(description=description) bill = item.xpath( './/div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link("Video of Hearing", video[0].xpath("@href")[0], "text/html") if "subcommittee" in title.lower(): subcom = title.split("-")[0].strip() event.add_participant(subcom, type="committee", note="host") else: event.add_participant(com, type="committee", note="host") yield event
def scrape(self, session=None, chamber=None):
    # Scrape the AR legislature's pipe-delimited scheduled-meetings file.
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.get(url)
    page = csv.reader(StringIO(page.text), delimiter="|")

    for row in page:
        # Deal with embedded newline characters, which cause fake new rows
        LINE_LENGTH = 11
        while len(row) < LINE_LENGTH:
            row += next(page)

        desc = row[7].strip()

        # Only rows whose description ends "- HOUSE" / "- SENATE" are
        # committee meetings; everything else is skipped.
        match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
        if match:
            comm = match.group(1).strip()
            comm = re.sub(r"\s+", " ", comm)
            location = row[5].strip() or "Unknown"
            when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            when = self._tz.localize(when)
            # Only assign events to a session if they are in the same year
            # Given that session metadata have some overlap and
            # missing end dates, this is the best option available
            session_year = int(session[:4])
            if session_year != when.year:
                continue

            description = "%s MEETING" % comm
            event = Event(
                name=description,
                start_date=when,
                location_name=location,
                description=description,
            )
            event.add_source(url)

            event.add_participant(comm, type="committee", note="host")

            # time = row[3].strip()
            # if time in _TIMECODES:
            #     event['notes'] = TIMECODES[time]

            yield event
def scrape_page(self, url, session, chamber):
    """Scrape an IL hearing-notice detail page into a single Event.

    Returns the Event (note: returns, does not yield, unlike the other
    scrapers in this file).
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    # First table is a key/value grid of hearing metadata.
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf["Location:"]
    subject_matter = metainf["Subject Matter:"]
    description = "{}, {}".format(ctty_name, subject_matter)

    when = metainf["Scheduled Date:"]
    when = re.sub(r"\s+", " ", when)
    repl = {"AM": " AM", "PM": " PM"}  # Space shim.
    for r in repl:
        when = when.replace(r, repl[r])
    when = self.localize(dt.datetime.strptime(when, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=when, location_name=where)
    event.add_source(url)

    if ctty_name.startswith("Hearing Notice For"):
        # BUGFIX: str.replace returns a new string; the original discarded
        # the result, so the "Hearing Notice For" prefix was never stripped.
        ctty_name = ctty_name.replace("Hearing Notice For", "").strip()
    event.add_participant(ctty_name, "organization")

    # Second table lists the bills on the agenda.
    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def scrape_event_page(self, session, chamber, url, datetime):
    """Parse a TX hearing-notice page and yield a single Event."""
    page = self.lxmlize(url)

    # Collect the notice text and any "KEY: value" pairs from the <p> tags.
    fields = {}
    body = ""
    for para in page.xpath("//p"):
        text = re.sub(r"\s+", " ", para.text_content())
        body += text + "\n"
        if ":" in text:
            key, _, val = text.partition(":")
            fields[key.strip()] = val.strip()

    committee = fields["COMMITTEE"]
    where = fields["PLACE"]

    # The chair's name is sometimes jammed into the PLACE field.
    if "CHAIR" in where:
        where, chair_part = where.split("CHAIR:")
        fields["PLACE"] = where.strip()
        fields["CHAIR"] = chair_part.strip()

    chair = fields.get("CHAIR")

    body = re.sub(r"\s+", " ", body).strip()

    bill_pattern = r"(S|J|H)(B|M|R) (\d+)"
    found_bills = re.findall(bill_pattern, body)

    event = Event(name=committee,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.dedupe_key = url
    event.add_source(url)
    event.add_participant(committee, type="committee", note="host")
    if chair is not None:
        event.add_participant(chair, type="legislator", note="chair")

    # add a single agenda item, attach all bills
    agenda = event.add_agenda_item(body)
    for prefix, kind, number in found_bills:
        agenda.add_bill("%s%s %s" % (prefix, kind, number))

    yield event
def scrape(self):
    # Scrape the NC legislative calendar, one Event per committee meeting.
    url = "https://www.ncleg.gov/LegislativeCalendar/"
    page = self.lxmlize(url)
    page.make_links_absolute(url)

    for day_row in page.xpath('//div[@class="row cal-event-day"]'):
        date = day_row.xpath(
            './/div[contains(@class, "cal-event-day-full")]/text()'
        )[0].strip()
        for row in day_row.xpath(
                './/div[contains(@class, "cal-event row")]'):
            # first cal-event-row sometimes contains full date, skip that
            time = row.xpath(
                'div[contains(@class,"col-12 text-left col-sm-3 text-sm-right")]/text()'
            )[0].strip()
            event_row = row.xpath(
                'div[contains(@class,"col-12 col-sm-9 col-md-12 ")]')[0]
            # skip floor sessions
            if event_row.xpath(
                    './/a[contains(text(), "Session Convenes")]'):
                continue
            # The bolded span, when present, carries the chamber prefix
            # (e.g. "House:").
            chamber = ""
            if len(
                    event_row.xpath(
                        'span[contains(@class, "text-dark font-weight-bold")]/text()'
                    )):
                chamber = event_row.xpath(
                    'span[contains(@class, "text-dark font-weight-bold")]/text()'
                )[0].strip()
                chamber = chamber.replace(":", "")
            # sometimes there are unlinked events, usually just press conferences
            if not event_row.xpath('a[contains(@href,"/Committees/")]'):
                continue
            com_link = event_row.xpath(
                'a[contains(@href,"/Committees/")]')[0]
            com_name = com_link.text_content().strip()
            com_name = f"{chamber} {com_name}".strip()
            com_url = com_link.xpath("@href")[0]
            where = (row.xpath('div[contains(@class,"col-12 offset-sm-3")]'
                               )[0].text_content().strip())
            where = where.replace("STREAM", "")
            when = f"{date} {time}"
            try:
                when = dateutil.parser.parse(when)
                # occasionally they'd do 9am-1pm which confuses the TZ detection
                when = self._tz.localize(when)
            except (ParserError, ValueError):
                # Fall back to a date-only event when the time is unparseable.
                self.warning(
                    f"Unable to parse {time}, only using day component")
                when = dateutil.parser.parse(date)
                when = self._tz.localize(when).date()
            event = Event(
                name=com_name,
                start_date=when,
                location_name=where,
                classification="committee-meeting",
            )
            event.add_source(com_url)
            event.add_participant(com_name, type="committee", note="host")
            # NOTE: if you follow the committee link, there are agenda PDF links
            # but they don't load at all as of 2021-02-01 -- showerst
            for agenda_row in event_row.xpath(".//p"):
                agenda_text = agenda_row.text_content().strip()
                if agenda_text != "":
                    agenda = event.add_agenda_item(agenda_text)
                    for bill_row in agenda_row.xpath(
                            './/a[contains(@href,"BillLookUp")]/text()'):
                        agenda.add_bill(bill_row.split(":")[0])
            yield event
def scrape_lower(self):
    """Scrape MN House committee hearings from the all-schedules page."""
    url = "https://www.house.leg.state.mn.us/Schedules/All"
    page = self.lxmlize(url)

    for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
        # print(row.text_content())

        # skip floor sessions and unlinked events
        if not row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
        ):
            continue

        # skip joint ones, we'll get those from the senate API
        if row.xpath('div[contains(@class,"card-header bg-joint")]'):
            continue

        # top-level committee
        com = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        )[0].strip()
        com_link = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
        )[0]

        when = (row.xpath(
            'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
        )[0].replace("\r\n", "").strip())
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        if row.xpath('.//b[.="Location:"]'):
            where = row.xpath(
                './/b[.="Location:"]/following-sibling::text()[1]'
            )[0].strip()
        else:
            where = "See committee page"

        if row.xpath('.//b[.="Agenda:"]'):
            desc = "\n".join(
                row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()'
                          )).strip()
        else:
            desc = "See committee page"

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )

        event.add_source(com_link)

        # BUGFIX: previously passed `desc` (the whole agenda text) to
        # add_bill once per extracted id; attach each extracted bill id.
        for bill in get_bill_ids(desc):
            event.add_bill(bill)

        if row.xpath(
                ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
        ):
            agenda = event.add_agenda_item("Bills")
            for bill_id in row.xpath(
                    ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
            ):
                agenda.add_bill(bill_id.strip())

        for attachment in row.xpath(".//ul/li/div/a"):
            doc_url = attachment.xpath("@href")[0]
            doc_name = attachment.xpath("text()")[0].strip()

            # if they don't provide a name just use the filename
            if doc_name == "":
                parsed_url = urlparse(doc_url)
                # BUGFIX: basename() needs the path string; passing the
                # ParseResult object raised TypeError.
                doc_name = os.path.basename(parsed_url.path)

            # sometimes broken links to .msg files (emails?) are attached,
            # they always 404.
            if doc_url.endswith(".msg"):
                continue

            media_type = get_media_type(doc_url)
            event.add_document(doc_name,
                               doc_url,
                               media_type=media_type,
                               on_duplicate="ignore")

        for committee in row.xpath(
                'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        ):
            event.add_participant(committee, type="committee", note="host")

        yield event
def scrape_senate(self):
    """Scrape the US Senate committee hearings XML feed into Events."""
    url = "https://www.senate.gov/general/committee_schedules/hearings.xml"

    page = self.get(url).content
    page = lxml.etree.fromstring(page)

    rows = page.xpath("//meeting")

    for row in rows:
        com = row.xpath("string(committee)")

        # Entries with no committee aren't hearings; skip them.
        if com == "":
            continue

        com = "Senate {}".format(com)

        address = row.xpath("string(room)")
        parts = address.split("-")
        building_code = parts[0]
        # Expand known building codes ("SD-106" -> "Dirksen ..., Room 106").
        # Guard len(parts) so a room with no dash can't IndexError.
        if self.buildings.get(building_code) and len(parts) > 1:
            address = "{}, Room {}".format(
                self.buildings.get(building_code), parts[1])

        agenda = row.xpath("string(matter)")

        try:
            # BUGFIX: the feed's times are 12-hour with an AM/PM marker, and
            # strptime only honors %p together with %I — the old "%H:%M %p"
            # silently parsed afternoon hearings as morning.
            event_date = datetime.datetime.strptime(
                row.xpath("string(date)"), "%d-%b-%Y %I:%M %p")
        except ValueError:
            # Some entries carry only a date.
            event_date = datetime.datetime.strptime(
                row.xpath("string(date)"), "%d-%b-%Y")

        event_date = self._TZ.localize(event_date)

        event = Event(start_date=event_date, name=com, location_name=address)

        agenda_item = event.add_agenda_item(description=agenda)

        # ex: Business meeting to consider S.785, to improve mental...
        matches = re.findall(r"\s(\w+)\.(\d+),", agenda)

        if matches:
            match = matches[0]
            bill_type = match[0]
            bill_number = match[1]
            bill_name = "{} {}".format(bill_type, bill_number)
            agenda_item.add_bill(bill_name)

        event.add_participant(
            com,
            type="committee",
            note="host",
        )
        event.add_source(
            "https://www.senate.gov/committees/hearings_meetings.htm")

        yield event
def house_meeting(self, xml, source_url):
    # Parse one US House committee-meeting XML document into an Event.
    title = xml.xpath("string(//meeting-details/meeting-title)")

    meeting_date = xml.xpath("string(//meeting-date/calendar-date)")
    start_time = xml.xpath("string(//meeting-date/start-time)")
    end_time = xml.xpath("string(//meeting-date/end-time)")

    start_dt = datetime.datetime.strptime(
        "{} {}".format(meeting_date, start_time), "%Y-%m-%d %H:%M:%S")
    start_dt = self._TZ.localize(start_dt)

    # end-time is optional in the feed.
    end_dt = None
    if end_time != "":
        end_dt = datetime.datetime.strptime(
            "{} {}".format(meeting_date, end_time), "%Y-%m-%d %H:%M:%S")
        end_dt = self._TZ.localize(end_dt)

    building = xml.xpath(
        "string(//meeting-details/meeting-location/capitol-complex/building)"
    )

    # "Select one" is the feed's placeholder for an unset building.
    address = "US Capitol"
    if building != "Select one":
        if self.buildings.get(building):
            building = self.buildings.get(building)
        room = xml.xpath(
            "string(//meeting-details/meeting-location/capitol-complex/room)"
        )
        address = "{}, Room {}".format(building, room)

    event = Event(start_date=start_dt, name=title, location_name=address)
    event.add_source(source_url)

    coms = xml.xpath(
        "//committees/committee-name | //subcommittees/committee-name")
    for com in coms:
        com_name = com.xpath("string(.)")
        com_name = "House {}".format(com_name)
        event.add_participant(
            com_name,
            type="committee",
            note="host",
        )

    docs = xml.xpath("//meeting-documents/meeting-document")
    for doc in docs:
        doc_name = doc.xpath("string(description)")
        doc_files = doc.xpath("files/file")
        for doc_file in doc_files:
            media_type = self.media_types[doc_file.get("doc-type")]
            url = doc_file.get("doc-url")

            # BR/AM/CA documents (bill texts / amendments) double as agenda
            # items with the bill attached.
            if doc.get("type") in ["BR", "AM", "CA"]:
                if doc_name == "":
                    doc_name = doc.xpath("string(legis-num)").strip()
                matches = re.findall(r"([\w|\.]+)\s+(\d+)", doc_name)
                if matches:
                    match = matches[0]
                    bill_type = match[0].replace(".", "")
                    bill_number = match[1]
                    bill_name = "{} {}".format(bill_type, bill_number)
                    agenda = event.add_agenda_item(description=bill_name)
                    agenda.add_bill(bill_name)

            # Fall back to a generic name keyed by document type.
            if doc_name == "":
                try:
                    doc_name = self.hearing_document_types[doc.get("type")]
                except KeyError:
                    self.warning("Unable to find document type: {}".format(
                        doc.get("type")))

            event.add_document(doc_name,
                               url,
                               media_type=media_type,
                               on_duplicate="ignore")

    yield event
def scrape_event_page(self, url, event_type):
    # Parse a malegislature.gov hearing / special-event detail page.
    page = self.lxmlize(url)
    page.make_links_absolute("https://malegislature.gov/")

    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace("Hearing Details", "").strip()
    title = title.replace("Special Event Details", "")

    start_day = page.xpath(
        '//dl[contains(@class,"eventInformation")]/dd[2]/text()[last()]'
    )[0].strip()
    start_time = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()

    # If an event gets moved, ignore the original time
    start_time = re.sub(
        r"Original Start Time(.*)New Start Time(\n*)",
        "",
        start_time,
        flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
    )

    location = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[4]//a)'
    ).strip()

    # Fall back to the plain text when the location isn't a link.
    if location == "":
        location = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[4])'
        ).strip()

    description = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

    start_date = self._TZ.localize(
        dateutil.parser.parse("{} {}".format(start_day, start_time)))

    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description,
    )

    event.add_source(url)

    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')

    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath(
            "string(.//h4/a/text()[normalize-space()])").strip()

        if agenda_title == "":
            agenda_title = row.xpath(
                "string(.//h4/text()[normalize-space()])").strip()

        agenda = event.add_agenda_item(description=agenda_title)

        bills = row.xpath(".//tbody/tr/td[1]/a/text()")
        for bill in bills:
            bill = bill.strip().replace(".", " ")
            agenda.add_bill(bill)

    # Hearings are hosted by the committee named in the title.
    if event_type == "Hearing":
        event.add_participant(title, type="committee", note="host")

    yield event
def scrape_agenda(self, chamber, url):
    # Parse an RI committee agenda page and yield a single Event.
    page = self.lxmlize(url)

    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return

    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]

    # check for duration in time
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            # Keep the start plus the AM/PM marker from the end time.
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start

    fmts = [
        "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
    ]

    event_desc = "Meeting Notice"
    # "Rise of the Senate"-style times aren't parseable; use the date alone
    # and record the time in the description.
    if "Rise" in time:
        datetime = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime.upper() or "CANCELED" in datetime.upper():
        return

    if page.xpath("//span[@id='lblSession']"):
        event_desc = (page.xpath("//span[@id='lblSession']")
                      [0].text_content().strip())

    # Normalize AM/PM markers and strip status words before parsing.
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])

    datetime = datetime.strip()

    # Try each known format until one parses.
    # NOTE(review): if none match, `datetime` is still a str here and the
    # localize() below would raise — presumably the formats cover the feed;
    # confirm.
    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue

    event = Event(name=event_desc,
                  start_date=self._tz.localize(datetime),
                  location_name=where)
    event.add_document("Agenda",
                       url,
                       media_type="text/html",
                       on_duplicate="ignore")
    event.add_source(url)

    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(
            bill.text_content(),
            bill_ft,
            media_type="application/pdf",
            on_duplicate="ignore",
        )
        # The bill id is the joined text of the link's grandparent row.
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root).replace("\u00a0", "")
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = bill.getparent().getparent().text_content().replace(
            "\u00a0", " ")
        # `replace` is a module-level mapping of id cleanups.
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        item = event.add_agenda_item(descr)
        item.add_bill(bill_id)

    # sometimes bill references are just plain links or plain text.
    bill_links = page.xpath('//a[contains(@href,"/BillText/")]/@href')
    linked_bills = set()
    for bill_link in bill_links:
        bill_nums = re.findall(r"\/(\w+\d+)\.pdf",
                               bill_link,
                               flags=re.IGNORECASE)
        for bill_num in bill_nums:
            linked_bills.add(bill_num)

    # sometimes (H 1234) ends up in the title or somewhere else unlinked
    text_bill_nums = re.findall(r"\((\w{1,3}\s?\d+)\)",
                                page.text_content(),
                                flags=re.IGNORECASE)
    for bill_num in text_bill_nums:
        bill_num = bill_num.replace(" ", "")
        linked_bills.add(bill_num)

    if len(linked_bills) != 0:
        item = event.add_agenda_item("Bills under consideration")
        for bill in linked_bills:
            item.add_bill(bill)

    if page.xpath("//span[@id='lblSession']"):
        committee = page.xpath(
            "//span[@id='lblSession']")[0].text_content()
        event.add_participant(committee, "committee", note="host")

    yield event
def scrape_chamber(self, chamber):
    """Scrape committee hearing events for one chamber from the AZ legislature API.

    Fetches the committee list for the latest session, then each committee's
    agendas, and yields one Event per non-cancelled agenda row, including its
    agenda items and requests-to-speak participants.

    :param chamber: "upper"/"lower"-style key into self.chamber_codes
    :yields: Event objects
    """
    session = self.latest_session()
    session_id = session_metadata.session_id_meta_data[session]
    chamber_abbr = self.chamber_codes[chamber]
    com_url = (
        "https://apps.azleg.gov/api/Committee/?includeOnlyCommitteesWithAgendas=true"
        "&legislativeBody={}&sessionId={}&standingOnly=true&interimOnly=false&jointCommitteesOnly=false"
    )
    com_url = com_url.format(chamber_abbr, session_id)
    coms = self.get(com_url).json()
    for com in coms:
        # joint committees get returned by both endpoints, so skip one
        if com["LegislativeBody"] != chamber_abbr:
            continue
        # https://apps.azleg.gov/api/Agenda/?showPassed=true&sessionId=123
        # &isInterimAgenda=false&body=S&includeItems=false&committeeId=1960
        events_url = (
            "https://apps.azleg.gov/api/Agenda/?includeItems=true&showPassed=true"
            "&sessionId={}&isInterimAgenda=false&body={}&committeeId={}"
        )
        events_url = events_url.format(session_id, chamber_abbr, com["CommitteeId"])
        events_list = self.get(events_url).json()
        for row in events_list:
            if row["AgendaCanceled"] is True or "not meeting" in row["Time"].lower():
                continue
            title = "{} {}".format(self.code_chambers[chamber_abbr], row["CommitteeName"])
            # fix for dateutil parser confusion
            row["Time"] = row["Time"].replace("A.M.", "AM").replace("P.M.", "PM")
            if "upon rec" not in row["Time"].lower():
                time = re.findall(r"(\d+:\d+\s+[A|P]M)", row["Time"])
                if len(time) == 0:
                    self.warning(f"Unable to get time for {row['Time']} on {title}")
                    time = "00:00:00"
                else:
                    time = time[0]
                # BUG FIX: str.replace() treats its argument literally, so the
                # old `time.replace(r"\s+", " ")` never matched anything. Use a
                # regex substitution to actually collapse runs of whitespace.
                time = re.sub(r"\s+", " ", time)
            else:
                # "upon recess" etc. — no usable clock time; parse date only.
                time = ""
            when = dateutil.parser.parse(f"{row['Date']} {time}")
            when = self._tz.localize(when)
            where = "{}, Room {}".format(self.address, row["Room"])
            description = ""
            event = Event(
                name=title,
                location_name=where,
                start_date=when,
                description=description,
            )
            event.add_document("Agenda", row["HttpPath"], media_type="text/html")
            event.add_document("Agenda", row["HttpPdfPath"], media_type="application/pdf")
            event.add_participant(row["CommitteeName"], type="committee", note="host")
            for item in row["Items"]:
                agenda_item = event.add_agenda_item(item["Description"])
                bill_id = re.findall(r"^(.*?)\s", item["Description"])
                # ROBUSTNESS: a description with no whitespace yields no match;
                # previously bill_id[0] raised IndexError. Skip the bill link
                # instead of crashing the whole scrape.
                if bill_id:
                    agenda_item.add_bill(bill_id[0])
                for speaker in item["RequestsToSpeak"]:
                    speaker_title = speaker["Name"]
                    if speaker["Representing"] != "Self":
                        speaker_title = f"{speaker['Name']} ({speaker['Representing']})"
                    event.add_participant(speaker_title, type="person", note="speaker")
            event.add_source("https://apps.azleg.gov/BillStatus/AgendaSearch")
            yield event
def scrape_committee_page(self, url):
    """Yield committee-meeting Events parsed from an Indiana committee page.

    Each "agenda-item" accordion div becomes one Event; meetings whose time
    string mentions cancellation are skipped.
    """
    doc = lxml.html.fromstring(self.get(url, headers=self.cf_headers).content)
    doc.make_links_absolute(url)

    com = doc.xpath('//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()

    for agenda_div in doc.xpath('//div[contains(@id, "agenda-item")]'):
        heading = agenda_div.xpath(
            'div[contains(@class,"accordion-heading-agenda")]/a'
        )[0]
        date_text = heading.xpath("text()")[0].strip()

        # Heading span carries "time\nlocation".
        parts = heading.xpath("span/text()")[0].strip().split("\n")
        meeting_time = parts[0]
        loc = parts[1]
        if loc == "":
            loc = "See Agenda"

        com = com.replace("(S)", "Senate").replace("(H)", "House")

        # Indiana has a LOT of undefined times ("15 mins after adj. of
        # elections", etc.), so fall back to a date-only parse when the
        # combined string won't parse; users can consult the agenda PDF.
        try:
            when = dateutil.parser.parse(f"{date_text} {meeting_time}")
        except dateutil.parser._parser.ParserError:
            when = dateutil.parser.parse(date_text)
        when = self._tz.localize(when)

        if "cancelled" in meeting_time.lower():
            continue

        event = Event(
            name=com,
            start_date=when,
            location_name=loc,
            classification="committee-meeting",
        )
        event.add_source(url)
        event.add_participant(com, type="committee", note="host")

        if agenda_div.xpath('.//a[contains(text(), "View Agenda")]'):
            agenda_url = agenda_div.xpath(
                './/a[contains(text(), "View Agenda")]/@href'
            )[0]
            event.add_document("Agenda", agenda_url, media_type="application/pdf")

        if agenda_div.xpath('.//a[contains(text(), "Watch")]'):
            vid_url = agenda_div.xpath('.//a[contains(text(), "Watch")]/@href')[0]
            event.add_media_link("Video of Hearing", vid_url, media_type="text/html")

        if agenda_div.xpath('.//tr[contains(@class,"bill-container")]/td'):
            agenda = event.add_agenda_item("Bills under consideration")
            for bill_row in agenda_div.xpath(
                './/tr[contains(@class,"bill-container")]'
            ):
                agenda.add_bill(
                    bill_row.xpath(
                        ".//a[contains(@class,'bill-name-link')]/text()"
                    )[0]
                )

        yield event
def scrape_event_page(self, url, chamber):
    """Scrape a Michigan committee-meeting notice page into an Event.

    Builds a label->cell map from the meeting table, normalizes the very
    messy date/time text, and attaches the chair, host committee, and any
    bills referenced from the agenda cell.

    :param url: meeting notice URL (added as a source)
    :param chamber: chamber hint; "other" is rewritten to "joint"
    :yields: a single Event, or returns with nothing on empty/cancelled pages
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_mcommitteemeeting_MeetingTable']/tr")
    metainf = {}
    # Map row label -> {"txt": stripped text, "obj": the lxml cell element}.
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {"txt": val.text_content().strip(), "obj": val}
    if metainf == {}:
        return
    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf["Date"]["txt"],
        metainf["Time"]["txt"].replace(".", ""),
    )
    if "Cancelled" in datetime:
        return
    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM",  # another weird one
    }
    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])
    datetime = re.sub(r"\s+", " ", datetime)
    # Strip known free-text suffixes that follow the actual time.
    for text_to_remove in [
        "or after committees are given leave",
        "or later immediately after committees are given leave",
        "or later after committees are given leave by the House to meet",
        "**Please note time**",
    ]:
        datetime = datetime.split(text_to_remove)[0].strip()
    datetime = datetime.replace("p.m.", "pm")
    datetime = datetime.replace("Noon", "pm")
    # Try minute-precision first, then hour-only ("3 pm").
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")
    where = metainf["Location"]["txt"]
    title = metainf["Committee(s)"]["txt"]  # XXX: Find a better title
    if chamber == "other":
        chamber = "joint"
    event = Event(
        name=title,
        start_date=self._tz.localize(datetime),
        location_name=where,
    )
    event.add_source(url)
    event.add_source(mi_events)
    chair_name = metainf["Chair"]["txt"].strip()
    if chair_name:
        event.add_participant(chair_name, type="legislator", note="chair")
    else:
        self.warning("No chair found for event '{}'".format(title))
    event.add_participant(
        metainf["Committee(s)"]["txt"], type="committee", note="host"
    )
    agenda = metainf["Agenda"]["obj"]
    agendas = agenda.text_content().split("\r")
    # NOTE(review): "//a[...]" from an element searches the WHOLE document in
    # lxml, not just the agenda cell — confirm that is intended.
    related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # NOTE(review): if no agenda line mentions the bill, `description`
        # stays bound to the lxml element (not a string) and is passed to
        # add_agenda_item as-is — looks like a latent bug; confirm.
        description = agenda
        for a in agendas:
            if bill.text_content() in a:
                description = a
        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())
    yield event
def scrape_upper(self):
    """Scrape Missouri Senate hearing events from the hearings schedule page.

    The page has no per-event container, so the raw HTML is split on
    ``<hr />`` and each fragment is parsed independently.

    :yields: one Event per committee fragment that names a committee
    """
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        # Drop "or upon ... recess" style qualifiers before date parsing.
        when_time = re.sub("or upon .* recess", "", when_time)
        # fix for upon adjournment
        when_time = when_time.replace(
            "or upon morning adjournment whichever is later", ""
        ).strip()
        # 15/30/45 minutes/hours upon adjournment/recess
        when_time = re.sub(r"\d+ \w+ upon \w+", "", when_time, flags=re.IGNORECASE)
        # a.m. and p.m. seem to confuse dateutil.parser
        when_time = when_time.replace("A.M.", "AM").replace("P.M.", "PM")
        location = self.row_content(page, "Room:")
        location = "{}, {}".format(
            location, "201 W Capitol Ave, Jefferson City, MO 65101"
        )
        # Fragments without a linked committee name are not hearings; skip.
        if not page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        ):
            continue
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        # The link text is "<Committee>, Senator <Chair>" — keep the committee.
        com = com.split(", Senator")[0].strip()
        # Fall back to date-only parsing when the cleaned time is unparseable.
        try:
            start_date = dateutil.parser.parse(f"{when_date} {when_time}")
        except dateutil.parser._parser.ParserError:
            start_date = dateutil.parser.parse(when_date)
        # NOTE: this scraper uses `_TZ` (upper case) — presumably defined on
        # the class; confirm before renaming for consistency with siblings.
        start_date = self._TZ.localize(start_date)
        event = Event(start_date=start_date, name=com, location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")
        # Each 85%-width borderless table is one agenda entry; row 2 holds the
        # description when a bill link is present, otherwise row 1 does.
        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ""
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
        yield event
def scrape_chamber(self, chamber, session):
    """Scrape Iowa committee meetings for one chamber over a +/-10 day window.

    :param chamber: "upper" -> Senate ("S"), anything else -> House ("H")
    :param session: session identifier (unused in the visible body — TODO confirm)
    :yields: one Event per visible, non-cancelled meeting row
    """
    today = datetime.date.today()
    # Query window: ten days back through ten days ahead.
    start_date = today - datetime.timedelta(days=10)
    end_date = today + datetime.timedelta(days=10)
    if chamber == "upper":
        chamber_abbrev = "S"
    else:
        chamber_abbrev = "H"
    url = (
        "http://www.legis.iowa.gov/committees/meetings/meetingsList"
        "Chamber?chamber=%s&bDate=%02d/%02d/"
        "%d&eDate=%02d/%02d/%d"
        % (
            chamber_abbrev,
            start_date.month,
            start_date.day,
            start_date.year,
            end_date.month,
            end_date.day,
            end_date.year,
        )
    )
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    # Rows with class "hidden" are not real meetings; skip them in the xpath.
    for link in page.xpath(
        "//div[contains(@class, 'meetings')]/table[1]/"
        "tbody/tr[not(contains(@class, 'hidden'))]"
    ):
        comm = link.xpath("string(./td[2]/a[1]/text())").strip()
        desc = comm + " Committee Hearing"
        location = link.xpath("string(./td[3]/text())").strip()
        when = link.xpath("string(./td[1]/span[1]/text())").strip()
        if "cancelled" in when.lower() or "upon" in when.lower():
            continue
        if "To Be Determined" in when:
            continue
        # Truncate trailing notes after the AM/PM marker.
        if "AM" in when:
            when = when.split("AM")[0] + " AM"
        else:
            when = when.split("PM")[0] + " PM"
        junk = ["Reception"]
        for key in junk:
            when = when.replace(key, "")
        when = re.sub(r"\s+", " ", when).strip()
        if "tbd" in when.lower():
            # OK. This is a partial date of some sort.
            when = datetime.datetime.strptime(when, "%m/%d/%Y TIME - TBD %p")
        else:
            # Minute-precision first, then hour-only; otherwise log and skip.
            try:
                when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            except ValueError:
                try:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")
                except ValueError:
                    self.warning("error parsing timestamp %s", when)
                    continue
        event = Event(
            name=desc,
            description=desc,
            start_date=self._tz.localize(when),
            location_name=location,
        )
        event.add_source(url)
        event.add_participant(comm, note="host", type="committee")
        yield event
def lower_parse_page(self, url):
    """Parse the NY Assembly public-hearings listing into Events.

    Each ``pubhrgtbl`` table is one hearing. Row cells are accumulated into a
    label->value map; the hearing date and committee names are carried in
    ``th`` cells with special classes.

    :param url: listing page URL (added as a source)
    :yields: one Event per parseable hearing table
    """
    page = self.lxmlize(url)
    tables = page.xpath("//table[@class='pubhrgtbl']")
    # `date` deliberately persists across tables: a later table without its
    # own hrgdate cell reuses the previous one.
    date = None
    for table in tables:
        metainf = {}
        rows = table.xpath(".//tr")
        for row in rows:
            tds = row.xpath("./*")
            if len(tds) < 2:
                continue
            key, value = tds
            if key.tag == "th" and key.get("class") == "hrgdate":
                date = key.text_content()
                date = re.sub(r"\s+", " ", date)
                date = re.sub(".*POSTPONED NEW DATE", "", date).strip()
            # Due to the html structure this shouldn't be an elif
            # It needs to fire twice in the same loop iteration
            if value.tag == "th" and value.get("class") == "commtitle":
                coms = value.xpath('.//div[contains(@class,"comm-txt")]/text()')
            elif key.tag == "td":
                key = key.text_content().strip()
                value = value.text_content().strip()
                # \x96 is a Windows-1252 en-dash leaking into the text.
                value = value.replace(u"\x96", "-")
                value = re.sub(r"\s+", " ", value)
                metainf[key] = value
        time = metainf["Time:"]
        repl = {"A.M.": "AM", "P.M.": "PM"}
        drepl = {"Sept": "Sep"}
        for r in repl:
            time = time.replace(r, repl[r])
        for r in drepl:
            date = date.replace(r, drepl[r])
        # Keep only the start of a "1:00 - 3:00" range.
        time = re.sub("-.*", "", time)
        time = time.strip()
        # The page omits the year; assume the current one.
        year = dt.datetime.now().year
        date = "%s %s %s" % (date, year, time)
        if "tbd" in date.lower():
            continue
        date = date.replace(" PLEASE NOTE NEW TIME", "")
        # Check if the event has been postponed.
        postponed = "POSTPONED" in date
        if postponed:
            date = date.replace(" POSTPONED", "")
        date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
        datetime = None
        for fmt in date_formats:
            try:
                datetime = dt.datetime.strptime(date, fmt)
            except ValueError:
                pass
        # If the datetime can't be parsed, bail.
        # NOTE(review): this returns, abandoning any remaining tables —
        # confirm `continue` wasn't intended.
        if datetime is None:
            return
        # Exactly one of these labels names the event.
        title_key = set(metainf) & set([
            "Public Hearing:",
            "Summit:",
            "Roundtable:",
            "Public Roundtable:",
            "Public Meeting:",
            "Public Forum:",
            "Meeting:",
        ])
        assert len(title_key) == 1, "Couldn't determine event title."
        title_key = list(title_key).pop()
        title = metainf[title_key]
        title = re.sub(
            r"\*\*Click here to view public hearing notice\*\*", "", title
        )
        # If event was postponed, add a warning to the title.
        if postponed:
            title = "POSTPONED: %s" % title
        event = Event(
            name=title,
            start_date=self._tz.localize(datetime),
            location_name=metainf["Place:"],
        )
        event.extras = {"contact": metainf["Contact:"]}
        if "Media Contact:" in metainf:
            event.extras.update(media_contact=metainf["Media Contact:"])
        event.add_source(url)
        for com in coms:
            event.add_participant(com.strip(), type="committee", note="host")
            participant = event.participants[-1]
            participant["extras"] = (
                {"chamber": self.classify_committee(com)},
            )
        yield event
def scrape(self):
    """Scrape the Kentucky legislative calendar into committee-meeting Events.

    Walks each TimeAndLocation div, derives the date from the preceding
    DateHeading, the committee from the following CommitteeName, and attaches
    agenda bills plus committee documents/minutes for the meeting date.

    :yields: one Event per meeting that names a committee
    """
    url = "https://apps.legislature.ky.gov/legislativecalendar"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    for time_row in page.xpath('//div[contains(@class,"TimeAndLocation")]'):
        date = (
            time_row.xpath(
                'preceding-sibling::div[contains(@class,"DateHeading")][1]'
            )[0]
            .text_content()
            .strip()
        )
        status = "tentative"
        if time_row.xpath('div[contains(@class,"Cancelled")]'):
            status = "cancelled"
        row_text = time_row.text_content()
        row_text = row_text.replace("Noon", "PM")
        # upon recess (of House|Senate)
        row_text = re.sub(r"Upon Recess(\sof\s)?(House|Senate)?", "", row_text)
        # Everything before the first comma/AM/PM is the time; the rest is
        # the location.
        parts = re.split(r",|AM|PM", row_text)
        time = parts[0].strip()
        # BUG FIX: this previously used the RAW string r"\xa0", which replaces
        # the literal four characters backslash-x-a-0 and never the actual
        # non-breaking space present in the scraped text.
        location = " ".join(x.replace("\xa0", "").strip() for x in parts[1:])
        when = f"{date} {time}"
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)
        # Rows without a linked committee name aren't committee meetings.
        if not time_row.xpath(
            'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
        ):
            continue
        com_name = (
            time_row.xpath(
                'following-sibling::div[contains(@class,"CommitteeName")][1]/a'
            )[0]
            .text_content()
            .strip()
        )
        event = Event(
            name=com_name,
            start_date=when,
            classification="committee-meeting",
            location_name=location,
            status=status,
        )
        if time_row.xpath('following-sibling::div[contains(@class,"Agenda")][1]'):
            agenda_row = time_row.xpath(
                'following-sibling::div[contains(@class,"Agenda")][1]'
            )[0]
            agenda_text = agenda_row.text_content().strip()
            agenda = event.add_agenda_item(agenda_text)
            for bill_link in agenda_row.xpath('.//a[contains(@href,"/record/")]'):
                agenda.add_bill(bill_link.text_content().strip())
        event.add_participant(com_name, note="host", type="committee")
        com_page_link = time_row.xpath(
            'following-sibling::div[contains(@class,"CommitteeName")][1]/a/@href'
        )[0]
        # Attach meeting materials and minutes published on the committee page
        # for this meeting date.
        docs = self.scrape_com_docs(com_page_link)
        lookup_date = when.strftime("%Y-%m-%d")
        if lookup_date in docs["mats"]:
            for mat in docs["mats"][lookup_date]:
                event.add_document(mat["text"], mat["url"], on_duplicate="ignore")
        if lookup_date in docs["minutes"]:
            for mat in docs["minutes"][lookup_date]:
                event.add_document(mat["text"], mat["url"], on_duplicate="ignore")
        event.add_source(url)
        yield event
def scrape_house_weekly_schedule(self):
    """Scrape the Louisiana House weekly meeting schedule into Events.

    Only rows with a committee name, an agenda PDF link, and no
    "Not Meeting" marker are considered.

    :yields: one Event per valid meeting row
    """
    url = "https://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    valid_meetings = [
        row
        for row in meeting_rows
        if row.xpath("./td[1]")[0].text_content().replace("\xa0", "")
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and "Not Meeting" not in row.xpath("./td[2]")[0].text_content()
    ]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath(
                "./td/a[descendant::img[contains(@src,"
                '"PDF-AGENDA.png")]]/@href'
            )[0]
            # self.logger.debug(guid)
            self.warning("logger.debug" + guid)
        except KeyError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue
        committee_name = meeting.xpath("./td[1]/text()")[0].strip()
        meeting_string = meeting.xpath("./td[2]")[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        # Pad with Nones so a short split still unpacks into three parts.
        date, time, location = (
            [s.strip() for s in meeting_string.split(",") if s] + [None] * 3
        )[:3]
        # check for time in date because of missing comma
        time_srch = re.search(r"\d{2}:\d{2} (AM|PM)", date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, "")
        # self.logger.debug(location)
        self.warning("logger.debug" + location)
        # The page omits the year; assume the current one.
        year = datetime.datetime.now().year
        datetime_string = " ".join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, "%b %d %Y %I:%M %p")
        when = self._tz.localize(when)
        description = "Committee Meeting: {}".format(committee_name)
        # self.logger.debug(description)
        self.warning("logger.debug" + description)
        event = Event(
            name=description,
            # BUG FIX: `when` is already localized above; localizing it a
            # second time makes pytz raise
            # "ValueError: Not naive datetime (tzinfo is already set)".
            start_date=when,
            location_name=location,
        )
        event.add_source(url)
        event.add_participant(committee_name, type="committee", note="host")
        event.add_document(
            note="Agenda", url=guid, text="agenda", media_type="application/pdf"
        )
        yield event
def scrape_meeting(self, url):
    """Scrape a single meeting notice page into an Event.

    Normalizes the time text, marks unscheduled ("UPON ADJ"/"TBA") meetings
    as all-day, and only yields meetings whose title names a chamber.

    :param url: meeting page URL (added as a source)
    :yields: a single Event, or returns with nothing for non-chamber titles
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]
    # Normalize AM/PM spellings ("Noon" counts as PM).
    substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)
    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r"(?i)\d[AP]M$", time):
        time = time[:-2] + " " + time[-2:]
    if re.search("UPON ADJ|TBA", " ".join(time.split()).upper()):
        # No concrete start time: treat as an all-day event on the date.
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = dateutil.parser.parse(f"{date} {time}".strip())
    # when = self._tz.localize(when)
    description = "Meeting on %s of the %s" % (date, title)
    chambers = {
        "house": "lower",
        "senate": "upper",
        "joint": "legislature",
    }
    # for/else: bail out unless the title mentions a known chamber.
    for chamber_ in chambers:
        if chamber_ in title.lower():
            break
    else:
        return
    event = Event(
        name=description,
        start_date=self._tz.localize(when),
        location_name=location,
        all_day=all_day,
    )
    event.add_source(url)
    event.add_participant(title, note="host", type="committee")
    # Skip the header row, then read bill/description cells from each row.
    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            # Row doesn't have exactly five cells; not an agenda row.
            continue
        bill_title = bill.text_content()
        # Crude bill-id check: Senate/House prefixes contain S or H.
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue
    yield event
def scrape_chamber(self, chamber):
    """Scrape Idaho committee agendas for one chamber into Events.

    :param chamber: "upper" (Senate agendas) or "lower" (House agendas)
    :yields: one Event per calendar entry that is actually meeting
    """
    if chamber == "upper":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/sagenda/"
    elif chamber == "lower":
        url = "https://legislature.idaho.gov/sessioninfo/agenda/hagenda/"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    for row in page.xpath('//div[@id="ai1ec-container"]/div'):
        # Calendar header carries month, day-of-month, and a time/location h2.
        month = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/text()"
        )[0].strip()
        day = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'date')]/span/text()"
        )[0].strip()
        time_and_loc = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'abbr')]/h2/text()"
        )
        time = time_and_loc[0].strip()
        loc = time_and_loc[1].strip()
        if "not meet" in time.lower():
            continue
        # Fall back to date-only parsing for unparseable time text.
        try:
            start = dateutil.parser.parse(f"{month} {day} {time}")
        except dateutil.parser._parser.ParserError:
            start = dateutil.parser.parse(f"{month} {day}")
        start = self._tz.localize(start)
        com = row.xpath(
            ".//div[contains(@class,'calendarHeader')]/div[contains(@class,'day')]/h2/a/text()"
        )[0].strip()
        event = Event(
            name=com,
            start_date=start,
            location_name=loc,
            classification="committee-meeting",
        )
        event.add_participant(com, type="committee", note="host")
        agenda_url = row.xpath('.//a[contains(text(), "Full Agenda")]/@href')[0]
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        # First table row is a header; skip it.
        agenda_rows = row.xpath(
            './/div[contains(@class,"card")]/div[contains(@id, "Agenda")]/div/table/tbody/tr'
        )[1:]
        for agenda_row in agenda_rows:
            subject = agenda_row.xpath("string(td[1])").strip()
            description = agenda_row.xpath("string(td[2])").strip()
            presenter = agenda_row.xpath("string(td[3])").strip()
            if presenter != "":
                agenda_text = (
                    f"{subject} {description} Presenter: {presenter}".strip()
                )
                # NOTE(review): the full agenda line (not just the presenter
                # name) is added as a person participant — confirm intended.
                event.add_participant(agenda_text, type="person", note="Presenter")
            else:
                agenda_text = f"{subject} {description}".strip()
            agenda = event.add_agenda_item(agenda_text)
            if agenda_row.xpath('td[1]/a[contains(@href,"/legislation/")]'):
                agenda.add_bill(
                    agenda_row.xpath(
                        'td[1]/a[contains(@href,"/legislation/")]/text()'
                    )[0].strip()
                )
        event.add_source(url)
        yield event
def scrape_lower(self):
    """Scrape the Virginia House meeting schedule into Events.

    Dates/locations come from the "Add to Calendar" link's query string;
    chair names in parentheticals are pulled out of the title and added as
    person participants.

    :yields: one Event per non-cancelled schedule row
    """
    list_url = (
        "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
    )
    page = self.get(list_url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(list_url)
    for row in page.xpath("//table[contains(@class, 'CODayTable')]/tbody/tr"):
        # TODO: it would be nice to go back in and update the record to mark it as cancelled,
        # but since there's no ics link it makes the day logic way more complicated
        if row.xpath(".//span[contains(@class, 'COCancelled')]"):
            continue
        # fallback for unlinked events
        source = (
            "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php"
        )
        if row.xpath(".//a[1]/text()"):
            title = row.xpath(".//a[1]/text()")[0].strip()
            source = row.xpath(".//a[1]/@href")[0]
            event_type = "committee-meeting"
        else:
            # skip unlinked misc events
            if row.xpath("td[contains(@class, 'COCommType')]/text()"):
                title = row.xpath("td[contains(@class, 'COCommType')]/text()")[
                    0
                ].strip()
                event_type = "other"
            else:
                continue
        # Date and location live in the calendar link's query params.
        date_link = row.xpath(".//a[@title='Add to Calendar']/@href")[0]
        parsed = parse.parse_qs(parse.urlparse(date_link).query)
        date_raw = parsed["dt"][0]
        location = parsed["loc"][0]
        start = dateutil.parser.parse(date_raw, tzinfos=self.tzinfos)
        # If there's a chair in parentheticals, remove them from the title
        # and add as a person instead
        chair_note = re.findall(r"\(.*\)", title)
        chair = None
        for chair_str in chair_note:
            title = title.replace(chair_str, "").strip()
            # drop the outer parens
            chair = chair_str[1:-1]
        event = Event(
            name=title,
            start_date=start,
            location_name=location,
            classification=event_type,
        )
        event.add_source(source)
        if chair is not None:
            event.add_participant(chair, type="person", note="chair")
        if event_type == "committee-meeting":
            event.add_participant(title, type="committee", note="host")
        if row.xpath(".//a[contains(@class,'COAgendaLink')]"):
            agenda_url = row.xpath(".//a[contains(@class,'COAgendaLink')]/@href")[0]
            event.add_document("Agenda", agenda_url, media_type="text/html")
            # Pull agenda items/bills from the linked agenda page.
            self.scrape_lower_agenda(event, agenda_url)
        yield event