def scrape(self, start=None):
    """Scrape Georgia legislative meeting events from the legis.ga.gov API.

    start: optional "YYYY-MM-DD" string; defaults to today's date.
    Yields Event objects, one per meeting row returned by the API.
    """
    if start is None:
        start = datetime.datetime.today()
    else:
        start = datetime.datetime.strptime(start, "%Y-%m-%d")
    # slug sent to the API looks like "Mon Jan 02 2023"
    date_format = "%a %b %d %Y"
    date_slug = start.strftime(date_format)
    url = f"https://www.legis.ga.gov/api/meetings?startDate={date_slug}"
    page = self.get(url).json()
    for row in page:
        status = "tentative"
        title = row["subject"]
        # prefix the chamber name, except for joint meetings
        # (presumably chamber 2 == Senate, 1 == House per the API -- TODO confirm)
        if "joint" not in title.lower():
            if row["chamber"] == 2:
                title = f"Senate {title}"
            elif row["chamber"] == 1:
                title = f"House {title}"
        # NOTE: this rebinds the name `start` (the method argument) to this
        # row's parsed datetime; the original start date is not used again.
        start = dateutil.parser.parse(row["start"])
        # NOTE(review): if row["start"] carries no UTC offset, dateutil returns a
        # naive datetime and comparing it with the tz-aware "now" below would
        # raise TypeError -- confirm the API always includes an offset.
        if start < self.tz.localize(datetime.datetime.now()):
            status = "passed"
        if "cancelled" in title.lower() or "canceled" in title.lower():
            status = "cancelled"
            # try to replace all variants of "[optional dash] cancel[l]ed [optional dash]"
            # so we can match up events to their pre-cancellation occurrence
            title = re.sub(r"-?\s*cancell?ed\s*-?\s*", " ", title, flags=re.I)
        where = row["location"]
        where = f"206 Washington St SW, Atlanta, Georgia, {where}"
        event = Event(
            name=title,
            start_date=start,
            location_name=where,
            classification="committee-meeting",
            status=status,
        )
        # agendaUri is an empty string (not None) when absent
        if row["agendaUri"] != "":
            event.add_document(
                "Agenda", row["agendaUri"], media_type="application/pdf"
            )
        if row["livestreamUrl"] is not None:
            event.add_media_link(
                "Video", row["livestreamUrl"], media_type="text/html"
            )
        event.add_source("https://www.legis.ga.gov/schedule/all")
        yield event
def scrape_upper_com(self, url, com, session):
    """Scrape one Senate committee's meeting list page.

    url: base committee-meetings URL (session id is appended).
    com: committee name (will be prefixed with "Senate").
    session: session identifier appended to the URL.
    Yields one Event per scheduled (non-cancelled) meeting row.

    Fixes vs. previous version:
    - document lookups used //a[...] which lxml evaluates from the DOCUMENT
      root, so every row attached the first page-wide match; now .//a[...]
      searches within the current row only.
    - the loop variable `url` was clobbered by doc/media hrefs, so
      event.add_source() recorded the last href instead of the page URL.
    """
    url = f"{url}{session}"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    com = f"Senate {com}"
    for row in page.xpath('//table[@id="meetingsTbl"]/tbody/tr'):
        day = row.xpath("td[1]")[0].text_content().strip()
        time = row.xpath("td[2]")[0].text_content().strip()
        notice = row.xpath("td[3]")[0].text_content().strip()
        location = "See Agenda"  # it's in the PDFs but not the web page
        date = dateutil.parser.parse(f"{day} {time}")
        date = self.tz.localize(date)
        if notice.lower() == "not meeting" or "cancelled" in notice.lower():
            continue
        event = Event(name=com, start_date=date, location_name=location)
        agenda_classes = [
            "mtgrecord_notice",
            "mtgrecord_expandedAgenda",
            "mtgrecord_attendance",
        ]
        for agenda_class in agenda_classes:
            # relative search (.//) keeps the match inside this row
            links = row.xpath(f".//a[@class='{agenda_class}']")
            if links:
                doc_url = links[0].xpath("@href")[0]
                doc_name = links[0].text_content().strip()
                event.add_document(doc_name, doc_url, media_type="application/pdf")
        # column 7: audio recordings
        for link in row.xpath("td[7]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "audio/mpeg")
        # column 9: video/web links
        for link in row.xpath("td[9]/a"):
            media_url = link.xpath("@href")[0]
            doc_name = link.text_content().strip()
            event.add_media_link(doc_name, media_url, "text/html")
        # source the committee schedule page itself, not a document href
        event.add_source(url)
        yield event
def parse_div(self, row, chamber, com):
    """Build one Event from a Maryland hearing-schedule row.

    Title/location/times come from the row's linked Google Calendar entry
    via self.parse_gcal. `chamber` is accepted but not used here.
    """
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)
    event = Event(start_date=start_date, end_date=end_date, name=title,
                  location_name=location)
    event.add_source(
        "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx")
    # plain agenda lines
    for node in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=node.xpath("string(.)").strip())
    # agenda lines that link to a PDF document
    for node in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        text = node.xpath("string(.)").strip()
        event.add_agenda_item(description=text)
        event.add_document(
            text,
            node.xpath("@href")[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )
    # agenda lines tied to a specific bill
    for node in row.xpath('.//div[contains(@class,"ItemContainer")]'
                          '[./div[@class="col-xs-1 Item"]]'):
        item = event.add_agenda_item(description=node.xpath("string(.)").strip())
        item.add_bill(
            node.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip())
    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link("Video of Hearing", video[0].xpath("@href")[0],
                             "text/html")
    # subcommittee hearings are hosted by the subcommittee named in the title
    if "subcommittee" in title.lower():
        host = title.split("-")[0].strip()
    else:
        host = com
    event.add_participant(host, type="committee", note="host")
    yield event
def scrape_committee_page(self, url):
    """Scrape one Indiana committee page's agenda items into Events.

    url: committee page URL (fetched with self.cf_headers, presumably to get
    past Cloudflare -- TODO confirm). Yields one Event per non-cancelled
    agenda item.
    """
    page = self.get(url, headers=self.cf_headers).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # committee name from the page heading
    com = page.xpath(
        '//div[contains(@class, "pull-left span8")]/h1/text()')[0].strip()
    for row in page.xpath('//div[contains(@id, "agenda-item")]'):
        # status = "tentative"
        meta = row.xpath(
            'div[contains(@class,"accordion-heading-agenda")]/a')[0]
        date = meta.xpath("text()")[0].strip()
        # the span holds "time\nlocation"
        # NOTE(review): assumes the span always contains a newline; a
        # one-part value would raise IndexError below -- confirm page markup.
        time_and_loc = meta.xpath("span/text()")[0].strip()
        time_and_loc = time_and_loc.split("\n")
        time = time_and_loc[0]
        loc = time_and_loc[1]
        if loc == "":
            loc = "See Agenda"
        # expand the chamber abbreviations used in headings
        com = com.replace("(S)", "Senate").replace("(H)", "House")
        # Indiana has a LOT of undefined times, stuff like "15 mins after adj. of elections"
        # so just remove the time component if it won't parse, and the user can go to the agenda
        try:
            when = dateutil.parser.parse(f"{date} {time}")
        except dateutil.parser._parser.ParserError:
            when = dateutil.parser.parse(date)
        when = self._tz.localize(when)
        # cancellation is flagged in the time string
        if "cancelled" in time.lower():
            continue
        event = Event(
            name=com,
            start_date=when,
            location_name=loc,
            classification="committee-meeting",
        )
        event.add_source(url)
        event.add_participant(com, type="committee", note="host")
        if row.xpath('.//a[contains(text(), "View Agenda")]'):
            agenda_url = row.xpath(
                './/a[contains(text(), "View Agenda")]/@href')[0]
            event.add_document("Agenda", agenda_url,
                               media_type="application/pdf")
        if row.xpath('.//a[contains(text(), "Watch")]'):
            vid_url = row.xpath('.//a[contains(text(), "Watch")]/@href')[0]
            event.add_media_link("Video of Hearing", vid_url,
                                 media_type="text/html")
        # bill rows: all bills go under a single agenda item
        if row.xpath('.//tr[contains(@class,"bill-container")]/td'):
            agenda = event.add_agenda_item("Bills under consideration")
            for bill_row in row.xpath(
                    './/tr[contains(@class,"bill-container")]'):
                bill_id = bill_row.xpath(
                    ".//a[contains(@class,'bill-name-link')]/text()")[0]
                agenda.add_bill(bill_id)
        yield event
def scrape_upper(self):
    """Scrape upcoming Minnesota Senate hearings from the schedule API.

    Yields one Event per hearing; attaches bills found in the hearing notes,
    source links, agenda items with their bills/files, and media links.

    Fix vs. previous version: the hearing-notes bill loop called
    event.add_bill(description) -- adding the entire notes string for every
    detected bill id -- instead of event.add_bill(bill).
    """
    url = "https://www.senate.mn/api/schedule/upcoming"
    data = self.get(url).json()
    for row in data["events"]:
        com = row["committee"]["committee_name"]
        start = dateutil.parser.parse(row["hearing_start"])
        start = self._tz.localize(start)
        # prefer "building room", then building alone, then TBD
        if (row["hearing_room"] and "hearing_building" in row
                and row["hearing_building"]):
            where = f"{row['hearing_building']} {row['hearing_room']}"
        elif "hearing_building" in row and row["hearing_building"]:
            where = row["hearing_building"]
        else:
            where = "TBD"
        description = ""
        if "hearing_notes" in row and row["hearing_notes"]:
            description = row["hearing_notes"]
        event = Event(
            name=com,
            location_name=where,
            start_date=start,
            classification="committee-meeting",
            description=description,
        )
        # attach each bill id mentioned in the hearing notes
        for bill in get_bill_ids(description):
            event.add_bill(bill)
        if "lrl_schedule_link" in row:
            event.add_source(row["lrl_schedule_link"])
        else:
            # fall back to the committee link, normalizing relative URLs
            if "link" in row["committee"]:
                if row["committee"]["link"].startswith("http"):
                    event.add_source(row["committee"]["link"])
                elif row["committee"]["link"].startswith("www"):
                    event.add_source(f"http://{row['committee']['link']}")
                else:
                    event.add_source(
                        f"https://www.senate.mn/{row['committee']['link']}"
                    )
            elif "senate_chair_link" in row["committee"]:
                event.add_source(
                    f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                )
        if "agenda" in row:
            for agenda_row in row["agenda"]:
                if (agenda_row["description"] is None
                        or agenda_row["description"].strip() == ""):
                    # sometimes they have blank agendas but bills or files
                    agenda_row["description"] = "Agenda"
                agenda = event.add_agenda_item(agenda_row["description"])
                if "bill_type" in agenda_row:
                    agenda.add_bill("{} {}".format(
                        agenda_row["bill_type"].replace(".", ""),
                        agenda_row["bill_number"],
                    ))
                if "files" in agenda_row:
                    for file_row in agenda_row["files"]:
                        doc_name = file_row["filename"]
                        doc_url = file_row["file_path"]
                        # if they don't provide a name just use the filename
                        if doc_name == "":
                            parsed_url = urlparse(doc_url)
                            doc_name = os.path.basename(parsed_url.path)
                        event.add_document(
                            doc_name,
                            f"https://www.senate.mn/{doc_url}",
                            media_type="text/html",
                            on_duplicate="ignore",
                        )
        if "video_link" in row:
            event.add_media_link("Video", row["video_link"], "text/html")
        if "audio_link" in row:
            event.add_media_link("Audio", row["audio_link"], "text/html")
        yield event
def scrape(self, start=None, end=None):
    """Scrape Arkansas committee meetings between start and end.

    start, end: optional "YYYY-MM-DD" strings; start defaults to today and
    end defaults to 90 days out. Yields one Event per meeting row.

    Fixes vs. previous version:
    - passing a non-None start (or end) left start_date/end_date unbound,
      raising NameError; both are now formatted from the caller's value.
    - the time regex character class [A|P] also matched a literal "|";
      tightened to [AP] (same matches on real input).
    """
    if start is None:
        start_date = datetime.datetime.now().strftime(self.date_format)
    else:
        start_date = datetime.datetime.strptime(
            start, "%Y-%m-%d").strftime(self.date_format)
    # default to 90 days if no end
    if end is None:
        dtdelta = datetime.timedelta(days=90)
        end_date = datetime.datetime.now() + dtdelta
        end_date = end_date.strftime(self.date_format)
    else:
        end_date = datetime.datetime.strptime(
            end, "%Y-%m-%d").strftime(self.date_format)
    url = f"https://www.arkleg.state.ar.us/Calendars/Meetings?tbType=&meetingStartDate={start_date}&meetingEndDate={end_date}"
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # the listing alternates date-header rows and meeting rows;
    # meeting rows inherit the most recent header's date
    day = None
    for row in page.xpath(
            "//div[@id='meetingBodyWrapper']/div[contains(@class,'row')]"):
        row_class = row.xpath("@class")[0]
        if "tableSectionHeader" in row_class:
            day = row.xpath("div/text()")[0].strip()
            continue
        time = row.xpath(
            "div[contains(@class,'timeRow')]/b/text()")[0].strip()
        if "no meeting" in time.lower() or "cancelled" in time.lower():
            continue
        if "upon adjournment" in time.lower():
            time = "1:00 PM"
        title = row.xpath("div[2]/b")[0].text_content().strip()
        if "call of the chair" in time.lower():
            time = ""
        else:
            times = re.findall(r"\d+:\d+\s*[AP]M", time)
            time = times[0]
        when = dateutil.parser.parse(f"{day} {time}")
        when = self._tz.localize(when)
        location = row.xpath("div[2]/text()")[1].strip()
        event = Event(
            name=title,
            start_date=when,
            location_name=location,
            description="",
        )
        event.add_source(
            "https://www.arkleg.state.ar.us/Calendars/Meetings")
        if row.xpath(".//a[@aria-label='Agenda']"):
            agenda_url = row.xpath(".//a[@aria-label='Agenda']/@href")[0]
            event.add_document("Agenda", agenda_url,
                               media_type="application/pdf")
        if row.xpath(".//a[@aria-label='Play Video']"):
            video_url = row.xpath(
                ".//a[@aria-label='Play Video']/@href")[0]
            event.add_media_link("Video of Hearing", video_url,
                                 media_type="text/html")
        if row.xpath(".//a[@aria-label='Referred']"):
            bill_url = row.xpath(".//a[@aria-label='Referred']/@href")[0]
            self.scrape_referred_bills(event, bill_url)
        yield event
def scrape(self, chamber=None):
    """Scrape Utah committee meetings for every month of the current year.

    chamber is accepted for interface compatibility but unused. Yields one
    Event per calendar entry of type "meeting", with agenda/minutes/media
    documents and agenda items pulled from the meeting-materials endpoint.
    """
    url = "https://le.utah.gov/CalServ/CalServ?month={}&year={}"
    year = datetime.datetime.today().year
    # months are passed 0-11, so the API is presumably zero-indexed
    # (matches the +1 correction when parsing day_row['month'] below)
    for i in range(0, 12):
        page = self.get(url.format(i, year)).json()
        if "days" in page:
            for day_row in page["days"]:
                for row in day_row["events"]:
                    # ignore 'note', 'housefloor', 'senatefloor'
                    if row["type"] == "meeting":
                        status = "tentative"
                        title = row["desc"]
                        where = row["location"]
                        # month comes back zero-indexed; +1 for a real date
                        when = dateutil.parser.parse(
                            f"{day_row['year']}-{str(int(day_row['month'])+1)}-{day_row['day']} {row['time']}"
                        )
                        when = self._tz.localize(when)
                        # "C" marks a cancelled meeting
                        if "status" in row and row["status"] == "C":
                            status = "cancelled"
                        event = Event(
                            name=title,
                            location_name=where,
                            start_date=when,
                            classification="committee-meeting",
                            status=status,
                        )
                        if "agenda" in row:
                            event.add_document(
                                "Agenda",
                                f"{self.base_url}{row['agenda']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )
                        if "minutes" in row:
                            event.add_document(
                                "Minutes",
                                f"{self.base_url}{row['minutes']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )
                        if "mediaurl" in row:
                            event.add_media_link(
                                "Media",
                                f"{self.base_url}{row['mediaurl']}",
                                media_type="text/html",
                                on_duplicate="ignore",
                            )
                            # a mtgID in the media URL gives us a meeting id
                            # we can use to fetch meeting materials
                            if re.findall(r"mtgID=(\d+)", row["mediaurl"]):
                                hearing_id = re.findall(
                                    r"mtgID=(\d+)", row["mediaurl"])[0]
                                docs_url = f"https://glen.le.utah.gov/committees/meeting/{hearing_id}/1234"
                                docs_page = self.get(docs_url).json()
                                if "meetingMaterials" in docs_page:
                                    for mat in docs_page[
                                            "meetingMaterials"]:
                                        agenda = event.add_agenda_item(
                                            mat["description"])
                                        event.add_document(
                                            mat["description"],
                                            f"{self.base_url}{mat['docUrl']}",
                                            media_type="application/pdf",
                                            on_duplicate="ignore",
                                        )
                                        # bill ids like "HB0123" embedded in
                                        # the material description
                                        for bill_row in re.findall(
                                                r"(\w{2,3}\d{4})",
                                                mat["description"]):
                                            agenda.add_bill(bill_row)
                                # NOTE: The following data appears to be duped on the meetingMaterials endpoint
                                # but leaving this in place commented out, in case that ever changes.
                                #
                                # # rather than return an empty object this page just times out if there are no bills
                                # # so don't retry, and pass on failure
                                # bills_url = f"https://glen.le.utah.gov/agencal/{hearing_id}/1234"
                                # self.retry_attempts = 0
                                # try:
                                #     bills_page = self.get(bills_url, timeout=3).json()
                                #     if 'agendaitems' in bills_page:
                                #         for bill_row in bills_page['agendaitems']:
                                #             agenda = event.add_agenda_item(bill_row['description'])
                                #             if 'bill' in bill_row:
                                #                 agenda.add_bill(bill_row['bill'])
                                #                 print(bill_row)
                                # except requests.exceptions.ReadTimeout:
                                #     pass
                                # then reset the retry attempts to normal for other requests
                                self.retry_attempts = 3
                        source_url = f"{self.base_url}{row['itemurl']}"
                        event.add_source(source_url)
                        yield event
def scrape(self):
    """Scrape Wyoming committee meetings for the current month plus two.

    Pulls the monthly calendar JSON, keeps only committee meetings
    (meetingKind == 2), and yields one Event per meeting with its media,
    documents, agenda items, and bills attached.
    """
    today = datetime.datetime.today()
    url = "https://web.wyoleg.gov/LsoService/api/Calendar/Events/{}{}01"
    # this month and the next 2 months
    for month_offset in [0, 1, 2]:
        target = today + relativedelta.relativedelta(months=month_offset)
        year_str = str(target.year)
        month_str = str(target.month).zfill(2)
        page = self.get(url.format(year_str, month_str)).json()
        for row in page:
            # only kind 2 is a committee meeting
            if row["meetingKind"] != 2:
                continue
            com = f"{row['meetingType']} {row['committee']['fullName']}"
            # skip state holidays or other non-committee hearings
            if com.strip() == "":
                continue
            begins = self._tz.localize(parser.parse(row["startDate"]))
            finishes = self._tz.localize(parser.parse(row["endTime"]))
            venue = row["address1"]
            if venue == "":
                venue = "TBD"
            event = Event(
                name=com,
                location_name=venue,
                start_date=begins,
                end_date=finishes,
                classification="committee-meeting",
                description=row["purpose"],
            )
            for media in row["meetingMedias"]:
                # all these i've seen say they're octet stream but are actually youtube links
                event.add_media_link(
                    media["documentType"],
                    media["filePath"],
                    "text/html",
                    on_duplicate="ignore",
                )
            for doc in row["meetingDocuments"]:
                event.add_document(
                    doc["title"],
                    f"{self.base_url}{doc['documentUrl']}",
                    on_duplicate="ignore",
                )
            for agenda_entry in row["meetingAgendas"]:
                self.parse_agenda_item(event, agenda_entry)
            # lazily create a single agenda item to hold all bills
            bills_agenda_item = None
            for bill in row["sessionMeetingBills"]:
                if bills_agenda_item is None:
                    bills_agenda_item = event.add_agenda_item(
                        "Bills under Consideration")
                bills_agenda_item.add_bill(bill["billNumber"])
            web_url = "https://www.wyoleg.gov/Calendar/{year}{month}01/Meeting?type=committee&id={meeting_id}"
            web_url = web_url.format(
                year=year_str,
                month=month_str,
                meeting_id=row["id"],
            )
            event.add_source(web_url)
            yield event
def scrape_chamber(self, chamber, session):
    """Scrape Iowa committee meetings for one chamber, +/- 10 days of today.

    chamber: "upper" or "lower"; session is accepted but unused here.
    Yields one Event per meeting row; cancelled meetings are kept with
    status "cancelled", but "To Be Determined" rows are skipped entirely.
    """
    today = datetime.date.today()
    start_date = today - datetime.timedelta(days=10)
    end_date = today + datetime.timedelta(days=10)
    if chamber == "upper":
        chamber_abbrev = "S"
    else:
        chamber_abbrev = "H"
    url = (
        "http://www.legis.iowa.gov/committees/meetings/meetingsList"
        "Chamber?chamber=%s&bDate=%02d/%02d/"
        "%d&eDate=%02d/%02d/%d"
        % (
            chamber_abbrev,
            start_date.month,
            start_date.day,
            start_date.year,
            end_date.month,
            end_date.day,
            end_date.year,
        )
    )
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    for link in page.xpath(
        "//div[contains(@class, 'meetings')]/table[1]/"
        "tbody/tr[not(contains(@class, 'hidden'))]"
    ):
        comm = None
        desc = None
        pretty_name = None
        status = "tentative"
        # each field has a <span> variant and a plain-text variant
        comm = link.xpath("string(./td[2]/a[1]/span/text())").strip()
        if comm == "":
            comm = link.xpath("string(./td[2]/a[1]/text())").strip()
        desc = comm + " Committee Hearing"
        location = link.xpath("string(./td[3]/span/text())").strip()
        if location == "":
            location = link.xpath("string(./td[3]/text())").strip()
        when = link.xpath("string(./td[1]/span[1]/text())").strip()
        if when == "":
            when = link.xpath("string(./td[1]/text())").strip()
        # "upon adjournment"-style times are treated as cancelled here
        if "cancelled" in when.lower() or "upon" in when.lower():
            status = "cancelled"
        if "To Be Determined" in when:
            continue
        # sometimes they say cancelled, sometimes they do a red strikethrough
        if link.xpath("./td[1]/span[contains(@style,'line-through')]"):
            status = "cancelled"
        if "cancelled" in link.xpath("@class")[0]:
            status = "cancelled"
        # strip decorative words that break date parsing
        junk = ["Reception"]
        for key in junk:
            when = when.replace(key, "")
        pretty_name = f"{self.chambers[chamber]} {desc}"
        when = re.sub(r"\s+", " ", when).strip()
        if "tbd" in when.lower():
            # OK. This is a partial date of some sort.
            # NOTE(review): format string literally expects
            # "MM/DD/YYYY TIME - TBD AM/PM"; anything else raises ValueError
            # uncaught here -- confirm against live page data.
            when = datetime.datetime.strptime(when, "%m/%d/%Y TIME - TBD %p")
        else:
            # try full time, then hour-only, then give up with a warning
            try:
                when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            except ValueError:
                try:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")
                except ValueError:
                    self.warning(f"error parsing timestamp {when} on {pretty_name}")
                    continue
    
        event = Event(
            name=pretty_name,
            description=desc,
            start_date=self._tz.localize(when),
            location_name=location,
            status=status,
        )
        if link.xpath("td[4]/span/a"):
            video_link = link.xpath("td[4]/span/a/@href")[0]
            event.add_media_link("Video of Hearing", video_link, "text/html")
        # agenda lives in the next sibling row of the table
        if status != "cancelled" and link.xpath('.//a[contains(text(),"Agenda")]'):
            agenda_rows = link.xpath(
                'following-sibling::tr[1]/td/div[contains(@class,"agenda")]/p'
            )
            for agenda_row in agenda_rows:
                agenda_text = agenda_row.xpath("string(.)")
                if agenda_text.strip() != "":
                    agenda = event.add_agenda_item(agenda_text)
                    for bill_row in agenda_row.xpath(
                        './/a[contains(@href, "/BillBook")]/text()'
                    ):
                        agenda.add_bill(bill_row)
        event.add_source(url)
        event.add_participant(comm, note="host", type="committee")
        yield event