def scrape_upper(self):
    """Scrape upcoming MN Senate committee hearings from the Senate JSON API.

    Fetches the upcoming-schedule feed and yields one Event per hearing,
    attaching location, agenda items, bills, documents, and media links.

    Yields:
        Event: one event per row in the API's ``events`` list.
    """
    url = "https://www.senate.mn/api/schedule/upcoming"
    data = self.get(url).json()

    for row in data["events"]:
        com = row["committee"]["committee_name"]
        start = dateutil.parser.parse(row["hearing_start"])
        start = self._tz.localize(start)

        # Location: prefer "building room", then building alone, else TBD.
        if (
            row["hearing_room"]
            and "hearing_building" in row
            and row["hearing_building"]
        ):
            where = f"{row['hearing_building']} {row['hearing_room']}"
        elif "hearing_building" in row and row["hearing_building"]:
            where = row["hearing_building"]
        else:
            where = "TBD"

        description = ""
        if "hearing_notes" in row and row["hearing_notes"]:
            description = row["hearing_notes"]

        event = Event(
            name=com,
            location_name=where,
            start_date=start,
            classification="committee-meeting",
            description=description,
        )

        # FIX: add each extracted bill id; the original added the whole
        # description string once per match.
        for bill in get_bill_ids(description):
            event.add_bill(bill)

        if "lrl_schedule_link" in row:
            event.add_source(row["lrl_schedule_link"])
        else:
            if "link" in row["committee"]:
                link = row["committee"]["link"]
                if link.startswith("http"):
                    event.add_source(link)
                elif link.startswith("www"):
                    event.add_source(f"http://{link}")
                else:
                    event.add_source(f"https://www.senate.mn/{link}")
            elif "senate_chair_link" in row["committee"]:
                event.add_source(
                    f"https://www.senate.mn/{row['committee']['senate_chair_link']}"
                )

        if "agenda" in row:
            for agenda_row in row["agenda"]:
                if (
                    agenda_row["description"] is None
                    or agenda_row["description"].strip() == ""
                ):
                    # sometimes they have blank agendas but bills or files
                    agenda_row["description"] = "Agenda"
                agenda = event.add_agenda_item(agenda_row["description"])

                if "bill_type" in agenda_row:
                    agenda.add_bill(
                        "{} {}".format(
                            agenda_row["bill_type"].replace(".", ""),
                            agenda_row["bill_number"],
                        )
                    )

                if "files" in agenda_row:
                    for file_row in agenda_row["files"]:
                        doc_name = file_row["filename"]
                        doc_url = file_row["file_path"]
                        # if they don't provide a name just use the filename
                        if doc_name == "":
                            parsed_url = urlparse(doc_url)
                            doc_name = os.path.basename(parsed_url.path)
                        event.add_document(
                            doc_name,
                            f"https://www.senate.mn/{doc_url}",
                            media_type="text/html",
                            on_duplicate="ignore",
                        )

        if "video_link" in row:
            event.add_media_link("Video", row["video_link"], "text/html")
        if "audio_link" in row:
            event.add_media_link("Audio", row["audio_link"], "text/html")

        yield event
def scrape_lower(self):
    """Scrape MN House committee hearings from the House schedules page.

    Parses the "All Schedules" HTML listing and yields one Event per
    committee hearing, skipping floor sessions, unlinked events, and joint
    hearings (which come from the Senate API via ``scrape_upper``).

    Yields:
        Event: one event per committee hearing card.
    """
    url = "https://www.house.leg.state.mn.us/Schedules/All"
    page = self.lxmlize(url)

    for row in page.xpath('//div[contains(@class,"my-2 d-print-block")]'):
        # skip floor sessions and unlinked events
        if not row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b'
        ):
            continue

        # skip joint ones, we'll get those from the senate API
        if row.xpath('div[contains(@class,"card-header bg-joint")]'):
            continue

        # top-level committee
        com = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        )[0].strip()
        com_link = row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/@href'
        )[0]

        when = (
            row.xpath(
                'div[contains(@class,"card-header")]/span[contains(@class,"text-white")]/text()'
            )[0]
            .replace("\r\n", "")
            .strip()
        )
        when = dateutil.parser.parse(when)
        when = self._tz.localize(when)

        if row.xpath('.//b[.="Location:"]'):
            where = row.xpath(
                './/b[.="Location:"]/following-sibling::text()[1]'
            )[0].strip()
        else:
            where = "See committee page"

        if row.xpath('.//b[.="Agenda:"]'):
            desc = "\n".join(
                row.xpath('.//b[.="Agenda:"]/following-sibling::div/text()')
            ).strip()
        else:
            desc = "See committee page"

        event = Event(
            name=com,
            start_date=when,
            location_name=where,
            classification="committee-meeting",
            description=desc,
        )
        event.add_source(com_link)

        # FIX: add each extracted bill id; the original added the whole
        # description string once per match.
        for bill in get_bill_ids(desc):
            event.add_bill(bill)

        if row.xpath(
            ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]"
        ):
            agenda = event.add_agenda_item("Bills")
            for bill_id in row.xpath(
                ".//a[contains(@href,'/bills/bill.php') and contains(@class,'pull-left')]/text()"
            ):
                agenda.add_bill(bill_id.strip())

        for attachment in row.xpath(".//ul/li/div/a"):
            doc_url = attachment.xpath("@href")[0]
            doc_name = attachment.xpath("text()")[0].strip()
            # if they don't provide a name just use the filename
            if doc_name == "":
                parsed_url = urlparse(doc_url)
                # FIX: basename of the URL path, not the ParseResult tuple
                # (matches scrape_upper; passing the tuple raises TypeError).
                doc_name = os.path.basename(parsed_url.path)
            # sometimes broken links to .msg files (emails?) are attached,
            # they always 404.
            if doc_url.endswith(".msg"):
                continue
            media_type = get_media_type(doc_url)
            event.add_document(
                doc_name, doc_url, media_type=media_type, on_duplicate="ignore"
            )

        for committee in row.xpath(
            'div[contains(@class,"card-header")]/h3/a[contains(@class,"text-white")]/b/text()'
        ):
            event.add_participant(committee, type="committee", note="host")

        yield event
def scrape(self, chamber=None, session=None):
    """
    Scrape the events data from all dates from the sc meetings page,
    then create and yield the events objects from the data.
    :param chamber: "upper", "lower", "other", or None for the joint schedule
    :param session:
    :return: yielded Event objects
    """
    chambers = {
        "upper": {"name": "Senate", "title": "Senator"},
        "lower": {"name": "House", "title": "Representative"},
    }

    if chamber == "other":
        return

    if chamber is None:
        self.info("no chamber specified, using Joint Committee Meeting Schedule")
        events_url = "http://www.scstatehouse.gov/meetings.php"
    else:
        events_url = "http://www.scstatehouse.gov/meetings.php?chamber=%s" % (
            chambers[chamber]["name"].upper()[0]
        )

    page = self.get_page_from_url(events_url)

    # The page header only shows the year for the currently displayed week;
    # individual dates may omit it.
    meeting_year = page.xpath('//h2[@class="barheader"]/span')[0].text_content()
    meeting_year = re.search(
        r"Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})", meeting_year
    ).group(1)

    dates = page.xpath("//div[@id='contentsection']/ul")
    for date in dates:
        date_string = date.xpath("span")
        if len(date_string) == 1:
            date_string = date_string[0].text_content()
        else:
            continue

        # If a event is in the next calendar year, the date_string
        # will have a year in it
        if date_string.count(",") == 2:
            event_year = date_string[-4:]
            date_string = date_string[:-6]
        elif date_string.count(",") == 1:
            event_year = meeting_year
        else:
            # FIX: format the message before raising; the original called
            # .format() on the AssertionError instance (AttributeError).
            raise AssertionError(
                "This is not a valid date: '{}'".format(date_string)
            )

        for meeting in date.xpath("li"):
            time_string = meeting.xpath("span")[0].text_content()

            if (
                time_string == "CANCELED"
                or len(meeting.xpath('.//span[contains(text(), "CANCELED")]')) > 0
            ):
                continue

            time_string = normalize_time(time_string)
            date_time = datetime.datetime.strptime(
                event_year + " " + date_string + " " + time_string,
                "%Y %A, %B %d %I:%M %p",
            )
            date_time = self._tz.localize(date_time)

            meeting_info = meeting.xpath("br[1]/preceding-sibling::node()")[1]
            location, description = re.search(
                r"-- (.*?) -- (.*)", meeting_info
            ).groups()

            event = Event(
                name=description,  # Event Name
                start_date=date_time,  # When the event will take place
                location_name=location,  # Where the event will be
            )
            event.add_source(events_url)

            agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")
            if agenda_url:
                agenda_url = agenda_url[0].attrib["href"]
                event.add_source(agenda_url)
                event.add_document(
                    note="Agenda", url=agenda_url, media_type="application/pdf"
                )
                agenda_page = self.get_page_from_url(agenda_url)
                for bill in agenda_page.xpath(
                    ".//a[contains(@href,'billsearch.php')]"
                ):
                    bill_id = bill.text_content().replace(".", "").replace(" ", "")
                    event.add_bill(bill_id)

            yield event