def scrape_upper(self):
    """Scrape OK Senate committee meeting notices into Events.

    Yields one Event per dated notice found on the meeting-notices page.
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    text = page.text_content()
    # Everything of interest follows the "MEETING NOTICES" header.
    _, text = text.split("MEETING NOTICES")

    # Dates look like "Monday, January 1, 2018"; pair each date header
    # with the chunk of text that follows it.
    re_date = r"[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}"
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

    for match, data in chunks:
        when = datetime.datetime.strptime(match.group(), "%A, %B %d, %Y")

        # BUG FIX: filter() returns a lazy iterator in Python 3, so the
        # old `lines = filter(None, ...)` made `lines[0]` below raise
        # TypeError; materialize the non-empty lines into a list instead.
        lines = [x.strip() for x in data.splitlines() if x.strip()]

        # e.g. "TIME: 10:30 a.m. \x96 ..." (\x96 is the cp1252 en dash).
        time_ = re.search(r"^\s*TIME:\s+(.+?)\s+\x96", data, re.M).group(1)
        time_ = time_.replace("a.m.", "AM").replace("p.m.", "PM")
        time_ = time.strptime(time_, "%I:%M %p")
        when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)

        # The first non-empty line of the chunk is the meeting title.
        title = lines[0]

        where = re.search(r"^\s*PLACE:\s+(.+)", data, re.M).group(1)
        where = where.strip()

        event = Event(
            name=title,
            start_date=self._tz.localize(when),
            location_name=where,
        )
        event.add_source(url)
        yield event
def scrape_committee_events(self, code, name):
    """Yield Events from a single CT committee's JSON calendar feed."""
    events_url = (
        "http://www.cga.ct.gov/basin/fullcalendar/commevents.php?"
        "comm_code={}".format(code)
    )
    # NOTE(review): verify=False suggests the host's TLS cert is unreliable.
    raw = self.get(events_url, verify=False).text
    listings = json.loads(raw)

    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    for info in listings:
        title = info["title"]
        if title is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if title.startswith("CANCELLED:"):
            self.info(
                "Cancelled event found; it will be skipped: {}".format(title)
            )
            continue

        when = datetime.datetime.strptime(info["start"], timestamp_format)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        where = "{0} {1}".format(
            info["building"].strip(), info["location"].strip()
        )

        # end_time=self._tz.localize(end),
        event = Event(
            start_date=self._tz.localize(when),
            location_name=where,
            name=title,
            description=title,
        )
        event.add_source(events_url)
        yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event from one DE committee-meeting JSON record.

    Also fetches the meeting's agenda-items endpoint to attach agenda
    entries and their primary sponsors.
    """
    # Event Name is not provided for all meetings, so fall back to the
    # committee name.
    event_name = str(item["CommitteeName"])

    # MeetingDateTime looks like "04/25/2012 03:00:00 PM".
    # BUG FIX: the old format "%m/%d/%y %I:%M %p" used a two-digit year
    # and omitted seconds, so strptime raised ValueError on records like
    # the documented sample. Try the full format first, then a
    # seconds-less variant in case some records omit them.
    raw_dt = str(item["MeetingDateTime"])
    start_time = None
    for fmt in ("%m/%d/%Y %I:%M:%S %p", "%m/%d/%Y %I:%M %p"):
        try:
            start_time = dt.datetime.strptime(raw_dt, fmt)
            break
        except ValueError:
            continue
    if start_time is None:
        raise ValueError("unparseable MeetingDateTime: {!r}".format(raw_dt))

    location_name = str(item["AddressAliasNickname"])
    event = Event(
        location_name=location_name,
        start_date=self._tz.localize(start_time),
        name=event_name,
        description="Committee Meeting Status: {}".format(
            item["CommitteeMeetingStatusName"]),
    )
    event.add_source(url)
    event.add_committee(name=str(item["CommitteeName"]), id=item["CommitteeId"])

    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item["CommitteeMeetingId"]))
    event.add_source(page_url)

    page_data = self.post(page_url).json()["Data"]
    # Renamed loop variable: the original shadowed the `item` parameter.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row["ItemDescription"]))
        event.add_person(
            name=str(agenda_row["PrimarySponsorShortName"]),
            id=str(agenda_row["PrimarySponsorPersonId"]),
            note="Sponsor",
        )

    yield event
def scrape_lower_event(self, url):
    """Scrape a single Assembly hearing page into an Event."""
    doc = lxml.html.fromstring(self.get(url).content)
    doc.make_links_absolute(url)

    table = doc.xpath('//section[@id="leg-agenda-mod"]/div/table')[0]
    meta = table.xpath("tr[1]/td[1]/text()")

    # The committee name shown in #committee_div is injected by JS, so
    # take it from the table instead and strip the trailing chair name.
    com_name = "Assembly " + re.sub(r"\(.*\)", "", meta[0])

    start = self._tz.localize(dateutil.parser.parse(meta[1]))

    event = Event(
        name=com_name,
        start_date=start,
        location_name=meta[2],
    )
    event.add_participant(com_name, type="committee", note="host")
    event.add_source(url)

    bill_links = table.xpath('.//a[contains(@href, "/leg/")]')
    if bill_links:
        agenda = event.add_agenda_item("Bills under Consideration")
        for link in bill_links:
            agenda.add_bill(link.text_content().strip())

    yield event
def scrape(self, chamber=None):
    """Scrape OK House committee meeting notices.

    The page is ASP.NET-driven: GET it once to pick up the viewstate
    fields, then POST back to flip the notices grid into monthly view.
    """
    url = "https://www.okhouse.gov/Committees/MeetingNotices.aspx"
    params = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctl00$ContentPlaceHolder1$RadAjaxPanel1Panel|ctl00$ContentPlaceHolder1$cbMonthly",
        "ctl00_FormDecorator1_ClientState": "",
        "ctl00_RadToolTipManager1_ClientState": "",
        "ctl00_mainNav_ClientState": "",
        "ctl00$ContentPlaceHolder1$cbToday": "on",
        "ctl00$ContentPlaceHolder1$cbMonthly": "on",
        "ctl00_ContentPlaceHolder1_dgrdNotices_ClientState": "",
        "__ASYNCPOST": "true",
        "RadAJAXControlID": "ctl00_ContentPlaceHolder1_RadAjaxPanel1",
    }
    page = self.get(url).content
    page = lxml.html.fromstring(page)
    html = self.asp_post(url, page, params)
    page = lxml.html.fromstring(html)

    for row in page.xpath('//tr[contains(@id,"_dgrdNotices_")]'):
        status = "tentative"
        agenda_link = row.xpath('.//a[@id="hlMeetAgenda"]')[0]
        title = agenda_link.xpath("text()")[0].strip()
        agenda_url = agenda_link.xpath("@href")[0]
        location = row.xpath("td[3]")[0].text_content().strip()

        # Swap in a space for each <br/> so date and time don't run together.
        when = row.xpath("td[4]")[0]
        for br in when.xpath(".//br"):
            br.tail = " " + br.tail if br.tail else " "
        when = when.text_content().strip()

        if "cancelled" in when.lower():
            status = "cancelled"
            # BUG FIX: re.IGNORECASE was previously passed positionally,
            # where re.sub expects `count`, so the flag was silently
            # ignored (and replacements were capped). Pass it as flags=.
            when = re.sub("CANCELLED", "", when, flags=re.IGNORECASE)

        when = self._tz.localize(dateutil.parser.parse(when))

        event = Event(
            name=title,
            location_name=location,
            start_date=when,
            classification="committee-meeting",
            status=status,
        )
        event.add_source(url)
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        yield event
def scrape_chamber(self, chamber):
    """Parse the MS legislature's plain-text hearing schedule for one chamber.

    The feed is line-oriented: a weekday header starts a new date, a time
    stamp (or recess marker) starts a new event, and following lines are
    the event's description.
    """
    chamber_abbr = self.chamber_abbrs[chamber]
    event_url = f"http://billstatus.ls.state.ms.us/htms/{chamber_abbr}_sched.htm"
    text = self.get(event_url).text

    day_re = re.compile(
        r"^(MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY)",
        re.IGNORECASE,
    )
    clock_re = re.compile(r"^\d{2}:\d{2}")
    recess_re = re.compile(r"^(BC|AR|AA|TBA)\+")

    when, start_time, room, com, desc = None, None, None, None, None
    for line in text.splitlines():
        if day_re.match(line):
            # New date header.
            day = line.split(" ")[0].strip()
        if clock_re.match(line) or recess_re.match(line):
            # A new event begins; flush the one accumulated so far.
            if when and room and com:
                ev = Event(
                    name=com,
                    start_date=when,
                    location_name=room,
                    classification="committee-meeting",
                    description=desc,
                )
                ev.add_source(event_url)
                yield ev

            start_time, room, com = re.split(r"\s+", line, maxsplit=2)
            # After-recess / after-adjourn markers carry no usable clock
            # time, so leave the time component empty.
            if recess_re.match(line):
                start_time = ""
            com = com.strip()
            when = self._tz.localize(dateutil.parser.parse(f"{day} {start_time}"))
            # Reset so subsequent lines populate this event's description.
            desc = ""
        elif when and room and com:
            if line.strip():
                desc += "\n" + line.strip()

    # Emit the trailing event, which no later header will flush.
    if when and room and com:
        ev = Event(
            name=com,
            start_date=when,
            location_name=room,
            classification="committee-meeting",
            description=desc,
        )
        ev.add_source(event_url)
        yield ev
def event_obj():
    """Return a minimal Event fixture starting now (UTC) at Joe's Place."""
    start = datetime.datetime.utcnow().isoformat().split(".")[0] + "Z"
    e = Event(
        name="get-together",
        start_date=start,
        location_name="Joe's Place",
    )
    e.add_source(url="http://example.com/foobar")
    return e
def upper_parse_agenda_item(self, item):
    """Expand one Senate agenda listing into Events.

    Each API "meeting" response actually lists several meetings of the
    same committee, distinguished by addendumId; only the addendum named
    in `item` is emitted.
    """
    response = self.api_client.get(
        "meeting",
        year=item["agendaId"]["year"],
        agenda_id=item["agendaId"]["number"],
        committee=item["committeeId"]["name"],
    )
    data = response["result"]

    chamber = data["committee"]["committeeId"]["chamber"].title()
    com_code = data["committee"]["committeeId"]["name"]
    com_name = f"{chamber} {com_code}"

    for addendum in data["committee"]["addenda"]["items"]:
        if addendum["addendumId"] != item["addendum"]:
            continue

        meeting = addendum["meeting"]
        when = self._tz.localize(dateutil.parser.parse(meeting["meetingDateTime"]))

        location = meeting["location"]
        if location == "":
            location = "See Committee Site"

        description = meeting["notes"]
        if "canceled" in description.lower():
            continue

        event = Event(
            name=com_name,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_participant(com_name, type="committee", note="host")

        # Slugify the committee code for its nysenate.gov page URL.
        slug = (
            com_code.lower().replace("'", "").replace(" ", "-").replace(",", "")
        )
        event.add_source(f"https://www.nysenate.gov/committees/{slug}")

        bills = addendum["bills"]["items"]
        if bills:
            agenda = event.add_agenda_item("Bills under consideration")
            for bill in bills:
                agenda.add_bill(bill["billId"]["printNo"])

        yield event
def scrape(self, session=None):
    """Scrape ND interim committee meeting summaries, then their calendars."""
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # Derive the session's starting year from jurisdiction metadata.
    for meta in self.jurisdiction.legislative_sessions:
        if meta["identifier"] == session:
            start_year = meta["start_date"][:4]
            self.year = start_year
            break

    url = f"https://www.legis.nd.gov/assembly/{session}-{start_year}/committees/interim/committee-meeting-summary"
    root = lxml.html.fromstring(self.get(url).content)
    root.make_links_absolute(url)

    for table in root.xpath('//table[contains(@class,"views-table")]'):
        com = table.xpath("caption/a")[0].text_content().strip()
        for row in table.xpath("tbody/tr"):
            date_link = row.xpath("td[1]/strong/a")[0]
            event_url = date_link.xpath("@href")[0]

            when = dateutil.parser.parse(
                date_link.xpath("span")[0].text_content().strip()
            )
            when = self._tz.localize(when)
            self.event_months.add(when.strftime("%Y-%m"))

            event = Event(name=com, start_date=when, location_name="See Agenda")
            event.add_source(event_url)

            for link in row.xpath("td[2]//a"):
                link_text = link.text_content().strip()
                # Skip links to the live broadcast.
                if "video.legis" in link_text:
                    continue
                event.add_document(
                    link_text,
                    link.xpath("@href")[0],
                    media_type="application/pdf",
                )

            self.events[event_url] = event

    for year_month in self.event_months:
        self.scrape_calendar(year_month)

    yield from self.events.values()
def scrape(self, start=None, end=None):
    """Scrape the calendar JSON feed between start and end (default: 3 months)."""
    if start is None:
        start = dt.datetime.today()
    else:
        start = dateutil.parser.parse(start)
    if end is None:
        end = start + relativedelta(months=+3)
    else:
        end = dateutil.parser.parse(end)

    start_str = start.strftime("%Y-%m-%d")
    end_str = end.strftime("%Y-%m-%d")

    listing_url = f"{self.base_url}calendar-data?start={start_str}&end={end_str}"
    data = json.loads(self.scraper.get(listing_url).content)

    for item in data:
        name = item["title"].strip()
        lowered = name.lower()
        if "canceled" in lowered:
            continue
        # Floor sessions are not committee events; skip them.
        if "house session" in lowered or "senate session" in lowered:
            continue

        detail_url = f"{self.base_url}{item['url']}"
        when = self._tz.localize(dateutil.parser.parse(item["start"]))

        detail = lxml.html.fromstring(self.scraper.get(detail_url).content)
        location = detail.xpath(
            '//div[contains(@class,"eventModule") and h3[contains(text(), "Location")]]/text()'
        )[0].strip()
        agenda_url = detail.xpath(
            '//a[contains(@class,"linkButton") and contains(text(),"Agenda")]/@href'
        )[0]

        event = Event(
            name=name,
            start_date=when,
            location_name=location,
        )
        event.add_participant(name, type="committee", note="host")
        event.add_document("Agenda", agenda_url, media_type="application/pdf")
        event.add_source(detail_url)
        yield event
def scrape(self):
    """Scrape the agenda-body table into Events with agenda items and hosts."""
    page = self.lxmlize(calurl)

    # Skip the header row of the agenda table.
    for row in page.xpath("//table[@class='agenda-body']//tr")[1:]:
        detail_links = row.xpath(".//a[contains(@title,'Committee Details')]")
        if len(detail_links) != 1:
            continue
        who = self.scrape_participants(detail_links[0].attrib["href"])

        cells = row.xpath("./*")
        date = cells[0].text_content().strip()
        cttie = cells[1].text_content().strip()
        chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]

        info = cells[2]
        link = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = link.attrib["href"]
        name = link.text
        time, where = info.xpath("./i/text()")

        what = cells[3].text_content().replace("Items: ", "")
        if "(None)" in what:
            continue
        entries = [x.strip() for x in what.split(";")]

        # The page omits the year, so assume the current one.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        if cttie:
            cttie = cttie.replace("Committee on", "").strip()
            cttie = f"{chamber} {cttie}"
            name = cttie

        event = Event(
            name=name, location_name=where, start_date=self._tz.localize(when)
        )
        event.add_source(calurl)
        event.add_committee(cttie, note="host")
        event.add_document("notice", notice, media_type="application/pdf")

        for entry in entries:
            item = event.add_agenda_item(entry)
            if entry.startswith(("AB", "SB")):
                item.add_bill(entry)

        for participant in who:
            event.add_person(participant["name"])

        yield event
def scrape_chamber(self, chamber):
    """Scrape PA committee meeting schedules for one chamber.

    Yields an Event per meeting row, with agenda items linking the bills
    and committees referenced in the row.
    """
    url = utils.urls["events"][chamber]
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor in an ancestor container.
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name'
        )[0]
        for row in table.xpath("tr"):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()'
            )[0].strip()
            description = (
                row.xpath('td[@class="CMS-MeetingDetail-Agenda"]/div/div')[-1]
                .text_content()
                .strip()
            )
            location = (
                row.xpath('td[@class="CMS-MeetingDetail-Location"]')[0]
                .text_content()
                .strip()
            )
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
            )
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_date = datetime.datetime.strptime(
                    "{} {}".format(date_string, time_string),
                    "%m/%d/%Y %I:%M %p",
                )
            except ValueError:
                # Unparseable time ("upon adjournment" etc.): skip the table.
                break

            event = Event(
                name=description,
                start_date=self._tz.localize(start_date),
                location_name=location,
            )
            event.add_source(url)

            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # BUG FIX: parse_qs maps each key to a *list* of values,
                    # so formatting the lists directly produced ids like
                    # "['H']['B'] ['123']". Use the first value of each.
                    item.add_bill(
                        "{}{} {}".format(
                            qs["body"][0], qs["type"][0], qs["bn"][0]
                        )
                    )
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get("href"))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs values are lists here too; keep None when the
                    # Code parameter is absent.
                    com_code = qs.get("Code")
                    item.add_committee(
                        # BUG FIX: "[S|H]" also matched a literal "|";
                        # the intended class is just S or H.
                        re.sub(r" \([SH]\)$", "", committee.text),
                        id=com_code[0] if com_code else None,
                    )

            yield event
def scrape_upper(self):
    """Scrape MO Senate hearings.

    The page has no per-event container element, which makes xpath
    awkward; split the raw HTML on <hr /> and parse each fragment.
    """
    listing_url = "https://www.senate.mo.gov/hearingsschedule/hrings.htm"
    html = self.get(listing_url).text

    for fragment in html.split("<hr />")[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, "Date:")
        when_time = self.row_content(page, "Time:")
        room = self.row_content(page, "Room:")
        location = "{}, {}".format(
            room, "201 W Capitol Ave, Jefferson City, MO 65101"
        )

        # com = self.row_content(page, 'Committee:')
        # The committee cell also names the chair; keep only the committee.
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        com = com.split(", Senator")[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time))
        )

        event = Event(start_date=start_date, name=com, location_name=location)
        event.add_source(listing_url)
        event.add_participant(com, type="committee", note="host")

        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            links = bill_table.xpath(self.bill_link_xpath)
            if links:
                agenda_line = bill_table.xpath("string(tr[2])").strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                agenda_item.add_bill(links[0].strip())
            else:
                agenda_line = bill_table.xpath("string(tr[1])").strip()
                event.add_agenda_item(description=agenda_line)

        yield event
def scrape_cal_page(self, url):
    """Scrape one calendar listing page, recursing into the next page.

    Yields an Event per accordion article; follows the "Upcoming Events"
    link for pagination.
    """
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    for row in page.xpath("//article[contains(@class,'accordion')]"):
        when = dateutil.parser.parse(row.xpath(".//time/@datetime")[0])
        title = row.xpath(
            ".//h3[contains(@class,'heading-link')]/text()"
        )[0].strip()

        description = row.xpath(
            "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
        )[0].text_content()
        # Normalize special characters (en dash, single guillemet) and
        # strip the boilerplate link text.
        description = (description.replace("\n\u2013", " ").replace(
            "\n", " ").replace("\u203a", ""))
        description = description.replace("More about this event", "").strip()

        location = row.xpath(
            "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
        )[0].text_content()

        event = Event(
            name=title,
            description=description,
            start_date=when,
            location_name=location,
        )

        agenda_url = row.xpath(
            ".//a[contains(text(),'More about this event')]/@href"
        )
        if agenda_url != []:
            event.add_document(
                "Details and Agenda", agenda_url[0], media_type="text/html"
            )

        if "committee meeting" in title.lower():
            com_name = title.replace("Committee Meeting", "").strip()
            # BUG FIX: the participant type was misspelled "commitee",
            # so these hosts were not tagged as committees.
            event.add_participant(com_name, type="committee", note="host")

        event.add_source(url)
        yield event

    if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
        next_url = page.xpath(
            "//a[contains(text(), 'Upcoming Events')]/@href"
        )[0]
        yield from self.scrape_cal_page(next_url)
def parse_event(self, row, chamber):
    """Build an Event from one AK meeting XML row.

    Sample rows available at http://www.akleg.gov/apptester.html
    """
    committee_code = row.xpath("string(Sponsor)").strip()
    known = self.COMMITTEES[chamber]
    prefix = self.COMMITTEES_PRETTY[chamber]

    if committee_code in known:
        committee_name = "{} {}".format(prefix, known[committee_code]["name"])
    else:
        committee_name = "{} {}".format(prefix, "MISCELLANEOUS")

    name = "{} {}".format(prefix, row.xpath("string(Title)").strip())
    # If the title is missing, fall back to "<CHAMBER> <COMMITTEE NAME>".
    if name == "":
        name = committee_name

    location = row.xpath("string(Location)").strip()
    # Events with no location all seem to be committee hearings.
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    event = Event(start_date=start_date, name=name, location_name=location)
    event.add_source("http://w3.akleg.gov/index.php#tab4")

    if committee_code in known:
        event.add_participant(committee_name, type="committee", note="host")

    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc == "":
            continue
        agenda_item = event.add_agenda_item(description=agenda_desc)
        if item.xpath("BillRoot"):
            # AK bill ids carry runs of extra whitespace; collapse them.
            bill_id = re.sub(r"\s+", " ", item.xpath("string(BillRoot)"))
            agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber):
    """Group CA DB hearing rows by (location, date) and emit one Event each."""
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = (
            self.session.query(CALocation)
            .filter_by(location_code=hearing.location_code)[0]
            .description
        )
        date = self._tz.localize(hearing.hearing_date)

        # The location's first three characters encode the chamber.
        event_chamber = {"Asm": "lower", "Sen": "upper"}[location[0:3]]
        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for (location, date), hearings in grouped_hearings.items():
        # Turn raw bill ids of the form <digits><letters><digits> into
        # display names like "AB 123" (leading digits dropped).
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
            for bill in bill_ids
        ]

        # Every hearing in a group must belong to the same committee.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date),)
        assert len({hearing.committee_nr for hearing in hearings}) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = "Committee Meeting: " + committee_name
        event = Event(
            name=desc, start_date=date, location_name=committee_name
        )
        for bill_id in bills:
            type_ = "bill" if "B" in bill_id else "resolution"
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")
        yield event
def scrape_chamber(self, chamber, session, start, end):
    """Scrape WA committee meetings from the legislature's XML service."""
    page = self.get_xml(start, end)

    for row in xpath(page, "//wa:CommitteeMeeting"):
        if xpath(row, "string(wa:Cancelled)") == "true":
            continue

        agency = xpath(row, "string(wa:Agency)")
        if self.chambers[agency] != chamber:
            continue

        event_date = datetime.datetime.strptime(
            xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"
        )
        event_date = self._tz.localize(event_date)

        event_com = xpath(row, "string(wa:Committees/" "wa:Committee/wa:LongName)")
        agenda_id = xpath(row, "string(wa:AgendaId)")
        notes = xpath(row, "string(wa:Notes)")

        # The XML also carries a wa:Address, but it seems useless; build
        # the location from its component fields instead.
        location = "{}, {}, {} {}".format(
            xpath(row, "string(wa:Room)"),
            xpath(row, "string(wa:Building)"),
            xpath(row, "string(wa:City)"),
            xpath(row, "string(wa:State)"),
        )

        event = Event(
            name=event_com,
            start_date=event_date,
            location_name=location,
            description=notes,
        )
        event.add_source(
            "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
                agenda_id
            )
        )
        event.add_participant(event_com, type="committee", note="host")
        event.extras["agendaId"] = agenda_id

        self.scrape_agenda_items(agenda_id, event)
        yield event
def parse_div(self, row, chamber, com):
    """Turn one MD hearing-schedule row into an Event with agenda and docs."""
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)

    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )
    event.add_source(
        "http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx"
    )

    # Plain agenda items with no attachments.
    for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=item.xpath("string(.)").strip())

    # Agenda items that link to a PDF document.
    for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        description = item.xpath("string(.)").strip()
        event.add_agenda_item(description=description)
        event.add_document(
            description,
            item.xpath("@href")[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # Agenda items tied to a specific bill.
    for item in row.xpath(
        './/div[contains(@class,"ItemContainer")]'
        '[./div[@class="col-xs-1 Item"]]'
    ):
        agenda = event.add_agenda_item(
            description=item.xpath("string(.)").strip()
        )
        bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)

    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            "Video of Hearing", video[0].xpath("@href")[0], "text/html"
        )

    # Subcommittee meetings name the subcommittee before a dash in the title.
    host = com
    if "subcommittee" in title.lower():
        host = title.split("-")[0].strip()
    event.add_participant(host, type="committee", note="host")

    yield event
def scrape(self, session=None, chamber=None):
    """Scrape scheduled meetings from AR's pipe-delimited FTP feed."""
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    reader = csv.reader(StringIO(self.get(url).text), delimiter="|")

    LINE_LENGTH = 11
    session_year = int(session[:4])

    for row in reader:
        # Embedded newline characters split a record across csv rows;
        # stitch the pieces back together until the record is complete.
        while len(row) < LINE_LENGTH:
            row += next(reader)

        desc = row[7].strip()
        match = re.match(r"^(.*)- (HOUSE|SENATE)$", desc)
        if not match:
            continue

        comm = re.sub(r"\s+", " ", match.group(1).strip())
        location = row[5].strip() or "Unknown"
        when = datetime.datetime.strptime(row[2], "%Y-%m-%d %H:%M:%S")
        when = self._tz.localize(when)

        # Only assign events to a session if they are in the same year:
        # session metadata overlaps and some end dates are missing, so
        # this is the best option available.
        if session_year != when.year:
            continue

        description = "%s MEETING" % comm
        event = Event(
            name=description,
            start_date=when,
            location_name=location,
            description=description,
        )
        event.add_source(url)
        event.add_participant(comm, type="committee", note="host")

        # time = row[3].strip()
        # if time in _TIMECODES:
        #     event['notes'] = TIMECODES[time]

        yield event
def scrape_upper(self, session_id):
    """Scrape VA Senate meeting notices from the LIS listing page."""
    list_url = f"https://lis.virginia.gov/cgi-bin/legp604.exe?{session_id}+oth+MTG&{session_id}+oth+MTG"
    page = lxml.html.fromstring(self.get(list_url).content)
    page.make_links_absolute(list_url)

    date = None
    # Note the [td] predicate at the end: some tr-s are empty, so skip them.
    for row in page.xpath("//div[@id='mainC']/center/table/tr[td]"):
        # The date cell is only populated on the first row of each day.
        if row.xpath("td[1]/text()")[0].strip() != "":
            date = row.xpath("td[1]/text()")[0].strip()

        description = row.xpath("td[3]/text()")[0].strip()
        # Data on the House page is better, so only keep Senate rows here.
        if "senate" not in description.lower():
            continue

        time = row.xpath("td[2]/text()")[0].strip()
        status = "tentative"
        # BUG FIX: the old check looked for the uppercase literal
        # "CANCELLED" inside a lowercased string, which can never match,
        # so cancelled meetings were always emitted as tentative.
        if "cancelled" in time.lower():
            status = "cancelled"

        try:
            when = dateutil.parser.parse(f"{date} {time}")
        except dateutil.parser._parser.ParserError:
            # Unparseable time component: fall back to the date alone.
            when = dateutil.parser.parse(date)
        when = self._tz.localize(when)

        # TODO: Post covid figure out how they post locations
        if "virtual" in description.lower():
            location = "Virtual"
        else:
            location = "Unknown"

        event = Event(
            name=description,
            start_date=when,
            classification="committee-meeting",
            location_name=location,
            status=status,
        )
        event.add_source(list_url)
        yield event
def scrape(self, session=None):
    """Scrape NJ committee agendas out of the legislative Access DB dump."""
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # NJ sessions are two-year periods; derive the starting year.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)

    # Keep record of all events
    records = self.access_to_csv("Agendas")
    for record in records:
        if record["Status"] != "Scheduled":
            continue

        # Bill references like "A-1234" / "S1234" embedded in the comments
        # become related bills on the agenda item.
        comments = record["Comments"]
        related_bills = []
        for bill in re.findall(r"(A|S)(-)?(\d{4})", comments):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": comments,
            })

        date_time = "%s %s" % (record["Date"], record["Time"])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record["CommHouse"]]
        except KeyError:
            self.warning("unknown committee code %s, skipping",
                         record["CommHouse"])
            # BUG FIX: without this continue, hr_name stayed unbound and
            # the next line crashed with NameError instead of skipping
            # the record as the warning promises.
            continue

        description = "Meeting of the {}".format(hr_name)
        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record["Location"] or "Statehouse",
        )

        # All related bills hang off a single agenda item.
        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill["bill_id"])

        # Add committee to event
        event.add_committee(hr_name, id=record["CommHouse"], note="host")
        event.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield event