def parse_div(self, row, chamber, com):
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)

    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )

    event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

    for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)

    for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)

        event.add_document(
            description,
            item.xpath('@href')[0],
            media_type="application/pdf",
            on_duplicate="ignore"
        )

    for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                          '[./div[@class="col-xs-1 Item"]]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)

        bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)

    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            'Video of Hearing',
            video[0].xpath('@href')[0],
            'text/html'
        )

    if 'subcommittee' in title.lower():
        subcom = title.split('-')[0].strip()
        event.add_participant(
            subcom,
            type='committee',
            note='host',
        )
    else:
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
    yield event
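# parse_div() above leans on self.parse_gcal(), which is not shown here. A
# minimal sketch, assuming the link is a standard Google Calendar
# "render?action=TEMPLATE" URL whose query string carries title, location, and
# a start/end "dates" pair; the parameter names are the documented Google
# template ones, and the US/Eastern zone is an assumption for Maryland. The
# real helper's shape may differ.
import datetime
import urllib.parse

import pytz


def parse_gcal(self, url):
    params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    title = params.get('text', [''])[0]
    location = params.get('location', [''])[0]
    # 'dates' looks like 20180305T130000/20180305T150000
    start_raw, end_raw = params['dates'][0].split('/')
    fmt = '%Y%m%dT%H%M%S'
    tz = pytz.timezone('US/Eastern')  # assumed zone for MD hearings
    start_date = tz.localize(datetime.datetime.strptime(start_raw, fmt))
    end_date = tz.localize(datetime.datetime.strptime(end_raw, fmt))
    return title, location, start_date, end_date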
def scrape_house_weekly_schedule(self):
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)

    meeting_rows = page.xpath('//table[@id = "table229"]/tr')

    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]

    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except IndexError:
            # Sometimes we have a dead link. This is only on dead entries.
            continue

        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()

        if "@" in meeting_string:
            continue  # Contains no time data.

        date, time, location = (
            [s.strip() for s in meeting_string.split(',') if s] + [None] * 3
        )[:3]

        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')

        self.logger.debug(location)

        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)

        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)

        event = Event(
            name=description,
            start_date=when,  # already localized above
            location_name=location
        )
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(
            note='Agenda',
            url=guid,
            text='agenda',
            media_type='application/pdf'
        )

        yield event
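# Several scrapers here call self.lxmlize(url) without defining it. A minimal
# sketch of what such a helper usually does in these scrapers (fetch, parse,
# absolutize links); the real base-class implementation may differ:
import lxml.html


def lxmlize(self, url):
    html = self.get(url).text          # self.get comes from the scraper base class
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)       # so later @href xpaths yield full URLs
    return doc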
def scrape_event_page(self, url, event_type):
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')

    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')

    start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()
    description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

    start_date = self._TZ.localize(
        dateutil.parser.parse(
            '{} {}'.format(start_day, start_time),
        )
    )

    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description
    )

    event.add_source(url)

    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')

    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()

        if agenda_title == '':
            agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()

        agenda = event.add_agenda_item(description=agenda_title)

        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)

    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )

    yield event
def scrape(self):
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)

    committee_regex = re.compile("(Committee .*?)will")

    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")

        description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        name = description_lines[0].text_content()
        desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
        description = re.sub(r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")

        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'

        e = Event(
            name=name,
            description=description,
            start_date=self._tz.localize(when),
            location_name=location,
            classification=event_type,
        )

        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            item = e.add_agenda_item(bill_desc)
            item.add_bill(bill)

        e.add_source(calendar_url)

        if committee:
            e.add_participant(committee, type='organization', note='host')

        yield e
def scrape_upper(self):
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'

    html = self.get(listing_url).text

    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)

        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        location = self.row_content(page, 'Room:')

        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

        # com = self.row_content(page, 'Committee:')
        com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        com = com.split(', Senator')[0].strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )

        event.add_source(listing_url)

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)

        yield event
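# scrape_upper() pulls labeled values with self.row_content(page, label),
# which is defined elsewhere. A plausible sketch, assuming the label and its
# value sit in adjacent <td> cells; the scraper's actual xpath may vary:
def row_content(self, page, label):
    cells = page.xpath(
        '//td[descendant-or-self::*[contains(text(), "{}")]]'
        '/following-sibling::td[1]'.format(label)
    )
    if cells:
        return cells[0].text_content().strip()
    return ''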
def scrape(self):
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        name = " ".join(
            x.strip() for x in
            doc.xpath('//div[@class="schedule"]//text()')
            if x.strip()
        )

        # Skip events with no name
        if not name:
            continue

        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    info.xpath('span[@class="col02"]/text()')[0],
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0].title()
        )

        event.add_participant(
            info.xpath('span[@class="col01"]/text()')[0].title(),
            type='committee',
            note='host',
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        event.add_source(event_url.replace(" ", "%20"))

        yield event
def scrape(self, chamber=None):
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(
            name=title,
            start_date=self._tz.localize(datetime.datetime.strptime(when, '%b %d, %Y')),
            location_name='State Capitol'
        )
        event.add_source(URL)

        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type='committee', note='host')

        documents = doc.xpath('.//td')
        for document in documents:
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath('text()')[0],
                url=url,
                media_type='application/pdf'
            )
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item('Bill up for discussion')
                    item.add_bill(bill_name)
        yield event
def scrape_page(self, url, session, chamber):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=datetime, location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def parse_event(self, row, chamber):
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath('string(Sponsor)').strip()
    committee_name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        self.COMMITTEES[chamber][committee_code]['name']
    )

    name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        row.xpath('string(Title)').strip()
    )

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    if name == '':
        name = committee_name

    location = row.xpath('string(Location)').strip()

    # events with no location all seem to be committee hearings
    if location == '':
        location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

    start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
    # todo: do i need to self._TZ.localize() ?

    event = Event(
        start_date=start_date,
        name=name,
        location_name=location
    )

    event.add_source('http://w3.akleg.gov/index.php#tab4')

    event.add_participant(
        committee_name,
        type='committee',
        note='host',
    )

    for item in row.xpath('Agenda/Item'):
        agenda_desc = item.xpath('string(Text)').strip()
        if agenda_desc != '':
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath('BillRoot'):
                bill_id = item.xpath('string(BillRoot)')
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r'\s+', ' ', bill_id)
                agenda_item.add_bill(bill_id)

    yield event
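# Both AK parse_event() variants assume two lookup tables on the scraper
# class. A hypothetical illustration of their shape; the codes and committee
# names below are examples only, not the scraper's real data:
COMMITTEES_PRETTY = {
    'upper': 'Senate',
    'lower': 'House',
}

COMMITTEES = {
    'upper': {
        'FIN': {'name': 'Finance Committee'},     # example code/name pair
    },
    'lower': {
        'EDC': {'name': 'Education Committee'},   # example code/name pair
    },
}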
def scrape_lower_item(self, page):
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, 'Committee:')
    when_date = self.table_row_content(page, 'Date:')
    when_time = self.table_row_content(page, 'Time:')
    location = self.table_row_content(page, 'Location:')

    if 'house hearing room' in location.lower():
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(' :', ':')

    # some times have extra info after the AM/PM
    if 'upon' in when_time:
        when_time = when_time.split('AM', 1)[0]
        when_time = when_time.split('PM', 1)[0]

    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(when_date, when_time))
    )

    event = Event(
        start_date=start_date,
        name=com,
        location_name=location
    )

    event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

    event.add_participant(
        com,
        type='committee',
        note='host',
    )

    # different from general MO link xpath due to the <b>
    house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
                       'or contains(@href, "bill.aspx")]/b/text()'

    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split('--')[0].strip()
        bill_no = bill_no.replace('HCS', '').strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
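# scrape_lower_item() relies on self.table_row_content(page, label), a sibling
# of the row_content sketch above but for the House hearing tables. A hedged
# sketch, assuming the label and its value occupy adjacent cells of one row:
def table_row_content(self, page, label):
    rows = page.xpath('.//tr[td[contains(string(.), "{}")]]'.format(label))
    if not rows:
        return ''
    cells = rows[0].xpath('./td')
    # the value is assumed to live in the cell after the label cell
    return cells[1].text_content().strip() if len(cells) > 1 else ''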
def parse_event(self, row, chamber):
    # sample event available at http://www.akleg.gov/apptester.html
    committee_code = row.xpath("string(Sponsor)").strip()

    if committee_code in self.COMMITTEES[chamber]:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]["name"],
        )
    else:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            'MISCELLANEOUS',
        )

    name = "{} {}".format(
        self.COMMITTEES_PRETTY[chamber],
        row.xpath("string(Title)").strip()
    )

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    if name == "":
        name = committee_name

    location = row.xpath("string(Location)").strip()

    # events with no location all seem to be committee hearings
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?

    event = Event(start_date=start_date, name=name, location_name=location)

    event.add_source("http://w3.akleg.gov/index.php#tab4")

    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(committee_name, type="committee", note="host")

    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc != "":
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                bill_id = item.xpath("string(BillRoot)")
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r"\s+", " ", bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_page(self, url, session, chamber):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(
        dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=datetime, location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def scrape_events(self, chamber, event_id):
    url = '%s%s' % (self.upper_url, event_id)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            when = datetime.datetime.strptime(when, "%m/%d/%Y %H:%M %p")
            when = self._tz.localize(when)

            if where is None or where == "":
                where = 'State House'

            event = Event(name=description, start_date=when, location_name=where)

            if td[20].text is None:
                participants = meeting_lead
            else:
                participants = td[20].text.split(';')

            if participants:
                for participant in participants:
                    name = participant.strip().replace('HON.', '', 1)
                    if name != "":
                        event.add_participant(name, type='committee', note='host')

            event.add_source(url)
            yield event
    else:
        # hack so we dont fail on the first id numbers where there are some gaps
        # between the numbers that work and not.
        if event_id > 1700:
            raise Exception(
                "Parsing is done we are on future ids that are not used yet."
            )
def scrape(self, session=None, chamber=None):
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.get(url)
    page = csv.reader(StringIO(page.text), delimiter='|')

    for row in page:
        # Deal with embedded newline characters, which cause fake new rows
        LINE_LENGTH = 11
        while len(row) < LINE_LENGTH:
            row += next(page)

        desc = row[7].strip()

        match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
        if match:
            comm = match.group(1).strip()
            comm = re.sub(r'\s+', ' ', comm)
            location = row[5].strip() or 'Unknown'
            when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
            when = self._tz.localize(when)
            # Only assign events to a session if they are in the same year
            # Given that session metadata have some overlap and
            # missing end dates, this is the best option available
            session_year = int(session[:4])
            if session_year != when.year:
                continue

            description = "%s MEETING" % comm
            event = Event(
                name=description,
                start_time=when,
                location_name=location,
                description=description,
                timezone=self._tz.zone
            )
            event.add_source(url)
            event.add_participant(comm, type='committee', note='host')
            # time = row[3].strip()
            # if time in TIMECODES:
            #     event['notes'] = TIMECODES[time]

            yield event
def scrape_chamber(self, chamber, session, start, end):
    page = self.get_xml(start, end)

    for row in xpath(page, '//wa:CommitteeMeeting'):
        event_cancelled = xpath(row, 'string(wa:Cancelled)')
        if event_cancelled == 'true':
            continue

        event_chamber = xpath(row, 'string(wa:Agency)')
        if self.chambers[event_chamber] != chamber:
            continue

        event_date = datetime.datetime.strptime(
            xpath(row, 'string(wa:Date)'), "%Y-%m-%dT%H:%M:%S")
        event_date = self._tz.localize(event_date)
        event_com = xpath(row, 'string(wa:Committees/'
                               'wa:Committee/wa:LongName)')
        agenda_id = xpath(row, 'string(wa:AgendaId)')
        notes = xpath(row, 'string(wa:Notes)')
        room = xpath(row, 'string(wa:Room)')
        building = xpath(row, 'string(wa:Building)')
        # XML has a wa:Address but it seems useless
        city = xpath(row, 'string(wa:City)')
        state = xpath(row, 'string(wa:State)')

        location = '{}, {}, {} {}'.format(
            room,
            building,
            city,
            state
        )

        event = Event(name=event_com, start_date=event_date,
                      location_name=location,
                      description=notes)

        source_url = 'https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}'.format(
            agenda_id)
        event.add_source(source_url)

        event.add_participant(event_com, type='committee', note='host')

        event.extras['agendaId'] = agenda_id

        self.scrape_agenda_items(agenda_id, event)

        yield event
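# The WA scraper queries a namespaced XML feed through a module-level xpath()
# helper. A minimal sketch, assuming the usual binding of the "wa" prefix to
# the WSLWebServices namespace; the exact URI is an assumption here:
def xpath(elem, path):
    return elem.xpath(
        path,
        namespaces={'wa': 'http://WSLWebServices.leg.wa.gov/'}
    )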
def scrape_event_page(self, session, chamber, url, datetime):
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()

    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )

    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    event.add_agenda_item(plaintext)

    yield event
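# re.findall with multiple capture groups returns one tuple per match, which
# is why the loop above unpacks (chamber, type, number). For example:
# >>> re.findall(r"(S|J|H)(B|M|R) (\d+)", "... HB 42 and SR 7 ...")
# [('H', 'B', '42'), ('S', 'R', '7')]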
def scrape_event_page(self, session, chamber, url, datetime):
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()

    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )

    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        chamber, type, number = bill
        bill_id = "%s%s %s" % (chamber, type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    event.add_agenda_item(plaintext)

    yield event
def scrape_chamber(self, chamber, session, start, end):
    page = self.get_xml(start, end)

    for row in xpath(page, "//wa:CommitteeMeeting"):
        event_cancelled = xpath(row, "string(wa:Cancelled)")
        if event_cancelled == "true":
            continue

        event_chamber = xpath(row, "string(wa:Agency)")
        if self.chambers[event_chamber] != chamber:
            continue

        event_date = datetime.datetime.strptime(
            xpath(row, "string(wa:Date)"), "%Y-%m-%dT%H:%M:%S"
        )
        event_date = self._tz.localize(event_date)
        event_com = xpath(row, "string(wa:Committees/"
                               "wa:Committee/wa:LongName)")
        agenda_id = xpath(row, "string(wa:AgendaId)")
        notes = xpath(row, "string(wa:Notes)")
        room = xpath(row, "string(wa:Room)")
        building = xpath(row, "string(wa:Building)")
        # XML has a wa:Address but it seems useless
        city = xpath(row, "string(wa:City)")
        state = xpath(row, "string(wa:State)")

        location = "{}, {}, {} {}".format(room, building, city, state)

        event = Event(
            name=event_com,
            start_date=event_date,
            location_name=location,
            description=notes,
        )

        source_url = "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}".format(
            agenda_id
        )
        event.add_source(source_url)

        event.add_participant(event_com, type="committee", note="host")

        event.extras["agendaId"] = agenda_id

        self.scrape_agenda_items(agenda_id, event)

        yield event
def scrape_events(self, chamber, event_id):
    url = '%s%s' % (self.upper_url, event_id)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            when = datetime.datetime.strptime(when, "%m/%d/%Y %H:%M %p")
            when = self._tz.localize(when)

            if where is None or where == "":
                where = 'State House'

            event = Event(name=description, start_date=when, location_name=where)

            if td[20].text is None:
                participants = meeting_lead
            else:
                participants = td[20].text.split(';')

            if participants:
                for participant in participants:
                    name = participant.strip().replace('HON.', '', 1)
                    if name != "":
                        event.add_participant(name, type='committee', note='host')

            event.add_source(url)
            yield event
    else:
        # hack so we dont fail on the first id numbers where there are some gaps
        # between the numbers that work and not.
        if event_id > 1700:
            raise Exception("Parsing is done we are on future ids that are not used yet.")
def scrape_lower_item(self, page):
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, 'Committee:')
    when_date = self.table_row_content(page, 'Date:')
    when_time = self.table_row_content(page, 'Time:')
    location = self.table_row_content(page, 'Location:')

    if 'house hearing room' in location.lower():
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101')

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(' :', ':')

    # some times have extra info after the AM/PM
    if 'upon' in when_time:
        when_time = when_time.split('AM', 1)[0]
        when_time = when_time.split('PM', 1)[0]

    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(when_date, when_time)))

    event = Event(start_date=start_date, name=com, location_name=location)

    event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

    event.add_participant(
        com,
        type='committee',
        note='host',
    )

    # different from general MO link xpath due to the <b>
    house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
                       'or contains(@href, "bill.aspx")]/b/text()'

    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split('--')[0].strip()
        bill_no = bill_no.replace('HCS', '').strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_chamber(self, chamber, session):
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()
    start = now.strftime(print_format)
    end = (now + timedelta(days=30)).strftime(print_format)

    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(), "%m/%d/%Y %I:%M:%S %p")

            location = (where or '').strip() or "unknown"

            event = Event(
                name=meeting_title,
                start_time=self._tz.localize(when),
                timezone=self._tz.zone,
                location_name=location,
                description=meeting_title
            )
            event.add_participant(who.strip(), type='committee', note='host')
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li"
                                  "[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                item = event.add_agenda_item(descr.strip())
                item.add_bill(bill_id.strip())

            yield event
def scrape(self):
    for event in self.events():
        e = Event(
            name=event["EventBodyName"],
            start_time=event["start"],
            timezone=self.TIMEZONE,
            description='',
            location_name=event["EventLocation"],
            status=event["status"]
        )

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        e.add_participant(name=event["EventBodyName"], type="organization")

        meeting_detail_web = self.WEB_URL + \
            '/MeetingDetail.aspx?ID={EventId}&GUID={EventGuid}'.format(**event)
        if requests.head(meeting_detail_web).status_code == 200:
            e.add_source(meeting_detail_web, note='web')
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event), note='api')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf"
            )

        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf"
            )

        yield e
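# scrape() above iterates self.events() and self.agenda(event), thin wrappers
# over the Legistar web API that are defined elsewhere. A hedged sketch of
# what events() might do, assuming BASE_URL points at a Legistar API host;
# the field mappings are assumptions, and the real wrapper likely adds
# paging, date filters, and proper status derivation:
import requests


def events(self):
    # one page of events from the Legistar web API (paging omitted)
    response = requests.get(self.BASE_URL + '/events')
    for event in response.json():
        event['start'] = event['EventDate']   # assumed field mapping
        event['status'] = 'confirmed'         # real wrapper derives this
        yield event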
def scrape_lower_item(self, page):
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, "Committee:")
    when_date = self.table_row_content(page, "Date:")
    when_time = self.table_row_content(page, "Time:")
    location = self.table_row_content(page, "Location:")

    if "house hearing room" in location.lower():
        location = "{}, {}".format(
            location,
            "201 W Capitol Ave, Jefferson City, MO 65101")

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(" :", ":")

    # some times have extra info after the AM/PM
    if "upon" in when_time:
        when_time = when_time.split("AM", 1)[0]
        when_time = when_time.split("PM", 1)[0]

    start_date = self._TZ.localize(
        dateutil.parser.parse("{} {}".format(when_date, when_time)))

    event = Event(start_date=start_date, name=com, location_name=location)

    event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")

    event.add_participant(com, type="committee", note="host")

    # different from general MO link xpath due to the <b>
    house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                        'or contains(@href, "bill.aspx")]/b/text()')

    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split("--")[0].strip()
        bill_no = bill_no.replace("HCS", "").strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_lower(self):
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            date = day[1]
        else:
            events = re.split(r'\n((?:\w+\s?)+)\n', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''',
                            event[1]).groups()
                    except AttributeError:
                        continue

                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)

                    location = location.strip()

                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)

                    event.add_participant(comm, type='committee', note='host')

                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
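# re.split() with a capturing group keeps the delimiters in the result, so
# days[1:] above alternates date-header, day-body, date-header, day-body, ...;
# the `day[0] % 2` test walks that alternation. A tiny demonstration:
# >>> re.split(r'(DAY)', 'xDAYaDAYb')[1:]
# ['DAY', 'a', 'DAY', 'b']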
def scrape(self, window=None):
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip() for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]

        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(
            event_name,
            start_date=event["start"],
            description='',
            location_name=location,
            status=status
        )

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = ('An agenda has duplicate agenda items on the Legistar grid: '
                             '{event_name} on {event_date} ({legistar_api_url}). '
                             'Contact Metro, and ask them to remove the duplicate '
                             'EventItemAgendaSequence.')

                raise ValueError(error_msg.format(
                    event_name=e.name,
                    event_date=e.start_date.strftime("%B %d, %Y"),
                    legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf"
            )

        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf"
            )

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file.
                # Skip these events, and retry on next scrape.
                continue

            e.add_media_link(
                note=audio['label'],
                url=redirect_url,
                media_type='text/html'
            )

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(
                note=web_event['Recap/Minutes']['label'],
                url=web_event['Recap/Minutes']['url'],
                media_type="application/pdf"
            )

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

        yield e
def scrape(self):
    for event, agenda in self.events():
        description = None

        location_string = event[u'Meeting Location']

        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        status_string = location_list[-1].split('Chicago, Illinois')

        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if any(phrase in status_text for phrase in
                   ('rescheduled to',
                    'postponed to',
                    'reconvened to',
                    'meeting recessed',
                    'recessed meeting',
                    'recessed until',
                    'deferred',
                    'time change',
                    'date change',
                    'recessed meeting - reconvene',
                    'cancelled',
                    'new date and time',
                    'rescheduled indefinitely',
                    'rescheduled for',)):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in ('meeting reconvened',
                                 'reconvened meeting',
                                 'recessed meeting',
                                 'reconvene meeting',
                                 'rescheduled hearing',
                                 'rescheduled meeting',):
                status = confirmedOrPassed(when)
            elif status_text in ('amended notice of meeting',
                                 'room change',
                                 'amended notice',
                                 'change of location',
                                 'revised - meeting date and time'):
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        self.addDocs(e, event, 'Agenda')
        self.addDocs(e, event, 'Notice')
        self.addDocs(e, event, 'Transcript')
        self.addDocs(e, event, 'Summary')

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == ('Committee on Energy, Environmental Protection '
                             'and Public Utilities (inactive)'):
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'

        e.add_participant(name=participant,
                          type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                if item["Record #"]:
                    identifier = item["Record #"]['label']
                    if identifier.startswith('S'):
                        identifier = identifier[1:]
                    agenda_item.add_bill(identifier)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
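# confirmedOrPassed() is used above but defined elsewhere. A minimal sketch,
# assuming it simply compares the meeting time against the current time in
# the same US/Central zone the scraper uses:
import datetime

import pytz


def confirmedOrPassed(when):
    # events already held are 'passed'; future ones remain 'confirmed'
    now = pytz.timezone('US/Central').localize(datetime.datetime.now())
    if when < now:
        return 'passed'
    return 'confirmed'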
def scrape_events_range(self, start_date, end_date):

    def daterange(start_date, end_date):
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour, minute=time.minute,
                                             second=0, microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

            def is_council(event):
                return True if event['meeting'] == self.jurisdiction.name else False

            if is_agenda_available(event):
                template = (AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event)
                            else AGENDA_FULL_STANDARD_TEMPLATE)
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'],
                                                              is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):

                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        else:
                            return raw.split(', ')

                    wards = normalize_wards(item['wards'])
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [id for id in full_identifiers
                                         if identifier_regex.match(id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)

                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(note='canonical', media_type='text/html',
                                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }

                        self.seen_agenda_items.append(full_identifier)

                        yield b

            yield e
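# The identifier_regex above expects full Toronto item identifiers such as
# '2017.EX26.9' and extracts the committee-scoped part to match the short
# identifier scraped from the agenda. For example:
# >>> import re
# >>> re.match(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$', '2017.EX26.9').group(1)
# 'EX26.9'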
def transform_parse(self, parsed_form, response):
    _source = {
        "url": response.url,
        "note": json.dumps({'office_name': parsed_form['office_name'],
                            'restriction_period': parsed_form['restriction_period'],
                            'name': parsed_form['name']},
                           sort_keys=True)
    }

    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['restriction_period']['restriction_period_begin_date'],
            '%Y-%m-%d').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['restriction_period']['restriction_period_begin_date'],
            '%Y-%m-%d').replace(tzinfo=UTC),
        classification="post_employment",
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.extras['office_name'] = parsed_form["office_name"]

    registrant_name = ' '.join(
        [s for s in [parsed_form['name']['name_first'],
                     parsed_form['name']['name_middle'],
                     parsed_form['name']['name_last']]
         if s is not None])

    _registrant = Person(
        name=registrant_name,
        source_identified=True
    )

    _office = Organization(
        name=parsed_form['office_name'],
        classification='office',
        parent_id=self.jurisdiction._senate,
        source_identified=True
    )

    _office.add_member(
        _registrant,
        role='employee',
        label='employee for {n}'.format(n=_office.name),
        end_date=parsed_form['restriction_period']['restriction_period_begin_date'],
    )

    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _post_employment_event = Event(
        name="{rn} - {rt}, {o} (via {a})".format(
            rn=_registrant.name,
            rt="post-employment period",
            o=_office.name,
            a="Senate Office of Public Record"),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['restriction_period']['restriction_period_begin_date'],
            '%Y-%m-%d').replace(tzinfo=UTC),
        end_time=datetime.strptime(
            parsed_form['restriction_period']['restriction_period_end_date'],
            '%Y-%m-%d').replace(tzinfo=UTC),
        classification='post_employment'
    )

    _post_employment_event.add_participant(type=_registrant._type,
                                           id=_registrant._id,
                                           name=_registrant.name,
                                           note="registrant")

    _post_employment_event.extras['office_name'] = parsed_form["office_name"]

    _disclosure.add_disclosed_event(
        name=_post_employment_event.name,
        type=_post_employment_event._type,
        classification=_post_employment_event.classification,
        id=_post_employment_event._id
    )

    _post_employment_event.add_source(**_source)
    yield _post_employment_event

    _office.add_source(**_source)
    yield _office

    _registrant.add_source(**_source)
    yield _registrant

    _disclosure.add_source(**_source)
    yield _disclosure
def scrape_event_page(self, url, chamber):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
    metainf = {}
    for tr in trs:
        tds = tr.xpath(".//td")
        if len(tds) <= 1:
            continue
        key = tds[0].text_content().strip()
        val = tds[1]
        metainf[key] = {
            "txt": val.text_content().strip(),
            "obj": val
        }

    if metainf == {}:
        return

    # Wednesday, 5/16/2012 3:00 pm
    datetime = "%s %s" % (
        metainf['Date']['txt'],
        metainf['Time']['txt'].replace(".", "")
    )
    if "Cancelled" in datetime:
        return

    translate = {
        "noon": " PM",
        "a.m.": " AM",
        "am": " AM",  # This is due to a nasty line they had.
        "a.m": "AM"  # another weird one
    }

    for t in translate:
        if t in datetime:
            datetime = datetime.replace(t, translate[t])

    datetime = re.sub(r"\s+", " ", datetime)

    for text_to_remove in [
            "or after committees are given leave",
            "or later immediately after committees are given leave",
            "or later after committees are given leave by the House to meet",
            "**Please note time**"]:
        datetime = datetime.split(text_to_remove)[0].strip()

    datetime = datetime.replace('p.m.', 'pm')
    datetime = datetime.replace('Noon', "pm")
    try:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
    except ValueError:
        datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")

    where = metainf['Location']['txt']
    title = metainf['Committee']['txt']  # XXX: Find a better title

    if chamber == 'other':
        chamber = 'joint'

    event = Event(
        name=title,
        start_date=self._tz.localize(datetime),
        location_name=where,
    )
    event.add_source(url)
    event.add_source(mi_events)

    chair_name = metainf['Chair']['txt'].strip()
    if chair_name:
        event.add_participant(chair_name, type='legislator', note='chair')
    else:
        self.warning("No chair found for event '{}'".format(title))

    event.add_participant(metainf['Committee']['txt'],
                          type='committee',
                          note='host')

    agenda = metainf['Agenda']['obj']
    agendas = agenda.text_content().split("\r")

    related_bills = agenda.xpath("//a[contains(@href, 'getObject')]")
    for bill in related_bills:
        # default to the full agenda text, then narrow to the single
        # agenda line that actually mentions this bill
        description = agenda.text_content()
        for a in agendas:
            if bill.text_content() in a:
                description = a

        item = event.add_agenda_item(description)
        item.add_bill(bill.text_content())

    yield event
def transform_parse(self, parsed_form, response):
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
        'general_description': parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }

    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description': parsed_form['client']['client_general_description']
    }

    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant': parsed_form['registration_type'][
            'new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address_one",
                        "value": parsed_form['registrant']['registrant_address_one'],
                    },
                    {
                        "note": "address_two",
                        "value": parsed_form['registrant']['registrant_address_two'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['registrant']['registrant_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant']['registrant_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant']['registrant_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant']['registrant_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['registrant']['registrant_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['registrant']['registrant_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['registrant']['registrant_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['registrant']['registrant_ppb_country'],
                    }
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": parsed_form['client']['client_address'],
                    },
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": parsed_form['client']['client_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": parsed_form['client']['client_ppb_state'],
                    },
                    {
                        "note": "zip",
                        "value": parsed_form['client']['client_ppb_zip'],
                    },
                    {
                        "note": "country",
                        "value": parsed_form['client']['client_ppb_country'],
                    }
                ],
            },
        ],
    }

    # Collect Foreign Entities
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": fe['foreign_entity_address'],
                    },
                    {
                        "note": "city",
                        "value": fe['foreign_entity_city'],
                    },
                    {
                        "note": "state",
                        "value": fe['foreign_entity_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "state",
                        "value": fe['foreign_entity_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": fe['foreign_entity_ppb_country'],
                    }
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras, fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists
    # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}

    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported': parsed_form['datetimes']['effective_date'],
                    'covered_official_position': l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    if _registrant._type == 'organization':
        for l in _lobbyists:
            _registrant.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for l in _lobbyists:
            _registrant_self_employment.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {
                        "note": "address",
                        "value": ao['affiliated_organization_address'],
                    },
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_state'],
                    },
                    {
                        "note": "zip",
                        "value": ao['affiliated_organization_zip'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_country'],
                    }
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {
                        "note": "city",
                        "value": ao['affiliated_organization_ppb_city'],
                    },
                    {
                        "note": "state",
                        "value": ao['affiliated_organization_ppb_state'],
                    },
                    {
                        "note": "country",
                        "value": ao['affiliated_organization_ppb_country'],
                    }
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)
        # remember this org so the de-duplication loop below picks it up
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type
= 'Amended Registration' else: registration_type = 'New Client for Existing Registrant' # Create registration event _event = Event( name="{rn} - {rt}, {cn}".format(rn=_registrant.name, rt=registration_type, cn=_client.name), timezone='America/New_York', location='United States', start_time=datetime.strptime( parsed_form['datetimes']['effective_date'], '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC), classification='registration' ) # add participants _event.add_participant(type=_registrant._type, id=_registrant._id, name=_registrant.name, note="registrant") if _registrant._type == 'person': _event.add_participant(type=_registrant._type, id=_registrant._id, name=_registrant.name, note="registrant") _event.add_participant(type=_client._type, id=_client._id, name=_client.name, note="client") for l in _lobbyists: _event.add_participant(type=l._type, id=l._id, name=l.name, note='lobbyist') for fe in _foreign_entities: _event.add_participant(type=fe._type, id=fe._id, name=fe.name, note='foreign_entity') for ao in _affiliated_organizations: _event.add_participant(type=ao._type, id=ao._id, name=ao.name, note='affiliated_organization') # add agenda item _agenda = _event.add_agenda_item( description='issues lobbied on', ) _agenda['notes'].append( parsed_form['lobbying_issues_detail'] ) for li in parsed_form['lobbying_issues']: if li['general_issue_area'] != '': _agenda.add_subject(li['general_issue_area']) _disclosure.add_disclosed_event( name=_event.name, type=_event._type, classification=_event.classification, id=_event._id ) # add registrant to disclosure's _related and related_entities fields _disclosure.add_registrant(name=_registrant.name, type=_registrant._type, id=_registrant._id) _registrant.add_source( url=_source['url'], note='registrant' ) yield _registrant if _registrant_self_employment is not None: _registrant_self_employment.add_source( url=_source['url'], note='registrant_self_employment' ) yield _registrant_self_employment _client.add_source( url=_source['url'], note='client' ) yield _client _main_contact.add_source( url=_source['url'], note='main_contact' ) yield _main_contact for ao in _affiliated_organizations: ao.add_source( url=_source['url'], note='affiliated_organization' ) yield ao for fe in _foreign_entities: fe.add_source( url=_source['url'], note='foreign_entity' ) yield fe for l in _lobbyists: l.add_source( url=_source['url'], note='lobbyist' ) yield l _event.add_source(**_source) yield _event _disclosure.add_source(**_source) yield _disclosure
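# `combine_dicts` is called repeatedly above but is not defined in this
# section; a minimal sketch of the assumed behavior (recursive merge of
# nested dicts, concatenating lists, with new scalar values winning):
def combine_dicts(old, new):
    merged = dict(old)
    for key, value in new.items():
        if isinstance(merged.get(key), dict) and isinstance(value, dict):
            merged[key] = combine_dicts(merged[key], value)
        elif isinstance(merged.get(key), list) and isinstance(value, list):
            merged[key] = merged[key] + value
        else:
            merged[key] = value
    return merged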
def scrape_events_range(self, start_date, end_date):

    def daterange(start_date, end_date):
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + datetime.timedelta(n)

    for date in daterange(start_date, end_date):
        # the calendar URL template apparently expects zero-based months
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year,
                                                        date.month - 1,
                                                        date.day)
        events = self.extract_events_by_url(calendar_day_url)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = datetime.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour,
                                             minute=time.minute,
                                             second=0,
                                             microsecond=0))
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_source(calendar_day_url)
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

            def is_council(event):
                return event['meeting'] == self.jurisdiction.name

            if is_agenda_available(event):
                agenda_url_template = (AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event)
                                       else AGENDA_FULL_STANDARD_TEMPLATE)
                agenda_url = agenda_url_template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'],
                                                              is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        return raw.split(', ')

                    # match the agenda item's short identifier against the
                    # year-prefixed full identifiers; exactly one must match
                    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [full_id for full_id in full_identifiers
                                         if identifier_regex.match(full_id).group(1) == item['identifier']]
                    a.add_bill(full_identifier)

            yield e
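# The bill-matching step above assumes full identifiers of the form
# "2016.EX22.3" while each agenda item carries only the short suffix
# ("EX22.3"). A standalone illustration (sample values invented):
import re

identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
assert identifier_regex.match('2016.EX22.3').group(1) == 'EX22.3'
assert identifier_regex.match('EX22.3') is None  # the suffix alone does not match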
def scrape_chamber(self, chamber, session):
    today = datetime.date.today()
    start_date = today - datetime.timedelta(days=10)
    end_date = today + datetime.timedelta(days=10)

    if chamber == 'upper':
        chamber_abbrev = 'S'
    else:
        chamber_abbrev = 'H'

    url = ("http://www.legis.iowa.gov/committees/meetings/meetingsList"
           "Chamber?chamber=%s&bDate=%02d/%02d/"
           "%d&eDate=%02d/%02d/%d" % (chamber_abbrev,
                                      start_date.month, start_date.day,
                                      start_date.year,
                                      end_date.month, end_date.day,
                                      end_date.year))

    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    for link in page.xpath("//div[contains(@class, 'meetings')]/table[1]/"
                           "tbody/tr[not(contains(@class, 'hidden'))]"):
        comm = link.xpath("string(./td[2]/a[1]/text())").strip()
        desc = comm + " Committee Hearing"
        location = link.xpath("string(./td[3]/text())").strip()

        when = link.xpath("string(./td[1]/span[1]/text())").strip()
        if 'cancelled' in when.lower() or "upon" in when.lower():
            continue
        if "To Be Determined" in when:
            continue

        if 'AM' in when:
            when = when.split('AM')[0] + " AM"
        else:
            when = when.split('PM')[0] + " PM"

        junk = ['Reception']
        for key in junk:
            when = when.replace(key, '')

        when = re.sub(r"\s+", " ", when).strip()
        if "tbd" in when.lower():
            # OK. This is a partial date of some sort.
            when = datetime.datetime.strptime(when, "%m/%d/%Y TIME - TBD %p")
        else:
            try:
                when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            except ValueError:
                try:
                    when = datetime.datetime.strptime(when, "%m/%d/%Y %I %p")
                except ValueError:
                    self.warning('error parsing timestamp %s', when)
                    continue

        event = Event(name=desc,
                      description=desc,
                      start_date=self._tz.localize(when),
                      location_name=location)
        event.add_source(url)
        event.add_participant(comm, note='host', type='committee')

        yield event
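# The nested try/except above walks through candidate strptime formats by
# hand; a hypothetical helper (not part of the scraper) that tries formats
# in order would express the same fallback chain:
import datetime

def try_strptime(value, formats):
    for fmt in formats:
        try:
            return datetime.datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None  # caller decides how to handle an unparseable string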
def scrape_chamber(self, chamber):
    """
    Scrape upper or lower committee agendas
    """
    # session = self.latest_session()
    # since we are scraping only latest_session
    # session_id = self.session_metadata.session_id_meta_data[session]

    # could use &ShowAll=ON, though that doesn't seem to work
    url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % self._chamber_short[chamber]
    html_ = self.get(url).text
    doc = html.fromstring(html_)
    if chamber == 'upper':
        event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/'
                                'tr/td/table/tr/td/table')[0]
    else:
        event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                '/td/table/tr/td/table/tr/td/table')[0]
    for row in event_table.xpath('tr')[2:]:
        # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
        # HTML Document, PDF Document for house
        # Agenda Date, Committee, Revised, Cancelled, Time, Room,
        # HTML Document, PDF Document for senate
        text = [x.text_content().strip() for x in row.xpath('td')]
        when, committee = text[0:2]
        if chamber == 'upper':
            time, room = text[4:6]
            link = row[6].xpath('string(a/@href)')
        else:
            time, room = text[5:7]
            link = row[7].xpath('string(a/@href)')
        if 'NOT MEETING' in time or 'CANCELLED' in time:
            continue
        time = re.match(r'(\d+:\d+ (A|P))', time)
        if time:
            when = "%s %sM" % (text[0], time.group(0))
            when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
        else:
            when = text[0]
            when = datetime.datetime.strptime(when, '%m/%d/%Y')

        title = "Committee Meeting:\n%s %s %s\n" % (
            self._chamber_long[chamber], committee, room)
        agenda_info = self.parse_agenda(chamber, link)

        description = agenda_info['description']
        member_list = agenda_info['member_list']
        related_bills = agenda_info['related_bills']

        # previous-API form of this call:
        # event = Event(session, when, 'committee:meeting', title,
        #               location=room, link=link, details=description,
        #               related_bills=related_bills)
        event = Event(location_name=room,
                      start_date=self._tz.localize(when),
                      name=title,
                      description=description,
                      )
        event.add_participant(committee, type='committee', note='host')
        event.participants.extend(member_list)
        event.add_source(url)
        event.add_source(link)
        yield event
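# `_chamber_short` and `_chamber_long` are defined elsewhere on this
# scraper; judging from how they are interpolated above, they are
# presumably mappings along these lines (values are assumptions):
_chamber_short = {'upper': 'S', 'lower': 'H'}
_chamber_long = {'upper': 'Senate', 'lower': 'House'}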
def scrape_upper(self):
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)

    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for i, day in enumerate(days[1:]):
        if i % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day + ", " + str(datetime.datetime.now().year)
        else:
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day)
            comm = ''
            for j, event in enumerate(events[1:]):
                if j % 2 == 0:
                    comm = event.strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?)\n  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', event).groups()
                    except AttributeError:
                        continue

                    time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)

                    location = location.strip()

                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'

                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')

                    for line in description.split('\n'):
                        related_bill = re.search(r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)

                    yield event
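# re.split with a capturing group interleaves the captured headers with
# the text between them, which is why the loops above treat even-indexed
# chunks as dates (or committee names) and odd-indexed chunks as the body
# that follows them. Illustration with invented input:
import re

sample = "preamble Monday, June 6 body A Tuesday, June 7 body B"
chunks = re.split(r'(\w+day, \w+ \d{1,2})', sample)
# chunks[1:] == ['Monday, June 6', ' body A ', 'Tuesday, June 7', ' body B']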
def scrape_meeting(self, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    title = page.xpath("//a[@id='linkTitle']//text()")[0]
    date = page.xpath("//span[@id='lDate']/text()")[0]
    time = page.xpath("//span[@id='lTime']/text()")[0]
    location = page.xpath("//span[@id='lLocation']/text()")[0]

    substs = {
        "AM": ["A.M.", "a.m."],
        "PM": ["P.M.", "p.m.", "Noon"],
    }
    for key, values in substs.items():
        for value in values:
            time = time.replace(value, key)

    # Make sure there's a space between the time's minutes and its AM/PM
    if re.search(r'(?i)\d[AP]M$', time):
        time = time[:-2] + " " + time[-2:]

    if re.search("UPON ADJ|TBA", ' '.join(time.split()).upper()):
        all_day = True
        when = datetime.datetime.strptime(date, "%B %d, %Y")
    else:
        all_day = False
        when = datetime.datetime.strptime("%s %s" % (date, time),
                                          "%B %d, %Y %I:%M %p")

    description = "Meeting on %s of the %s" % (date, title)
    chambers = {"house": "lower",
                "senate": "upper",
                "joint": "legislature"}
    for chamber_ in chambers.keys():
        if chamber_ in title.lower():
            break
    else:
        return

    event = Event(name=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  all_day=all_day)
    event.add_source(url)
    event.add_participant(title, note='host', type='committee')

    trs = iter(page.xpath("//tr[@valign='top']"))
    next(trs)  # skip the first row
    for tr in trs:
        try:
            _, _, bill, whom, descr = tr.xpath("./td")
        except ValueError:
            continue
        bill_title = bill.text_content()
        if "S" in bill_title or "H" in bill_title:
            item = event.add_agenda_item(descr.text_content())
            item.add_bill(bill_title)
        else:
            continue

    yield event
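# The normalization above converts the page's varied AM/PM spellings into
# something strptime accepts; a standalone walk-through (input invented):
import re

time = "1:30p.m."
substs = {"AM": ["A.M.", "a.m."], "PM": ["P.M.", "p.m.", "Noon"]}
for key, values in substs.items():
    for value in values:
        time = time.replace(value, key)
if re.search(r'(?i)\d[AP]M$', time):
    time = time[:-2] + " " + time[-2:]
assert time == "1:30 PM"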
def scrape_agenda(self, url):
    page = self.lxmlize(url)

    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if not date_time:
        return
    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf['DATE:']
    time = metainf['TIME:']
    where = metainf['PLACE:']

    # check for duration in time
    if ' - ' in time:
        start, end = time.split(' - ')
        am_pm_srch = re.search('(?i)(am|pm)', end)
        if am_pm_srch:
            time = ' '.join([start, am_pm_srch.group().upper()])
        else:
            time = start

    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M",
    ]

    event_desc = "Meeting Notice"
    if 'Rise' in time:
        when_str = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        when_str = "%s %s" % (date, time)
    if "CANCELLED" in when_str.upper():
        return

    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        when_str = when_str.replace(trans, transtable[trans])
    when_str = when_str.strip()

    when = None
    for fmt in fmts:
        try:
            when = dt.datetime.strptime(when_str, fmt)
            break
        except ValueError:
            continue

    event = Event(
        name=event_desc,
        start_date=self._tz.localize(when),
        location_name=where,
    )
    event.add_source(url)

    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib['href']
        event.add_document(bill.text_content(), bill_ft,
                           media_type="application/pdf")
        root = bill.xpath('../../*')
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = bill.getparent().getparent().getparent().getnext().getnext().text_content()
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        item = event.add_agenda_item(descr)
        item.add_bill(bill.text_content())

    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    event.add_participant(committee, type='committee', note='host')
    yield event
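# `replace` is referenced above but defined elsewhere, presumably a
# module-level table mapping junk substrings in the scraped bill text to
# cleaner values; a hypothetical sketch (contents invented):
replace = {
    "\xa0": " ",  # non-breaking spaces from the HTML
    "SUB A": "",
    "as amended": "",
}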
def scrape(self):
    method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
    self.events = self.api(method)

    seen = set()

    for event in self.events:
        begin = self._date_parse(event.pop('when'))
        end = self._date_parse(event.pop('end'))
        all_day = event.pop('all_day', False)

        e = Event(name=event.pop('description'),
                  classification=event.pop('type'),
                  location_name=event.pop('location'),
                  timezone=event.pop('timezone'),
                  start_time=begin,
                  end_time=end,
                  all_day=all_day)

        if len(e.name) >= 300:
            e.name = e.name[:290]

        if len(e.location['name']) >= 100:
            e.location['name'] = e.location['name'][:90]

        composite_key = (e.name, e.description, e.start_time)
        if composite_key in seen:
            print("Duplicate found: %s/%s/%s" % composite_key)
            continue
        seen.add(composite_key)

        for source in event.pop('sources'):
            if 'retrieved' in source:
                source.pop('retrieved')
            e.add_source(**source)

        if e.sources == []:
            continue

        ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                  'notes', '+location_url', 'session', 'id', '+chamber',
                  '+agenda', '+cancelled', '+media_contact', '+contact',
                  '+details']
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.
        for i in ignore:
            if i in event:
                event.pop(i)

        for link in ['+link', 'link']:
            if link in event:
                e.add_source(url=event.pop(link))

        for p in event.pop('participants', []):
            type_ = {
                "committee": "organization",
                "legislator": "person",
                None: None,
            }[p.get('participant_type')]
            if type_ is None:
                # Garbage data.
                continue
            e.add_participant(name=p['participant'],
                              note=p['type'],
                              type=type_)

        for b in event.pop('related_bills', []):
            item = e.add_agenda_item(
                b.pop('description', b.pop('+description', None)))
            item.add_bill(bill=b['bill_id'],
                          note=b.pop('type', b.pop('+type', None)))

        seen_documents = set()
        for document in event.pop('documents', []):
            if document['url'] in seen_documents:
                print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                    document['url'], document['name']))
                continue
            seen_documents.add(document['url'])
            e.add_document(url=document['url'],
                           note=document['name'])

        assert event == {}, "Unknown fields: %s" % (", ".join(event.keys()))

        yield e
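# `self._date_parse` is not shown in this section; given that `when` and
# `end` arrive as strings from the events API, a plausible sketch (an
# assumption, not the actual implementation) is a None-safe wrapper:
import dateutil.parser

def _date_parse(raw):
    return dateutil.parser.parse(raw) if raw else None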
def transform_parse(self, parsed_form, response):
    _source = {
        "url": response.url,
        "note": json.dumps({'office_name': parsed_form['office_name'],
                            'termination_date': parsed_form['termination_date'],
                            'lobbying_eligibility_date': parsed_form['lobbying_eligibility_date'],
                            'name': parsed_form['employee_name']},
                           sort_keys=True)
    }

    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['termination_date'], '%Y-%m-%d').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['termination_date'], '%Y-%m-%d').replace(tzinfo=UTC),
        classification="post_employment",
    )
    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)
    _disclosure.extras['office_name'] = parsed_form["office_name"]

    _registrant = Person(
        name=parsed_form['employee_name'],
        source_identified=True
    )

    _office = Organization(
        name=parsed_form['office_name'],
        classification='office',
        parent_id=self.jurisdiction._house,
        source_identified=True
    )
    _office.add_member(
        _registrant,
        role='employee',
        label='employee for {n}'.format(n=_office.name),
        end_date=parsed_form['termination_date'],
    )

    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _post_employment_event = Event(
        name="{rn} - {rt}, {o} (via {a})".format(rn=_registrant.name,
                                                 rt="post-employment period",
                                                 o=_office.name,
                                                 a="House Clerk"),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['termination_date'], '%Y-%m-%d').replace(tzinfo=UTC),
        end_time=datetime.strptime(
            parsed_form['lobbying_eligibility_date'], '%Y-%m-%d').replace(tzinfo=UTC),
        classification='post_employment'
    )
    _post_employment_event.add_participant(type=_registrant._type,
                                           id=_registrant._id,
                                           name=_registrant.name,
                                           note="registrant")
    _post_employment_event.extras['office_name'] = parsed_form["office_name"]

    _disclosure.add_disclosed_event(
        name=_post_employment_event.name,
        type=_post_employment_event._type,
        classification=_post_employment_event.classification,
        id=_post_employment_event._id
    )

    _post_employment_event.add_source(**_source)
    yield _post_employment_event

    _office.add_source(**_source)
    yield _office

    _registrant.add_source(**_source)
    yield _registrant

    _disclosure.add_source(**_source)
    yield _disclosure
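# `UTC` as used with .replace(tzinfo=UTC) above is presumably pytz.UTC
# (or datetime.timezone.utc); a quick check of the pattern:
from datetime import datetime
from pytz import UTC

parsed = datetime.strptime('2015-01-31', '%Y-%m-%d').replace(tzinfo=UTC)
assert parsed.isoformat() == '2015-01-31T00:00:00+00:00'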