def parse_div(self, row, chamber, com):
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)

    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )

    event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

    for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)

    for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)
        event.add_document(
            description,
            item.xpath('@href')[0],
            media_type="application/pdf",
            on_duplicate="ignore"
        )

    for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                          '[./div[@class="col-xs-1 Item"]]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)
        bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)

    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            'Video of Hearing',
            video[0].xpath('@href')[0],
            'text/html'
        )

    if 'subcommittee' in title.lower():
        subcom = title.split('-')[0].strip()
        event.add_participant(
            subcom,
            type='committee',
            note='host',
        )
    else:
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
    yield event
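parse_div() relies on a parse_gcal() helper that is not shown in this section. A minimal sketch, assuming cal_link is a standard Google Calendar render URL whose 'text', 'location', and 'dates' query parameters carry the event details; the parameter names and the UTC 'Z' handling are assumptions, not verified markup.

# Minimal sketch of the assumed parse_gcal() helper.
from urllib.parse import urlparse, parse_qs
import datetime
import pytz

def parse_gcal(self, url):
    params = parse_qs(urlparse(url).query)
    title = params['text'][0]
    location = params['location'][0]
    start, end = params['dates'][0].split('/')

    def _parse(stamp):
        naive = datetime.datetime.strptime(stamp.rstrip('Z'), '%Y%m%dT%H%M%S')
        # Assume UTC when the stamp carries a trailing 'Z'; otherwise local time.
        return pytz.utc.localize(naive) if stamp.endswith('Z') else self._tz.localize(naive)

    return title, location, _parse(start), _parse(end)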
def scrape_house_weekly_schedule(self):
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)

    meeting_rows = page.xpath('//table[@id = "table229"]/tr')

    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
        and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
        and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]

    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except IndexError:
            # Sometimes we have a dead link. This is only on dead entries,
            # and indexing an empty xpath result raises IndexError.
            continue

        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()

        if "@" in meeting_string:
            continue  # Contains no time data.

        date, time, location = ([s.strip() for s in meeting_string.split(',')
                                 if s] + [None] * 3)[:3]

        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')

        self.logger.debug(location)

        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)

        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)

        event = Event(
            name=description,
            # 'when' is already localized above; localizing it a second
            # time would make pytz raise ValueError.
            start_date=when,
            location_name=location)
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(note='Agenda', url=guid, text='agenda',
                           media_type='application/pdf')

        yield event
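Nearly every scraper in this section calls a lxmlize() helper that is not defined here. A plausible sketch, assuming the scraper base class exposes a get() HTTP method (as scrapelib-based scrapers do):

# Plausible sketch of the shared lxmlize() helper.
import lxml.html

def lxmlize(self, url):
    response = self.get(url)
    page = lxml.html.fromstring(response.text)
    page.make_links_absolute(url)  # so extracted @href values are directly usable
    return page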
def scrape(self, chamber=None):
    URL = "http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas"
    doc = self.lxmlize(URL)
    events = doc.xpath("//item")

    for info in events:
        title_and_date = info.xpath("title/text()")[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(
            name=title,
            start_date=self._tz.localize(
                datetime.datetime.strptime(when, "%b %d, %Y")),
            location_name="State Capitol",
        )
        event.add_source(URL)

        url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type="committee", note="host")

        documents = doc.xpath(".//td")
        for document in documents:
            onclick = document.xpath("@onclick")
            if not onclick:
                # Not every cell carries an onclick attribute; indexing an
                # empty result would raise IndexError.
                continue
            url = re.search(r"(http://.*?pdf)", onclick[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath("text()")[0],
                url=url,
                media_type="application/pdf",
            )
            for bill in onclick:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item("Bill up for discussion")
                    item.add_bill(bill_name)

        yield event
def scrape(self):
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception("expected a single event description, got %d" %
                            len(descr))
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        # The page lists local times, so localize with the scraper's
        # timezone rather than stamping the naive datetime as UTC.
        when = tz.localize(when)

        event = Event(name=descr,
                      start_time=when,
                      classification='committee-meeting',
                      description=descr,
                      location_name=where,
                      timezone=tz.zone)

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # Look up the committee by its short code; the original
                # passed the literal string "committee" as the key, so the
                # lookup always fell through to the default.
                committee = self.short_ids.get(committee, {
                    "chamber": "unknown",
                    "name": committee
                })
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(bill['bill_id'], note=bill['type'])
        yield event
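get_short_codes() and the module-level URL constant are defined elsewhere in this scraper. A hedged sketch of the helper, in which the listing URL and table layout are assumptions rather than the verified source:

# Hedged sketch of get_short_codes(): populate scraper.short_ids with a
# {code: {"chamber": ..., "name": ...}} lookup. SHORT_CODES_URL and the
# table layout here are hypothetical.
def get_short_codes(scraper):
    scraper.short_ids = {}
    listing = scraper.lxmlize(SHORT_CODES_URL)  # hypothetical constant
    for row in listing.xpath("//table//tr")[1:]:
        cells = [td.text_content().strip() for td in row.xpath("./td")]
        if len(cells) < 3:
            continue
        code, chamber, name = cells[:3]
        scraper.short_ids[code] = {"chamber": chamber.lower(), "name": name}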
def scrape(self):
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        name = " ".join(
            x.strip()
            for x in doc.xpath('//div[@class="schedule"]//text()')
            if x.strip()
        )

        # Skip events with no name
        if not name:
            continue

        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    info.xpath('span[@class="col02"]/text()')[0],
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0].title()
        )

        event.add_participant(
            info.xpath('span[@class="col01"]/text()')[0].title(),
            type='committee',
            note='host',
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        event.add_source(event_url.replace(" ", "%20"))

        yield event
def scrape(self):
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            raise Exception
        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)
        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
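scrape_participants() and the module-level calurl constant are likewise assumed from elsewhere in this scraper. A hedged sketch of the helper, whose xpath is hypothetical; all the caller uses is the 'name' key:

# Hedged sketch of scrape_participants(): fetch the committee page and
# return its members as dicts with a 'name' key. The xpath is an assumption.
def scrape_participants(self, url):
    page = self.lxmlize(url)
    names = page.xpath("//a[contains(@href, 'member')]/text()")
    return [{'name': name.strip()} for name in names if name.strip()]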
def scrape(self):
    for event in self.events():
        e = Event(name=event["EventBodyName"],
                  start_time=event["start"],
                  timezone=self.TIMEZONE,
                  description='',
                  location_name=event["EventLocation"],
                  status=event["status"])

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        e.add_participant(name=event["EventBodyName"],
                          type="organization")

        # Real web and api sources are added below; the original also added
        # a leftover 'foo' placeholder source, which is dropped here.
        meeting_detail_web = (
            self.WEB_URL +
            '/MeetingDetail.aspx?ID={EventId}&GUID={EventGuid}'.format(**event))
        if requests.head(meeting_detail_web).status_code == 200:
            e.add_source(meeting_detail_web, note='web')
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        yield e
def scrape_event(self, row):
    date_td = row.xpath('td[1]')[0]
    info_td = row.xpath('td[2]')[0]

    date = date_td.xpath('b')[0].text.strip()
    time = date_td.xpath('b/following-sibling::text()')[0].strip()

    date_and_time = "{} {}".format(date, time)
    start_date = datetime.datetime.strptime(
        date_and_time, '%m/%d/%y %I:%M %p')

    title = info_td.xpath('font[1]/strong')[0].text.strip()

    all_text = info_td.xpath('descendant-or-self::*/text()')
    notes = (line.strip() for line in all_text if line.strip())
    notes = list(notes)
    # Skip the first line, which is the title
    notes = notes[1:]
    # Split out the address
    address = notes[0]
    notes = notes[1:]
    # The rest just becomes the description
    notes = "\n".join(notes)

    event = Event(
        start_date=self._TZ.localize(start_date),
        name=title,
        location_name=address,
        description=notes
    )

    event.add_source(self.URL)

    if info_td.xpath('a[contains(font/text(),"agenda")]'):
        agenda_url = info_td.xpath('a/@href')[0]
        event.add_document(
            "Agenda",
            url=agenda_url
        )

    yield event
def scrape(self, chamber=None, session=None):
    """
    Scrape the events data from all dates from the sc meetings page,
    then create and yield the events objects from the data.
    :param chamber:
    :param session:
    :return: yielded Event objects
    """
    chambers = {
        'upper': {'name': 'Senate', 'title': 'Senator'},
        'lower': {'name': 'House', 'title': 'Representative'},
    }
    if chamber == 'other':
        return

    if chamber is None:
        self.info(
            'no chamber specified, using Joint Committee Meeting Schedule')
        events_url = 'http://www.scstatehouse.gov/meetings.php'
    else:
        events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % (
            chambers[chamber]['name'].upper()[0])
    page = self.get_page_from_url(events_url)

    meeting_year = page.xpath(
        '//h2[@class="barheader"]/span')[0].text_content()
    meeting_year = re.search(
        r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})',
        meeting_year).group(1)

    dates = page.xpath("//div[@id='contentsection']/ul")

    for date in dates:
        date_string = date.xpath('span')
        if len(date_string) == 1:
            date_string = date_string[0].text_content()
        else:
            continue

        # If a event is in the next calendar year, the date_string
        # will have a year in it
        if date_string.count(",") == 2:
            event_year = date_string[-4:]
            date_string = date_string[:-6]
        elif date_string.count(",") == 1:
            event_year = meeting_year
        else:
            # The original chained .format() onto the raised exception
            # object, which never runs; format the message instead.
            raise AssertionError(
                "This is not a valid date: '{}'".format(date_string))

        for meeting in date.xpath('li'):
            time_string = meeting.xpath('span')[0].text_content()

            if time_string == 'CANCELED' or len(
                    meeting.xpath(
                        './/span[contains(text(), "CANCELED")]')) > 0:
                continue

            time_string = normalize_time(time_string)
            date_time = datetime.datetime.strptime(
                event_year + ' ' + date_string + ' ' + time_string,
                "%Y %A, %B %d %I:%M %p")

            date_time = self._tz.localize(date_time)
            meeting_info = meeting.xpath(
                'br[1]/preceding-sibling::node()')[1]
            location, description = re.search(
                r'-- (.*?) -- (.*)', meeting_info).groups()

            # if re.search(r'committee', description, re.I):
            #     meeting_type = 'committee:meeting'
            # else:
            #     meeting_type = 'other:meeting'

            event = Event(
                name=description,  # Event Name
                start_time=date_time,  # When the event will take place
                timezone=self._tz.zone,  # the local timezone for the event
                location_name=location)  # Where the event will be

            event.add_source(events_url)

            agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")
            if agenda_url:
                agenda_url = agenda_url[0].attrib['href']
                event.add_source(agenda_url)
                event.add_document(note="Agenda",
                                   url=agenda_url,
                                   media_type="application/pdf")

                agenda_page = self.get_page_from_url(agenda_url)

                for bill in agenda_page.xpath(
                        ".//a[contains(@href,'billsearch.php')]"):
                    # bill_url = bill.attrib['href']
                    bill_id = bill.text_content().replace('.', '').replace(
                        ' ', '')
                    # bill_description = self.get_bill_description(bill_url)

                    event.add_bill(bill_id)

            yield event
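normalize_time() is assumed from a shared utils module. A sketch of what it plausibly does, coercing strings such as "2:30 p.m." or "Noon" into the "%I:%M %p" shape the strptime call above expects; the exact special cases are assumptions:

# Hedged sketch of the assumed normalize_time() helper.
import re

def normalize_time(time_string):
    time_string = time_string.lower().strip().replace('.', '')  # "p.m." -> "pm"
    if 'noon' in time_string:
        time_string = '12:00 pm'
    # Ensure exactly one space before the meridian: "2:30pm" -> "2:30 pm"
    time_string = re.sub(r'\s*(am|pm)\b', r' \1', time_string)
    return time_string.upper()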
def scrape(self, window=None):
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None
    events = self.events(since_datetime=n_days_ago)

    service_councils = set(sc['BodyId'] for sc in self.search(
        '/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75'))

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        elif event['EventBodyId'] in service_councils:
            # Don't scrape service council or service council public hearing events.
            self.info('Skipping event {0} for {1}'.format(
                event['EventId'], event['EventBodyName']))
            continue
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if web_event.has_ecomment:
            self.info('Adding eComment link {0} from {1}'.format(
                web_event['eComment'], web_event['Meeting Details']['url']))
            e.extras['ecomment'] = web_event['eComment']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    agenda_number = item["EventItemAgendaNumber"]
                    note = "Agenda number, {}".format(agenda_number)
                    agenda_item['notes'].append(note)

                    agenda_item['extras']['agenda_number'] = agenda_number

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
            # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
            # cleans the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventAgendaLastPublishedUTC']).date())

        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventMinutesLastPublishedUTC']).date())
        elif web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        else:
            approved_minutes = self.find_approved_minutes(event)
            if approved_minutes:
                e.add_document(
                    note=approved_minutes['MatterAttachmentName'],
                    url=approved_minutes['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        approved_minutes['MatterAttachmentLastModifiedUtc']
                    ).date())

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow for duplicate media links,
            # so we'll ignore the second media link if it's
            # the same as the first media link.
            #
            # Because of the way that event['audio'] is created,
            # the first audio link is always English and the
            # second is always Spanish.
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=None):
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    for event, web_event in self.events(n_days_ago):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links
        e.extras = {'guid': event['EventGuid']}

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the agenda minutes
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
        if web_event['Audio'] != 'Not\xa0available':
            try:
                redirect_url = self.head(
                    web_event['Audio']['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain the
                # location of the audio file. Skip these events, and retry
                # on next scrape.
                continue

            e.add_media_link(note=web_event['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event['Meeting Details'] != 'Meeting\xa0details':
            if requests.head(web_event['Meeting Details']
                             ['url']).status_code == 200:
                e.add_source(web_event['Meeting Details']['url'], note='web')
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, session=None, start=None, end=None):
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # testimony url, we'll need it later in a loop
    # testimony query looks gnarly but breaks down to:
    # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
    # $orderby: LastName,FirstName,Organization
    # $expand: Request
    # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
    #          PresentedDate,FileSize,Topic
    testimony_url_base = (
        "http://legislature.maine.gov/backend/"
        "breeze/data/CommitteeTestimony?"
        "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
        "%20(Request%2FLegislature%20eq%20{})"
        "&$orderby=LastName%2CFirstName%2COrganization&"
        "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
        "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
    )

    if start is None:
        start_date = datetime.datetime.now().isoformat()
    else:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        start_date = start_date.isoformat()

    # default to 30 days if no end
    if end is None:
        dtdelta = datetime.timedelta(days=30)
        end_date = datetime.datetime.now() + dtdelta
        end_date = end_date.isoformat()
    else:
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
        end_date = end_date.isoformat()

    bills_by_event = {}

    bills_url = ("http://legislature.maine.gov/backend/breeze/data/"
                 "getCalendarEventsBills?startDate={}&endDate={}")
    bills_url = bills_url.format(start_date, end_date)
    page = json.loads(self.get(bills_url).content)

    for row in page:
        bills_by_event.setdefault(row["EventId"], [])
        bills_by_event[row["EventId"]].append(row)

    # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
    url = ("http://legislature.maine.gov/backend/breeze/data/"
           "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true")
    url = url.format(start_date, end_date)

    page = json.loads(self.get(url).content)

    for row in page:
        if row["Cancelled"] is True or row["Postponed"] is True:
            continue

        start_date = self._TZ.localize(
            dateutil.parser.parse(row["FromDateTime"]))
        end_date = self._TZ.localize(
            dateutil.parser.parse(row["ToDateTime"]))

        name = row["CommitteeName"]
        if name is None:
            name = row["Host"]

        address = row["Location"]
        address = address.replace(
            "Cross Building",
            "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
        )
        address = address.replace(
            "State House",
            "Maine State House, 210 State St, Augusta, ME 04330")

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=name,
            location_name=address,
        )

        event.add_source(
            "http://legislature.maine.gov/committee/#Committees/{}".format(
                row["CommitteeCode"]))

        if bills_by_event.get(row["Id"]):
            for bill in bills_by_event[row["Id"]]:
                description = "LD {}: {}".format(bill["LD"], bill["Title"])
                agenda = event.add_agenda_item(description=description)
                agenda.add_bill("LD {}".format(bill["LD"]))

                if bill["TestimonyCount"] > 0:
                    test_url = testimony_url_base.format(
                        bill["PaperNumber"], session)
                    test_page = json.loads(self.get(test_url).content)

                    for test in test_page:
                        title = "{} {} - {}".format(
                            test["FirstName"],
                            test["LastName"],
                            test["Organization"],
                        )
                        if test["NamePrefix"] is not None:
                            title = "{} {}".format(test["NamePrefix"], title)

                        test_url = (
                            "http://legislature.maine.gov/backend/app/services"
                            "/getDocument.aspx?doctype=test&documentId={}".format(
                                test["Id"]))

                        if test["FileType"] == "pdf":
                            media_type = "application/pdf"
                            event.add_document(note=title,
                                               url=test_url,
                                               media_type=media_type)

        yield event
def scrape(self, start_time=None):
    if start_time is None:
        start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

    dupes = {}
    uniq = {}
    bad_ids = []

    for i, hearing in enumerate(self.congressional_hearings(start_time)):
        package_id = hearing['packageId']
        try:
            package_num, = re.findall(r'\d+$', package_id)
        except ValueError:
            bad_ids.append(package_id)
            continue

        # For appropriations hearings, the committees tend to
        # publish portions of the hearings as they are completed,
        # and then the final hearing are usually compiled,
        # printed, and added to the repository at the request of
        # the Committee.
        #
        # packages with 8 digits after hrg are the in-process
        # version
        #
        # There could be some time between the in-process and
        # final packages. Publication of hearings is the purview
        # of the committee.
        #
        # https://github.com/usgpo/api/issues/21#issuecomment-435926223
        if len(package_num) == 8:
            continue

        mods_link = hearing['download']['modsLink']
        response = self.get(mods_link)
        mods = xmltodict.parse(response.content)
        extension = collections.ChainMap(*mods['mods']['extension'])

        # 'boo' is just a sentinel that can never equal 'ERRATA'.
        granule_class = extension.get('granuleClass', 'boo')
        if granule_class == 'ERRATA':
            continue

        meeting_type = self._meeting_type(extension)
        if meeting_type is None:
            continue

        held_date = extension['heldDate']
        if isinstance(held_date, list):
            start_date = min(held_date)
        else:
            start_date = held_date

        event = Event(name=self._title(mods),
                      start_date=start_date,
                      classification=meeting_type,
                      location_name='unknown')
        if not event.name:
            continue

        if 'number' in extension:
            hearing_number = '{docClass} {congress}-{number}'.format(
                **extension)
            self.info(hearing_number)
            event.extras['hearing_number'] = hearing_number

        for committee_d in self._unique(extension.get('congCommittee', [])):
            names = committee_d['name']
            committee_name = self._name_type(names, 'authority-standard')
            if committee_name is None:
                committee_name = self._name_type(names, 'authority-short')

            if committee_d['@chamber'] == 'H':
                committee_name = 'House ' + committee_name
            elif committee_d['@chamber'] == 'S':
                committee_name = 'Senate ' + committee_name

            try:
                thomas_id = committee_d['@authorityId'].upper()
            except KeyError:
                thomas_id = None

            sub_committees = self._subcommittees(committee_d)
            if sub_committees:
                for sub_committee_d in sub_committees:
                    sub_committee_name = sub_committee_d['name']['#text']
                    sub_committee_name = sub_committee_name.strip(
                        string.punctuation)
                    sub_committee_id = _make_pseudo_id(
                        name=sub_committee_name,
                        parent__identifiers__identifier=thomas_id)
                    ret = {
                        "name": sub_committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id": sub_committee_id,
                    }
                    event.participants.append(ret)
            else:
                if thomas_id:
                    ret = {
                        "name": committee_name,
                        "entity_type": 'organization',
                        "note": 'host',
                        "organization_id": _make_pseudo_id(
                            identifiers__identifier=thomas_id)
                    }
                    event.participants.append(ret)
                else:
                    event.add_committee(committee_name, note='host')

        links = mods['mods']['location']['url']
        for link in self._unique(links):
            if link['@displayLabel'] == 'Content Detail':
                event.add_source(link['#text'], note='web')
            elif link['@displayLabel'] == 'HTML rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='text/html')
            elif link['@displayLabel'] == 'PDF rendition':
                event.add_document('transcript', link['#text'],
                                   media_type='application/pdf')

        event.add_source(mods_link, note='API')

        self._unique_event(uniq, event, dupes)

    self._house_docs(uniq)

    for event in uniq.values():
        yield event

    with open('bad_ids.txt', 'w') as f:
        for id in bad_ids:
            f.write(id + '\n')
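The _unique() helper used above is not shown. A likely sketch, based on how xmltodict represents repeated elements (a dict for a single child, a list for several); the dedup-by-equality detail is an assumption:

# Likely sketch of _unique(): normalize xmltodict's single-dict-or-list
# shape to a sequence and drop exact duplicates while preserving order.
def _unique(self, element):
    if isinstance(element, dict):
        element = [element]
    seen = []
    for item in element:
        if item not in seen:
            seen.append(item)
            yield item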
def _parse_house_floor_xml_legislative_activity(self, xml):
    """
    Parses XML string of House floor updates and yields them in loop.

    @param xml: XML of floor update
    @type xml: string
    @return: complete Event object
    @rtype: Event
    """
    tree = self._xml_parser(xml)

    congress = tree.xpath('.//legislative_congress')[0].get('congress')

    house_committees = self._get_current_house_committee_names()
    for fa in tree.xpath('.//floor_action'):
        fa_text = fa.xpath('.//action_description')[0].xpath('string()')

        eastern = pytz.timezone('US/Eastern')
        dt = datetime.datetime.strptime(
            fa.xpath('action_time')[0].get('for-search'), '%Y%m%dT%H:%M:%S')
        event = Event('House Floor Update on {0} at {1}.'.format(
                          dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
                      eastern.localize(dt).astimezone(pytz.utc),
                      'US/Eastern',
                      '',
                      description=fa_text,
                      classification='floor_update')

        event.set_location(
            "East Capitol Street Northeast & First St SE, Washington, DC 20004",
            note='House Floor', url='http://www.house.gov',
            coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

        event.add_source(
            self._house_floor_src_url(
                date_str=tree.xpath('.//legislative_day')[0].get('date')),
            note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

        event.extras['act-id'] = fa.get('act-id')
        event.extras['unique-id'] = fa.get('unique-id')

        # bills
        ai_b = event.add_agenda_item(description='Bills referenced by this update.')
        for bill in fa.xpath(".//a[@rel='bill']"):
            bill_name = bill.xpath('string()')
            ai_b.add_bill(
                bill_name,
                id=make_pseudo_id(
                    identifier=bill_code_to_id(bill_name), congress=congress),
                note="Bill was referenced on the House floor.")

        # publaws
        ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
        for law in fa.xpath(".//a[@rel='publaw']"):
            detail_url = '/'.join(
                law.get('href').split('/')[0:-2]) + '/content-detail.html'
            ai_p.add_bill(
                law.xpath('string()'),
                id=make_pseudo_id(
                    **self._public_law_detail_scraper(url=detail_url)),
                note='Law was referenced on the House floor.')

        # votes
        ai_v = event.add_agenda_item(description='Votes referenced by this update.')
        for vote in fa.xpath(".//a[@rel='vote']"):
            vote_name = vote.xpath('string()')
            ai_v.add_vote(
                vote_name,
                id=make_pseudo_id(
                    identifier=vote_code_to_id(vote_name), congress=congress),
                note='Vote was referenced on the House floor.')

        # reports
        for report in fa.xpath(".//a[@rel='report']"):
            event.add_document('Document referenced by this update.',
                               report.get('href'), media_type='text/html')

        for name in house_committees:
            if name.replace('House ', '') in fa_text:
                event.add_committee(name, id=make_pseudo_id(name=name))

        # TODO: identify legislators and add them as participants?

        yield event
def scrape_agenda(self, url):
    page = self.lxmlize(url)

    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return

    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf["DATE:"]
    time = metainf["TIME:"]
    where = metainf["PLACE:"]

    # check for duration in time
    if " - " in time:
        start, end = time.split(" - ")
        am_pm_srch = re.search("(?i)(am|pm)", end)
        if am_pm_srch:
            time = " ".join([start, am_pm_srch.group().upper()])
        else:
            time = start

    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M"
    ]

    # Use a name that doesn't shadow the datetime module.
    event_desc = "Meeting Notice"
    if "Rise" in time:
        when_str = date
        event_desc = "Meeting Notice: Starting at {}".format(time)
    else:
        when_str = "%s %s" % (date, time)
    if "CANCELLED" in when_str.upper():
        return

    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        when_str = when_str.replace(trans, transtable[trans])
    when_str = when_str.strip()

    when = None
    for fmt in fmts:
        try:
            when = dt.datetime.strptime(when_str, fmt)
            break
        except ValueError:
            continue
    if when is None:
        # None of the known formats matched; the original would have
        # crashed trying to localize the unparsed string.
        return

    event = Event(
        name=event_desc,
        start_date=self._tz.localize(when),
        location_name=where)
    event.add_source(url)

    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib["href"]
        event.add_document(bill.text_content(), bill_ft,
                           media_type="application/pdf")
        root = bill.xpath("../../*")
        root = [x.text_content() for x in root]
        bill_id = "".join(root)

        if "SCHEDULED FOR" in bill_id:
            continue

        descr = (bill.getparent().getparent().getparent().getnext().
                 getnext().text_content())

        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])

        item = event.add_agenda_item(descr)
        item.add_bill(bill.text_content())

    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    event.add_participant(committee, "committee", note="host")

    yield event
def scrape(self):
    web_results = self.scrapeWebCalendar()

    for event in self.events():
        # Create a key for lookups in the web_results dict.
        key = (event['EventBodyName'].strip(),
               self.toTime(event['EventDate']).date(),
               event['EventTime'])

        web_event_dict = web_results.get(key, {
            'Meeting Details': 'Meeting\xa0details',
            'Audio': 'Not\xa0available',
            'Recap/Minutes': 'Not\xa0available'
        })

        body_name = event["EventBodyName"]
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name

        status_name = event['EventAgendaStatusName']
        if status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Final':
            status = 'passed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = ''

        e = Event(event_name,
                  start_time=event["start"],
                  timezone=self.TIMEZONE,
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                identifier = item["EventItemMatterFile"]
                agenda_item.add_bill(identifier)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
        if web_event_dict['Audio'] != 'Not\xa0available':
            redirect_url = self.head(
                web_event_dict['Audio']['url']).headers['Location']

            e.add_media_link(note=web_event_dict['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event_dict['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event_dict['Recap/Minutes']['label'],
                           url=web_event_dict['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event_dict['Meeting Details'] != 'Meeting\xa0details':
            if requests.head(web_event_dict['Meeting Details']
                             ['url']).status_code == 200:
                e.add_source(web_event_dict['Meeting Details']['url'],
                             note='web')
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self):
    get_short_codes(self)
    page = self.lxmlize(URL)

    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()

        if self.short_ids.get(committee):
            descr = "{} {}".format(
                self.chambers[self.short_ids[committee]["chamber"]],
                self.short_ids[committee]["name"],
            )
        else:
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace(".", "").strip()

        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib["href"]
        notice_name = notice.text

        # the listing page shows the same hearing in multiple rows.
        # combine these -- get_related_bills() will take care of adding the bills
        # and descriptions
        if notice_href in self.seen_hearings:
            continue
        else:
            self.seen_hearings.append(notice_href)

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = TIMEZONE.localize(when)
        event = Event(
            name=descr,
            start_date=when,
            classification="committee-meeting",
            description=descr,
            location_name=where,
        )

        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee and committee in self.short_ids:
                committee = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            event.add_committee(committee, note="host")

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type="text/html")
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill["descr"].strip())
            a.add_bill(bill["bill_id"], note=bill["type"])
        yield event
def scrape(self):
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # attempt to map the cells across table types.
            # if the sizes mismatch, ignore this one (it's an "empty" message)
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title,
                      when=meeting_date,
                      location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # detect agenda url, if present
            meeting_agenda_url = None
            if len(cell_mapping['agenda'].cssselect('a')) > 0:
                meeting_agenda_url = cell_mapping['agenda'].cssselect(
                    'a')[0].attrib.get('href')

            # follow the agenda URL and attempt to extract associated documents
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda',
                               url=meeting_agenda_url,
                               mimetype='text/html')

                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(
                    meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not len(link_url):
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text,
                                           url=link_url,
                                           mimetype='application/pdf')

            # skip everything below here for the 'upcoming' table
            if meeting_type == 'upcoming':
                continue

            # detect video
            # TODO: extract actual mp4 files
            video_cell = cell_mapping['video'].cssselect('a')
            if len(video_cell) > 0:
                video_url_match = re.search(
                    r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                if video_url_match is not None:
                    # group(1) is just the URL; group(0) would include the
                    # trailing quote the pattern anchors on.
                    e.add_media_link(name="Video",
                                     url=video_url_match.group(1),
                                     mimetype='text/html')

            # detect audio
            audio_cell = cell_mapping['audio'].cssselect('a')
            if len(audio_cell) > 0:
                e.add_media_link(name="Audio",
                                 url=audio_cell[0].attrib.get('href', ''),
                                 mimetype='audio/mpeg')

            # detect minutes
            minutes_cell = cell_mapping['minutes'].cssselect('a')
            if len(minutes_cell) > 0:
                e.add_media_link(name="Minutes",
                                 url=minutes_cell[0].attrib.get('href', ''),
                                 mimetype='text/html')

            yield e
def scrape(self, window=None): if window: n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) else: n_days_ago = None events = self.events(n_days_ago) for event, web_event in self._merge_events(events): body_name = event["EventBodyName"] if 'Board of Directors -' in body_name: body_name, event_name = [ part.strip() for part in body_name.split('-') ] else: event_name = body_name # Events can have an EventAgendaStatusName of "Final", "Final Revised", # and "Final 2nd Revised." # We classify these events as "passed." status_name = event['EventAgendaStatusName'] if status_name.startswith('Final'): status = 'passed' elif status_name == 'Draft': status = 'confirmed' elif status_name == 'Canceled': status = 'cancelled' else: status = 'tentative' location = event["EventLocation"] if not location: # We expect some events to have no location. LA Metro would # like these displayed in the Councilmatic interface. However, # OCD requires a value for this field. Add a sane default. location = 'Not available' e = Event(event_name, start_date=event["start"], description='', location_name=location, status=status) e.pupa_id = str(event['EventId']) # Metro requires the EventGuid to build out MediaPlayer links. # Add both the English event GUID, and the Spanish event GUID if # it exists, to the extras dict. e.extras = {'guid': event['EventGuid']} legistar_api_url = self.BASE_URL + '/events/{0}'.format( event['EventId']) e.add_source(legistar_api_url, note='api') if event.get('SAPEventGuid'): e.extras['sap_guid'] = event['SAPEventGuid'] if 'event_details' in event: # Only capture agenda data from the API when Legistar has a # meeting detail page for the event. for item in self.agenda(event): agenda_item = e.add_agenda_item(item["EventItemTitle"]) if item["EventItemMatterFile"]: identifier = item["EventItemMatterFile"] agenda_item.add_bill(identifier) if item["EventItemAgendaNumber"]: # To the notes field, add the item number as given in the agenda minutes note = "Agenda number, {}".format( item["EventItemAgendaNumber"]) agenda_item['notes'].append(note) # The EventItemAgendaSequence provides # the line number of the Legistar agenda grid. agenda_item['extras']['item_agenda_sequence'] = item[ 'EventItemAgendaSequence'] # Historically, the Legistar system has duplicated the EventItemAgendaSequence, # resulting in data inaccuracies. The scrape should fail in such cases, until Metro # cleans the data. item_agenda_sequences = [ item['extras']['item_agenda_sequence'] for item in e.agenda ] if len(item_agenda_sequences) != len( set(item_agenda_sequences)): error_msg = ('An agenda has duplicate agenda items on the Legistar grid: {event_name} on {event_date} ({legistar_api_url}). Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.') raise ValueError( error_msg.format( event_name=e.name, event_date=e.start_date.strftime("%B %d, %Y"), legistar_api_url=legistar_api_url)) e.add_participant(name=body_name, type="organization") if event.get('SAPEventId'): e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']), note='api (sap)') if event['EventAgendaFile']: e.add_document(note='Agenda', url=event['EventAgendaFile'], media_type="application/pdf") if event['EventMinutesFile']: e.add_document(note='Minutes', url=event['EventMinutesFile'], media_type="application/pdf") for audio in event['audio']: try: redirect_url = self.head(audio['url']).headers['Location'] except KeyError: # In some cases, the redirect URL does not yet # contain the location of the audio file. Skip # these events, and retry on next scrape. continue e.add_media_link(note=audio['label'], url=redirect_url, media_type='text/html') if web_event['Recap/Minutes'] != 'Not\xa0available': e.add_document(note=web_event['Recap/Minutes']['label'], url=web_event['Recap/Minutes']['url'], media_type="application/pdf") if event['event_details']: for link in event['event_details']: e.add_source(**link) else: e.add_source('https://metro.legistar.com/Calendar.aspx', note='web') yield e
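The Metro scraper above relies on a _merge_events helper that does not appear in this excerpt. A minimal sketch of what it plausibly does, assuming the API events and scraped web-calendar rows share an event GUID; the 'EventGuid'/'guid' key names and the row shape are guesses, not Metro's actual implementation (the real version is a method that also fetches the web calendar itself):

def _merge_events(api_events, web_events):
    """Pair each Legistar API event with its scraped web-calendar row.

    Hypothetical sketch: joins on a shared GUID and silently drops
    API events with no matching web row.
    """
    web_by_guid = {row['guid']: row for row in web_events}
    for api_event in api_events:
        web_row = web_by_guid.get(api_event['EventGuid'])
        if web_row is not None:
            yield api_event, web_row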
def scrape(self): method = 'events/?state={}&dtstart=1776-07-04'.format(self.state) self.events = self.api(method) seen = set() for event in self.events: begin = self._date_parse(event.pop('when')) end = self._date_parse(event.pop('end')) all_day = event.pop('all_day',False) e = Event(name=event.pop('description'), classification=event.pop('type'), location_name=event.pop('location'), timezone=event.pop('timezone'), start_time=begin, end_time=end, all_day=all_day,) if len(e.name) >= 300: e.name = e.name[:290] if len(e.location['name']) >= 100: e.location['name'] = e.location['name'][:90] composite_key = (e.name, e.description, e.start_time) if composite_key in seen: print("Duplicate found: %s/%s/%s" % (composite_key)) continue seen.add(composite_key) for source in event.pop('sources'): if 'retrieved' in source: source.pop('retrieved') e.add_source(**source) if e.sources == []: continue ignore = ['country', 'level', 'state', 'created_at', 'updated_at', 'notes', '+location_url', 'session', 'id', '+chamber', '+agenda', '+cancelled', '+media_contact', '+contact', '+details'] # +agenda: # Agenda on old (very old) OpenStates data is actually a string # and not any sort of structured data we can use in the items # schema, and is only present for a handful of events. for i in ignore: if i in event: event.pop(i) for link in ['+link', 'link']: if link in event: e.add_source(url=event.pop(link)) for p in event.pop('participants', []): type_ = { "committee": "organization", "legislator": "person", None: None, }[p.get('participant_type')] if type_ is None: # Garbage data. continue e.add_participant(name=p['participant'], note=p['type'], type=type_,) for b in event.pop('related_bills', []): item = e.add_agenda_item( b.pop('description', b.pop('+description', None))) item.add_bill(bill=b['bill_id'], note=b.pop('type', b.pop('+type', None))) seen_documents = set([]) for document in event.pop('documents', []): if document['url'] in seen_documents: print("XXX: Buggy data in: Duped Document URL: %s (%s)" % ( document['url'], document['name'] )) continue seen_documents.add(document['url']) e.add_document(url=document['url'], note=document['name']) assert event == {}, "Unknown fields: %s" % ( ", ".join(event.keys()) ) yield e
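Both OpenStates import scrapers in this section call a _date_parse helper that is not shown. A minimal sketch, assuming the API returns ISO-8601-style timestamp strings and that missing values should map to None:

# Hypothetical sketch of the assumed _date_parse helper.
from dateutil import parser

def _date_parse(value):
    """Parse an API timestamp string, tolerating missing values."""
    if not value:
        return None
    return parser.parse(value)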
def scrape(self): method = 'events/?state={}&dtstart=1776-07-04'.format(self.state) self.events = self.api(method) seen = set() for event in self.events: e = Event(name=event.pop('description'), classification=event.pop('type'), location=event.pop('location'), timezone=event.pop('timezone'), start_time=self._date_parse(event.pop('when')), end_time=self._date_parse(event.pop('end')),) if len(e.name) >= 300: e.name = e.name[:290] if len(e.location['name']) >= 100: e.location['name'] = e.location['name'][:90] composite_key = (e.name, e.description, e.start_time) if composite_key in seen: print("Duplicate found: %s/%s/%s" % (composite_key)) continue seen.add(composite_key) for source in event.pop('sources'): if 'retrieved' in source: source.pop('retrieved') e.add_source(**source) if e.sources == []: continue ignore = ['country', 'level', 'state', 'created_at', 'updated_at', 'notes', '+location_url', 'session', 'id', '+chamber', '+agenda', '+cancelled', '+media_contact', '+contact', '+details'] # +agenda: # Agenda on old (very old) OpenStates data is actually a string # and not any sort of structured data we can use in the items # schema, and is only present for a handful of events. for i in ignore: if i in event: event.pop(i) for link in ['+link', 'link']: if link in event: e.add_source(url=event.pop(link)) for p in event.pop('participants', []): type_ = { "committee": "organization", "legislator": "person", None: None, }[p.get('participant_type')] if type_ is None: # Garbage data. continue e.add_participant(name=p['participant'], note=p['type'], type=type_,) for b in event.pop('related_bills', []): item = e.add_agenda_item( b.pop('description', b.pop('+description', None))) item.add_bill(bill=b['bill_id'], note=b.pop('type', b.pop('+type', None))) seen_documents = set([]) for document in event.pop('documents', []): if document['url'] in seen_documents: print("XXX: Buggy data in: Duped Document URL: %s (%s)" % ( document['url'], document['name'] )) continue seen_documents.add(document['url']) e.add_document(url=document['url'], note=document['name']) assert event == {}, "Unknown fields: %s" % ( ", ".join(event.keys()) ) yield e
def scrape_agenda(self, url): page = self.lxmlize(url) # Get the date/time info: date_time = page.xpath("//table[@class='time_place']") if date_time == []: return date_time = date_time[0] lines = date_time.xpath("./tr") metainf = {} for line in lines: tds = line.xpath("./td") metainf[tds[0].text_content()] = tds[1].text_content() date = metainf['DATE:'] time = metainf['TIME:'] where = metainf['PLACE:'] # check for duration in time if ' - ' in time: start, end = time.split(' - ') am_pm_srch = re.search('(?i)(am|pm)', end) if am_pm_srch: time = ' '.join([start, am_pm_srch.group().upper()]) else: time = start fmts = [ "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M", ] event_desc = "Meeting Notice" if 'Rise' in time: datetime = date event_desc = "Meeting Notice: Starting at {}".format(time) else: datetime = "%s %s" % (date, time) if "CANCELLED" in datetime.upper(): return transtable = { "P.M": "PM", "PM.": "PM", "P.M.": "PM", "A.M.": "AM", "POSTPONED": "", "RESCHEDULED": "", "and Rise of the Senate": "", } for trans in transtable: datetime = datetime.replace(trans, transtable[trans]) datetime = datetime.strip() for fmt in fmts: try: datetime = dt.datetime.strptime(datetime, fmt) break except ValueError: continue event = Event( name=event_desc, start_date=self._tz.localize(datetime), location_name=where, ) event.add_source(url) # Collect the bills listed on the agenda. bills = page.xpath("//b/a") for bill in bills: bill_ft = bill.attrib['href'] event.add_document( bill.text_content(), bill_ft, media_type="application/pdf") root = bill.xpath('../../*') root = [x.text_content() for x in root] bill_id = "".join(root) if "SCHEDULED FOR" in bill_id: continue descr = bill.getparent().getparent().getparent().getnext().getnext().text_content() for thing in replace: bill_id = bill_id.replace(thing, replace[thing]) item = event.add_agenda_item(descr) item.add_bill(bill_id) committee = page.xpath("//span[@id='lblSession']")[0].text_content() event.add_participant(committee, 'committee', note='host') yield event
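The bill-ID cleanup loop above reads a module-level replace dict that sits outside this excerpt. A plausible sketch of such a table; every entry below is an assumption about the kind of noise in the scraped bill identifiers, not the scraper's real data:

# Hypothetical replacement table for cleaning scraped bill IDs.
replace = {
    "\xa0": " ",  # non-breaking spaces copied out of the HTML
    "\n": " ",    # stray newlines inside bill identifiers
    "  ": " ",    # collapse doubled spaces left by the above
}

# Usage mirrors the loop in the scraper:
bill_id = "H\xa07050"
for thing in replace:
    bill_id = bill_id.replace(thing, replace[thing])
# bill_id is now "H 7050"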
def scrape(self): for event, web_event in self.events(): body_name = event["EventBodyName"] if 'Board of Directors -' in body_name: body_name, event_name = [ part.strip() for part in body_name.split('-') ] else: event_name = body_name status_name = event['EventAgendaStatusName'] if status_name == 'Draft': status = 'confirmed' elif status_name == 'Final': status = 'passed' elif status_name == 'Canceled': status = 'cancelled' else: status = 'tentative' e = Event(event_name, start_date=event["start"], description='', location_name=event["EventLocation"], status=status) e.pupa_id = str(event['EventId']) for item in self.agenda(event): agenda_item = e.add_agenda_item(item["EventItemTitle"]) if item["EventItemMatterFile"]: identifier = item["EventItemMatterFile"] agenda_item.add_bill(identifier) if item["EventItemAgendaNumber"]: # To the notes field, add the item number as given in the agenda minutes note = "Agenda number, {}".format( item["EventItemAgendaNumber"]) agenda_item['notes'].append(note) e.add_participant(name=body_name, type="organization") e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event), note='api') if event['EventAgendaFile']: e.add_document(note='Agenda', url=event['EventAgendaFile'], media_type="application/pdf") if event['EventMinutesFile']: e.add_document(note='Minutes', url=event['EventMinutesFile'], media_type="application/pdf") # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists. if web_event['Audio'] != 'Not\xa0available': redirect_url = self.head( web_event['Audio']['url']).headers['Location'] e.add_media_link(note=web_event['Audio']['label'], url=redirect_url, media_type='text/html') if web_event['Recap/Minutes'] != 'Not\xa0available': e.add_document(note=web_event['Recap/Minutes']['label'], url=web_event['Recap/Minutes']['url'], media_type="application/pdf") if web_event['Meeting Details'] != 'Meeting\xa0details': if requests.head(web_event['Meeting Details'] ['url']).status_code == 200: e.add_source(web_event['Meeting Details']['url'], note='web') else: e.add_source('https://metro.legistar.com/Calendar.aspx', note='web') yield e
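The self.head(...).headers['Location'] pattern used throughout these Metro scrapers only works if the HEAD request does not follow redirects; otherwise the 302's Location header is consumed before the caller can read it. A minimal sketch of the assumed wrapper (the method name matches the call sites above, but the wrapper itself is hypothetical):

import requests

def head(url):
    """HEAD a URL without following redirects, so the Location header
    of a 302 response stays visible to the caller. Note that
    requests.head() already defaults to allow_redirects=False; the
    argument is spelled out here for clarity."""
    return requests.head(url, allow_redirects=False)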
def scrape(self): meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE) meetings_lxml = lxml.html.fromstring(meetings_html) for meeting_type in ('archive', 'upcoming'): for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type): # attempt to map the cells across table types. # if the sizes mismatch, ignore this one (it's an "empty" message) try: cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td')) except Exception: continue meeting_title = cell_mapping['title'].text meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text)) e = Event(name=meeting_title, when=meeting_date, location='unknown') e.add_source(self.ARLINGTON_MEETING_PAGE) # detect agenda url, if present meeting_agenda_url = None if len(cell_mapping['agenda'].cssselect('a')) > 0: meeting_agenda_url = cell_mapping['agenda'].cssselect('a')[0].attrib.get('href') # follow the agenda URL and attempt to extract associated documents if meeting_agenda_url is not None: e.add_link(meeting_agenda_url) e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html') meeting_agenda_html = self.urlopen(meeting_agenda_url) meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html) for link in meeting_agenda_lxml.cssselect('a'): link_url = link.attrib.get('href', '') if not len(link_url): continue if 'metaviewer.php' in link_url.lower(): # NOTE: application/pdf is a guess, may not always be correct if link.text is not None: e.add_document(name=link.text, url=link_url, mimetype='application/pdf') # skip everything below here for the 'upcoming' table if meeting_type == 'upcoming': continue # detect video # TODO: extract actual mp4 files video_cell = cell_mapping['video'].cssselect('a') if len(video_cell) > 0: video_url_match = re.search(r"http://[^']+", video_cell[0].attrib.get('onclick', '')) if video_url_match is not None: e.add_media_link(name="Video", url=video_url_match.group(0), mimetype='text/html') # detect audio audio_cell = cell_mapping['audio'].cssselect('a') if len(audio_cell) > 0: e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg') # detect minutes minutes_cell = cell_mapping['minutes'].cssselect('a') if len(minutes_cell) > 0: e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html') yield e
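The Arlington scraper swallows any exception from _organize_cells, which it expects to fail on rows whose cell count does not match the table layout. A sketch of what that helper might look like; the column orders below are assumptions about the two table types, not the scraper's actual mapping:

def _organize_cells(meeting_type, cells):
    """Map a row's <td> cells to named columns for each table type.

    Hypothetical sketch: raises ValueError on rows with an
    unexpected shape, which the caller treats as a skip signal.
    """
    columns = {
        'archive': ['date', 'title', 'agenda', 'minutes', 'video', 'audio'],
        'upcoming': ['date', 'title', 'agenda'],
    }[meeting_type]
    if len(cells) != len(columns):
        raise ValueError('unexpected number of cells in %s row' % meeting_type)
    return dict(zip(columns, cells))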
def scrape(self, window=None): if window: n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window)) else: n_days_ago = None events = self.events(n_days_ago) for event, web_event in self._merge_events(events): body_name = event["EventBodyName"] if 'Board of Directors -' in body_name: body_name, event_name = [part.strip() for part in body_name.split('-')] else: event_name = body_name status_name = event['EventAgendaStatusName'] if status_name == 'Draft': status = 'confirmed' elif status_name == 'Final': status = 'passed' elif status_name == 'Canceled': status = 'cancelled' else: status = 'tentative' location = event["EventLocation"] if not location: # We expect some events to have no location. LA Metro would # like these displayed in the Councilmatic interface. However, # OCD requires a value for this field. Add a sane default. location = 'Not available' e = Event(event_name, start_date=event["start"], description='', location_name=location, status=status) e.pupa_id = str(event['EventId']) # Metro requires the EventGuid to build out MediaPlayer links. # Add both the English event GUID, and the Spanish event GUID if # it exists, to the extras dict. e.extras = {'guid': event['EventGuid']} if event.get('SAPEventGuid'): e.extras['sap_guid'] = event['SAPEventGuid'] for item in self.agenda(event): agenda_item = e.add_agenda_item(item["EventItemTitle"]) if item["EventItemMatterFile"]: identifier = item["EventItemMatterFile"] agenda_item.add_bill(identifier) if item["EventItemAgendaNumber"]: # To the notes field, add the item number as given in the agenda minutes note = "Agenda number, {}".format(item["EventItemAgendaNumber"]) agenda_item['notes'].append(note) e.add_participant(name=body_name, type="organization") e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']), note='api') if event.get('SAPEventId'): e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']), note='api (sap)') if event['EventAgendaFile']: e.add_document(note='Agenda', url=event['EventAgendaFile'], media_type="application/pdf") if event['EventMinutesFile']: e.add_document(note='Minutes', url=event['EventMinutesFile'], media_type="application/pdf") for audio in event['audio']: try: redirect_url = self.head(audio['url']).headers['Location'] except KeyError: # In some cases, the redirect URL does not yet # contain the location of the audio file. Skip # these events, and retry on next scrape. continue e.add_media_link(note=audio['label'], url=redirect_url, media_type='text/html') if web_event['Recap/Minutes'] != 'Not\xa0available': e.add_document(note=web_event['Recap/Minutes']['label'], url=web_event['Recap/Minutes']['url'], media_type="application/pdf") if event['event_details']: for link in event['event_details']: e.add_source(**link) else: e.add_source('https://metro.legistar.com/Calendar.aspx', note='web') yield e
def scrape(self, chamber=None, session=None): """ Scrape the events data from all dates from the sc meetings page, then create and yield the events objects from the data. :param chamber: :param session: :return: yielded Event objects """ chambers = { 'upper': {'name': 'Senate', 'title': 'Senator'}, 'lower': {'name': 'House', 'title': 'Representative'}, } if chamber == 'other': return if chamber is None: self.info('no chamber specified, using Joint Committee Meeting Schedule') events_url = 'http://www.scstatehouse.gov/meetings.php' else: events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % ( chambers[chamber]['name'].upper()[0] ) page = self.get_page_from_url(events_url) meeting_year = page.xpath( '//h2[@class="barheader"]/span')[0].text_content() meeting_year = re.search( r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})', meeting_year).group(1) dates = page.xpath("//div[@id='contentsection']/ul") for date in dates: date_string = date.xpath('span') if len(date_string) == 1: date_string = date_string[0].text_content() else: continue # If an event is in the next calendar year, the date_string # will have a year in it if date_string.count(",") == 2: event_year = date_string[-4:] date_string = date_string[:-6] elif date_string.count(",") == 1: event_year = meeting_year else: raise AssertionError("This is not a valid date: '{}'".format(date_string)) for meeting in date.xpath('li'): time_string = meeting.xpath('span')[0].text_content() if time_string == 'CANCELED' or len( meeting.xpath( './/span[contains(text(), "CANCELED")]')) > 0: continue time_string = normalize_time(time_string) date_time = datetime.datetime.strptime( event_year + ' ' + date_string + ' ' + time_string, "%Y %A, %B %d %I:%M %p") date_time = self._tz.localize(date_time) meeting_info = meeting.xpath( 'br[1]/preceding-sibling::node()')[1] location, description = re.search( r'-- (.*?) -- (.*)', meeting_info).groups() # if re.search(r'committee', description, re.I): # meeting_type = 'committee:meeting' # else: # meeting_type = 'other:meeting' event = Event(name=description, # Event Name start_date=date_time, # When the event will take place location_name=location) # Where the event will be event.add_source(events_url) agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]") if agenda_url: agenda_url = agenda_url[0].attrib['href'] event.add_source(agenda_url) event.add_document(note="Agenda", url=agenda_url, media_type="application/pdf") agenda_page = self.get_page_from_url(agenda_url) for bill in agenda_page.xpath( ".//a[contains(@href,'billsearch.php')]"): # bill_url = bill.attrib['href'] bill_id = bill.text_content().replace( '.', '').replace(' ', '') # bill_description = self.get_bill_description(bill_url) event.add_bill(bill_id) yield event
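The South Carolina scraper calls normalize_time before strptime, but that helper is not defined in this excerpt. A hedged sketch of the kind of normalization the '%I:%M %p' format would require; the specific rules are guesses based on strings commonly seen on legislative calendars, not the scraper's actual implementation:

# Hypothetical sketch of the assumed normalize_time helper.
import re

def normalize_time(time_string):
    """Coerce scraped time strings toward strptime's '%I:%M %p' form."""
    time_string = time_string.strip().lower().replace('.', '')
    if 'noon' in time_string:
        time_string = '12:00 pm'
    # bare hours such as '10 am' need a minutes component
    time_string = re.sub(r'^(\d{1,2})\s*([ap]m)$', r'\1:00 \2', time_string)
    # ensure a space before the am/pm marker so '%p' can match
    time_string = re.sub(r'(\d)([ap]m)$', r'\1 \2', time_string)
    return time_string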