def parse_div(self, row, chamber, com):
    """Build and yield a single ``Event`` from one hearing-schedule row.

    ``row`` is queried with XPath for the calendar link, agenda text,
    linked documents, bill rows and an optional video link.  The calendar
    link is handed to ``self.parse_gcal`` to obtain the title, location
    and start/end dates.  ``com`` is recorded as the hosting committee
    unless the title mentions a subcommittee, in which case the text
    before the first ``-`` in the title is used instead.  ``chamber`` is
    accepted but not used by this method.
    """
    gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    title, location, start_date, end_date = self.parse_gcal(gcal_href)

    event = Event(
        name=title,
        location_name=location,
        start_date=start_date,
        end_date=end_date,
    )
    event.add_source(
        'http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

    # Plain text rows become bare agenda items.
    for text_node in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(
            description=text_node.xpath('string(.)').strip())

    # Links sitting directly in an item container are both an agenda item
    # and an attached document.
    for link in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        link_text = link.xpath('string(.)').strip()
        event.add_agenda_item(description=link_text)
        event.add_document(
            link_text,
            link.xpath('@href')[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # Containers holding a "col-xs-1 Item" cell carry a bill identifier.
    bill_containers = row.xpath(
        './/div[contains(@class,"ItemContainer")]'
        '[./div[@class="col-xs-1 Item"]]')
    for container in bill_containers:
        agenda_entry = event.add_agenda_item(
            description=container.xpath('string(.)').strip())
        bill_id = container.xpath(
            './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda_entry.add_bill(bill_id)

    video_links = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video_links:
        event.add_media_link(
            'Video of Hearing',
            video_links[0].xpath('@href')[0],
            'text/html',
        )

    if 'subcommittee' in title.lower():
        host = title.split('-')[0].strip()
    else:
        host = com
    event.add_participant(host, type='committee', note='host')

    yield event
def test_full_event():
    """Import one scraped event carrying a person and a media link.

    A jurisdiction with id ``'jid'`` is created first, then the event's
    dict form is passed to ``EventImporter('jid').import_data``.  The test
    makes no assertions of its own; it only exercises the import path.
    """
    Jurisdiction.objects.create(id='jid', division_id='did')

    birthday = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04",
        location="America",
        all_day=True,
    )
    birthday.add_person("George Washington")
    birthday.add_media_link("fireworks", "http://example.com/fireworks.mov")

    payload = [birthday.as_dict()]
    EventImporter('jid').import_data(payload)
def parse_div(self, row, chamber, com):
    """Yield the ``Event`` described by one row of the hearing schedule.

    Title, location and dates come from ``self.parse_gcal`` applied to the
    row's calendar link.  Agenda items, PDF documents, bills and a video
    link are then read from ``row`` via XPath.  The host committee is
    ``com``, or the leading part of the title (before the first ``-``)
    when the title contains "subcommittee".  ``chamber`` is unused here.
    """

    def text_of(node):
        # Flattened, whitespace-trimmed text content of an element.
        return node.xpath('string(.)').strip()

    calendar_url = row.xpath(
        './/a[.//span[@id="calendarmarker"]]/@href')[0]
    parsed = self.parse_gcal(calendar_url)
    title, location, start_date, end_date = parsed

    event = Event(start_date=start_date, end_date=end_date,
                  name=title, location_name=location)
    event.add_source(
        'http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

    for cell in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=text_of(cell))

    for anchor in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        label = text_of(anchor)
        event.add_agenda_item(description=label)
        href = anchor.xpath('@href')[0]
        event.add_document(label, href,
                           media_type="application/pdf",
                           on_duplicate="ignore")

    for box in row.xpath('.//div[contains(@class,"ItemContainer")]'
                         '[./div[@class="col-xs-1 Item"]]'):
        entry = event.add_agenda_item(description=text_of(box))
        bill_texts = box.xpath('.//div[@class="col-xs-1 Item"]/a/text()')
        entry.add_bill(bill_texts[0].strip())

    on_demand = row.xpath('.//a[./span[@class="OnDemand"]]')
    if on_demand:
        video_href = on_demand[0].xpath('@href')[0]
        event.add_media_link('Video of Hearing', video_href, 'text/html')

    is_subcommittee = 'subcommittee' in title.lower()
    event.add_participant(
        title.split('-')[0].strip() if is_subcommittee else com,
        type='committee',
        note='host',
    )
    yield event
def scrape(self):
    """Yield one ``Event`` per entry of the module-level ``cal_list``.

    Each entry supplies the committee name, location and meeting time; the
    time is parsed with ``CAL_DATE_FORMAT`` and localized with ``tz``.
    When the entry's ``MarkedAgendaPublished`` flag equals ``True`` an
    "Agenda" media link built from ``AGENDA_BASE_URL`` is attached.
    """
    source_url = 'https://lims.minneapolismn.gov/Calendar/GetCalenderList?'

    for meeting in cal_list:
        naive_start = datetime.strptime(meeting['MeetingTime'],
                                        CAL_DATE_FORMAT)
        committee = meeting['CommitteeName']

        event = Event(
            name=committee,
            start_date=tz.localize(naive_start),
            location_name=meeting['Location'],
        )
        event.add_committee(meeting['CommitteeName'])
        event.add_source(source_url)

        # Equality (not truthiness) is deliberate: only a value equal to
        # True counts as published.
        if meeting['MarkedAgendaPublished'] == True:
            agenda_url = "{0}{1}/{2}".format(
                AGENDA_BASE_URL,
                meeting['Abbreviation'],
                meeting['AgendaId'],
            )
            event.add_media_link(note="Agenda", url=agenda_url,
                                 media_type="link")

        yield event
def scrape(self, window=3):
    """Yield an ``Event`` for every (API event, web event) pair.

    ``window`` is a number of days; the cutoff ``utcnow() - window days``
    is passed to ``self.events``.  For each pair the method attaches the
    recording link, agenda/minutes documents, participating committees,
    agenda items with their bills, the people marked "Present" in the
    roll calls, and API/web source links.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    # Topic texts that carry no real description.
    placeholder_topics = ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED')

    for api_event, event in self.events(cutoff):
        when = api_event['start']
        location = api_event['EventLocation']

        description = event['Meeting\xa0Topic']
        if any(marker in description for marker in placeholder_topics):
            description = None

        # Only pass ``description`` when there is one to pass.
        event_kwargs = {
            'name': api_event["EventBodyName"],
            'start_date': when,
            'location_name': location,
            'status': api_event['status'],
        }
        if description:
            event_kwargs['description'] = description
        e = Event(**event_kwargs)

        e.pupa_id = str(api_event['EventId'])

        multimedia = event['Multimedia']
        if multimedia != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=multimedia['url'],
                             type="recording",
                             media_type='text/html')

        for doc_kind in ('Agenda', 'Minutes'):
            self.addDocs(e, event, doc_kind)

        location_notes, other_orgs = self._parse_location(
            event[u'Meeting Location'])
        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        if e.name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in e.name.lower():
            participating_orgs = [e.name]
        else:
            participating_orgs = []

        if other_orgs:
            # Strip the "Jointly with the" lead-in, then split the rest
            # into individual organization names.
            other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
            participating_orgs += re.split(' and the |, the ', other_orgs)

        for org in participating_orgs:
            e.add_committee(name=org)

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            matter_file = item["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

        present = {
            call['RollCallPersonName'].strip()
            for call in self.rollcalls(api_event)
            if call['RollCallValueName'] == 'Present'
        }
        for person in present:
            e.add_participant(name=person, type="person")

        e.add_source(
            self.BASE_URL + '/events/{EventId}'.format(**api_event),
            note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is not subscriptable by key: fall back to
            # the general events page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for every (API event, web event) pair.

    ``window`` is a number of days; the cutoff ``utcnow() - window days``
    is passed to ``self.events``.

    The web event's "Meeting Location" text is split on ``--``: the first
    two parts form the location, and whatever follows "Chicago, Illinois"
    in the last part is treated as a status notice.  The notice can mark
    the event as cancelled, prepend a room to the location, cause the
    event to be skipped ("wrong meeting date"), or - when unrecognised -
    become the event description.

    Events whose location is empty are skipped.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))

    # Substrings of the notice that mean the meeting is not happening as
    # scheduled.
    cancelled_phrases = (
        'rescheduled to', 'postponed to', 'reconvened to',
        'meeting recessed', 'recessed meeting', 'recessed until',
        'deferred', 'time change', 'date change',
        'recessed meeting - reconvene', 'cancelled',
        'new date and time', 'rescheduled indefinitely',
        'rescheduled for',
    )
    # Exact notices that leave the API-reported status untouched.
    unchanged_notices = (
        'meeting reconvened', 'reconvened meeting', 'recessed meeting',
        'reconvene meeting', 'rescheduled hearing', 'rescheduled meeting',
        'amended notice of meeting', 'room change', 'amended notice',
        'change of location', 'revised - meeting date and time',
    )

    for api_event, event in self.events(n_days_ago):
        description = None
        when = api_event['start']

        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if any(phrase in status_text for phrase in cancelled_phrases):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in unchanged_notices:
                status = api_event['status']
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
                # A room notice only refines the location.  The status
                # must still be set here, otherwise it is unbound (first
                # event) or left over from the previous event.
                status = api_event['status']
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # Unrecognised notice: surface it and keep it as the
                # event description.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = api_event['status']
        else:
            status = api_event['status']

        event_kwargs = dict(name=event["Name"]["label"],
                            start_time=when,
                            timezone=self.TIMEZONE,
                            location_name=location,
                            status=status)
        if description:
            event_kwargs['description'] = description
        e = Event(**event_kwargs)

        e.pupa_id = str(api_event['EventId'])

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        for doc_kind in ('Agenda', 'Notice', 'Transcript', 'Summary'):
            self.addDocs(e, event, doc_kind)

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])
        for person in participants:
            e.add_participant(name=person, type="person")

        # EventId is read from the API record (as for pupa_id above), not
        # from the web-calendar dict.
        e.add_source(
            self.BASE_URL + '/events/{EventId}'.format(**api_event),
            note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # 'Meeting Details' is not subscriptable by key: fall back to
            # the general events page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e
def scrape(self, window=None):
    """Yield an ``Event`` for every merged (API event, web event) pair.

    ``window`` is an optional number of days.  When truthy, the cutoff
    ``utcnow() - window days`` is passed to ``self.events`` as
    ``since_datetime``; otherwise ``None`` is passed.

    Events whose body id is among the bodies returned by the
    ``BodyTypeId eq 70 or BodyTypeId eq 75`` search are skipped, unless
    the body name contains "Board of Directors -".

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    events = self.events(since_datetime=n_days_ago)

    service_councils = {
        sc['BodyId']
        for sc in self.search('/bodies/', 'BodyId',
                              'BodyTypeId eq 70 or BodyTypeId eq 75')
    }

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        elif event['EventBodyId'] in service_councils:
            # Don't scrape service council or service council public
            # hearing events.
            self.info('Skipping event {0} for {1}'.format(
                event['EventId'], event['EventBodyName']))
            continue
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if web_event.has_ecomment:
            self.info('Adding eComment link {0} from {1}'.format(
                web_event['eComment'],
                web_event['Meeting Details']['url']))
            e.extras['ecomment'] = web_event['eComment']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in
                    # the agenda minutes
                    agenda_number = item["EventItemAgendaNumber"]
                    note = "Agenda number, {}".format(agenda_number)
                    agenda_item['notes'].append(note)
                    agenda_item['extras']['agenda_number'] = agenda_number

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals, not backslash continuations, so that
                # source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar grid: '
                    '{event_name} on {event_date} ({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.'
                )
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(
                note='Agenda',
                url=event['EventAgendaFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventAgendaLastPublishedUTC']).date())

        # Minutes: prefer the API file, then the web calendar's published
        # minutes, then whatever find_approved_minutes returns.
        if event['EventMinutesFile']:
            e.add_document(
                note='Minutes',
                url=event['EventMinutesFile'],
                media_type="application/pdf",
                date=self.to_utc_timestamp(
                    event['EventMinutesLastPublishedUTC']).date())
        elif web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        else:
            approved_minutes = self.find_approved_minutes(event)
            if approved_minutes:
                e.add_document(
                    note=approved_minutes['MatterAttachmentName'],
                    url=approved_minutes['MatterAttachmentHyperlink'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        approved_minutes['MatterAttachmentLastModifiedUtc']
                    ).date())

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow duplicate media links, so we ignore
            # the second media link if it's the same as the first
            # media link.
            #
            # Because of the way that the event['audio'] is created
            # the first audio link is always English and the
            # second is always Spanish
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self):
    """Yield an ``Event`` for every (web event, agenda) pair.

    The "Meeting Location" text is split on ``--``: the first two parts
    form the location, and whatever follows "Chicago, Illinois" in the
    last part is treated as a status notice.  The notice can mark the
    event as cancelled, prepend a room to the location, cause the event
    to be skipped ("wrong meeting date"), or - when unrecognised - become
    the event description.  In every non-cancelled case the status comes
    from ``confirmedOrPassed(when)``.

    The start time is the meeting date with hour and minute taken from
    the first subcomponent of the event's iCalendar entry.  Events whose
    location is empty are skipped.
    """
    # Substrings of the notice that mean the meeting is not happening as
    # scheduled.
    cancelled_phrases = (
        'rescheduled to', 'postponed to', 'reconvened to',
        'meeting recessed', 'recessed meeting', 'recessed until',
        'deferred', 'time change', 'date change',
        'recessed meeting - reconvene', 'cancelled',
        'new date and time', 'rescheduled indefinitely',
        'rescheduled for',
    )
    # Exact notices that do not change the computed status.
    unchanged_notices = (
        'meeting reconvened', 'reconvened meeting', 'recessed meeting',
        'reconvene meeting', 'rescheduled hearing', 'rescheduled meeting',
        'amended notice of meeting', 'room change', 'amended notice',
        'change of location', 'revised - meeting date and time',
    )

    for event, agenda in self.events():
        description = None

        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour,
                            minute=event_time.minute)

        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if any(phrase in status_text for phrase in cancelled_phrases):
                status = 'cancelled'
            elif status_text in ('rescheduled', 'recessed'):
                status = 'cancelled'
            elif status_text in unchanged_notices:
                status = confirmedOrPassed(when)
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
                # A room notice only refines the location.  The status
                # must still be set here, otherwise it is unbound (first
                # event) or left over from the previous event.
                status = confirmedOrPassed(when)
            elif status_text in ('wrong meeting date',):
                continue
            else:
                # Unrecognised notice: surface it and keep it as the
                # event description.
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()
                status = confirmedOrPassed(when)
        else:
            status = confirmedOrPassed(when)

        event_kwargs = dict(name=event["Name"]["label"],
                            start_time=when,
                            timezone='US/Central',
                            location_name=location,
                            status=status)
        if description:
            event_kwargs['description'] = description
        e = Event(**event_kwargs)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        for doc_kind in ('Agenda', 'Notice', 'Transcript', 'Summary'):
            self.addDocs(e, event, doc_kind)

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant, type="organization")

        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                if item["Record #"]:
                    identifier = item["Record #"]['label']
                    # Drop a leading "S" from the record number before
                    # using it as the bill identifier.
                    if identifier.startswith('S'):
                        identifier = identifier[1:]
                    agenda_item.add_bill(identifier)
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self, window=None):
    """Yield an ``Event`` for every merged (API event, web event) pair.

    ``window`` is an optional number of days.  When truthy, the cutoff
    ``utcnow() - window days`` is passed to ``self.events``; otherwise
    ``None`` is passed.

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [part.strip()
                                     for part in body_name.split('-')]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = (self.BASE_URL +
                            '/events/{0}'.format(event['EventId']))
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in
                    # the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = (
                    item['EventItemAgendaSequence'])

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [item['extras']['item_agenda_sequence']
                                     for item in e.agenda]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals, not backslash continuations, so that
                # source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar grid: '
                    '{event_name} on {event_date} ({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.'
                )
                raise ValueError(error_msg.format(
                    event_name=e.name,
                    event_date=e.start_date.strftime("%B %d, %Y"),
                    legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self):
    """Yield an ``Event`` for every API event, enriched from the web calendar.

    ``self.scrapeWebCalendar()`` supplies a mapping keyed by
    ``(body name, event date, event time)``.  Events without a matching
    web entry get placeholder values meaning "nothing available".
    """
    web_results = self.scrapeWebCalendar()

    # EventAgendaStatusName -> event status; anything else maps to ''.
    status_by_agenda_state = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    for event in self.events():
        # Create a key for lookups in the web_results dict.
        lookup_key = (
            event['EventBodyName'].strip(),
            self.toTime(event['EventDate']).date(),
            event['EventTime'],
        )
        placeholder = {
            'Meeting Details': 'Meeting\xa0details',
            'Audio': 'Not\xa0available',
            'Recap/Minutes': 'Not\xa0available',
        }
        web_event_dict = web_results.get(lookup_key, placeholder)

        body_name = event["EventBodyName"]
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                piece.strip() for piece in body_name.split('-')
            ]
        else:
            event_name = body_name

        status = status_by_agenda_state.get(
            event['EventAgendaStatusName'], '')

        e = Event(event_name,
                  start_time=event["start"],
                  timezone=self.TIMEZONE,
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            matter_file = item["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

        e.add_participant(name=body_name, type="organization")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        for note, file_key in (('Agenda', 'EventAgendaFile'),
                               ('Minutes', 'EventMinutesFile')):
            if event[file_key]:
                e.add_document(note=note,
                               url=event[file_key],
                               media_type="application/pdf")

        # Update 'e' with data from
        # https://metro.legistar.com/Calendar.aspx, if that data exists.
        audio = web_event_dict['Audio']
        if audio != 'Not\xa0available':
            redirect_url = self.head(audio['url']).headers['Location']
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        recap = web_event_dict['Recap/Minutes']
        if recap != 'Not\xa0available':
            e.add_document(note=recap['label'],
                           url=recap['url'],
                           media_type="application/pdf")

        details = web_event_dict['Meeting Details']
        if details != 'Meeting\xa0details':
            # Use the detail page only if a HEAD request returns 200;
            # otherwise fall back to the calendar page.
            if requests.head(details['url']).status_code == 200:
                e.add_source(details['url'], note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for every (API event, web event) pair.

    ``window`` is a number of days; the cutoff ``utcnow() - window days``
    is passed to ``self.events``.  ``self._parse_comment`` is applied to
    the API event's comment and returns a description, a room, a status
    override and an "invalid" flag; invalid events are skipped.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for api_event, web_event in self.events(cutoff):
        when = api_event['start']
        location = api_event[u'EventLocation']

        description, room, status, invalid_event = self._parse_comment(
            api_event[u'EventComment'])
        if invalid_event:
            continue

        if room:
            location = room + ', ' + location
        if not status:
            status = api_event['status']

        # Only pass ``description`` when there is one to pass.
        event_kwargs = {
            'name': api_event["EventBodyName"],
            'start_date': when,
            'location_name': location,
            'status': status,
        }
        if description:
            event_kwargs['description'] = description
        e = Event(**event_kwargs)

        e.pupa_id = str(api_event['EventId'])

        video = web_event['Meeting video']
        if video != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=video['url'],
                             type="recording",
                             media_type='text/html')

        for doc_kind in ('Published agenda', 'Notice', 'Published summary'):
            self.addDocs(e, web_event, doc_kind)
        if 'Captions' in web_event:
            self.addDocs(e, web_event, 'Captions')

        participant = api_event["EventBodyName"]
        if participant == 'City Council':
            participant = 'Seattle City Council'
        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            matter_file = item["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

        present = {
            call['RollCallPersonName']
            for call in self.rollcalls(api_event)
            if call['RollCallValueName'] == 'Present'
        }
        for person in present:
            e.add_participant(name=person, type="person")

        e.add_source(
            self.BASE_URL + '/events/{EventId}'.format(**api_event),
            note='api')
        e.add_source(web_event['Meeting Name']['url'], note='web')

        yield e
def scrape(self):
    """Yield an ``Event`` for every (API event, web event) pair.

    The API event supplies the name, status, location, agenda items and
    agenda/minutes files; the web event supplies audio, recap/minutes and
    the meeting-detail link.
    """
    # EventAgendaStatusName -> event status; anything else maps to ''.
    status_by_agenda_state = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    for event, web_event in self.events():
        body_name = event["EventBodyName"]
        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                piece.strip() for piece in body_name.split('-')
            ]
        else:
            event_name = body_name

        status = status_by_agenda_state.get(
            event['EventAgendaStatusName'], '')

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)
        e.pupa_id = str(event['EventId'])

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])

            matter_file = item["EventItemMatterFile"]
            if matter_file:
                agenda_item.add_bill(matter_file)

            agenda_number = item["EventItemAgendaNumber"]
            if agenda_number:
                # To the notes field, add the item number as given in the
                # agenda minutes
                agenda_item['notes'].append(
                    "Agenda number, {}".format(agenda_number))

        e.add_participant(name=body_name, type="organization")
        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        for note, file_key in (('Agenda', 'EventAgendaFile'),
                               ('Minutes', 'EventMinutesFile')):
            if event[file_key]:
                e.add_document(note=note,
                               url=event[file_key],
                               media_type="application/pdf")

        # Update 'e' with data from
        # https://metro.legistar.com/Calendar.aspx, if that data exists.
        audio = web_event['Audio']
        if audio != 'Not\xa0available':
            redirect_url = self.head(audio['url']).headers['Location']
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        recap = web_event['Recap/Minutes']
        if recap != 'Not\xa0available':
            e.add_document(note=recap['label'],
                           url=recap['url'],
                           media_type="application/pdf")

        details = web_event['Meeting Details']
        if details != 'Meeting\xa0details':
            # Use the detail page only if a HEAD request returns 200;
            # otherwise fall back to the calendar page.
            if requests.head(details['url']).status_code == 200:
                e.add_source(details['url'], note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape(self, window=None):
    """Yield an ``Event`` for every merged (API event, web event) pair.

    ``window`` is an optional number of days.  When truthy, the cutoff
    ``utcnow() - window days`` is passed to ``self.events``; otherwise
    ``None`` is passed.

    Raises:
        ValueError: if two agenda items of one event share the same
            ``EventItemAgendaSequence``.
    """
    n_days_ago = None
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            body_name, event_name = [
                part.strip() for part in body_name.split('-')
            ]
        else:
            event_name = body_name

        # Events can have an EventAgendaStatusName of "Final",
        # "Final Revised", and "Final 2nd Revised."
        # We classify these events as "passed."
        status_name = event['EventAgendaStatusName']
        if status_name.startswith('Final'):
            status = 'passed'
        elif status_name == 'Draft':
            status = 'confirmed'
        elif status_name == 'Canceled':
            status = 'cancelled'
        else:
            status = 'tentative'

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}

        legistar_api_url = self.BASE_URL + '/events/{0}'.format(
            event['EventId'])
        e.add_source(legistar_api_url, note='api')

        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        if 'event_details' in event:
            # if there is not a meeting detail page on legistar
            # don't capture the agenda data from the API
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in
                    # the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

                # The EventItemAgendaSequence provides
                # the line number of the Legistar agenda grid.
                agenda_item['extras']['item_agenda_sequence'] = item[
                    'EventItemAgendaSequence']

            # Historically, the Legistar system has duplicated the
            # EventItemAgendaSequence, resulting in data inaccuracies.
            # The scrape should fail in such cases, until Metro cleans
            # the data.
            item_agenda_sequences = [
                item['extras']['item_agenda_sequence'] for item in e.agenda
            ]
            if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                # Adjacent literals, not backslash continuations, so that
                # source indentation does not leak into the message.
                error_msg = (
                    'An agenda has duplicate agenda items on the Legistar grid: '
                    '{event_name} on {event_date} ({legistar_api_url}). '
                    'Contact Metro, and ask them to remove the duplicate '
                    'EventItemAgendaSequence.'
                )
                raise ValueError(
                    error_msg.format(
                        event_name=e.name,
                        event_date=e.start_date.strftime("%B %d, %Y"),
                        legistar_api_url=legistar_api_url))

        e.add_participant(name=body_name, type="organization")

        if event.get('SAPEventId'):
            e.add_source(self.BASE_URL +
                         '/events/{0}'.format(event['SAPEventId']),
                         note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=30):
    """Yield one ``Event`` per meeting returned by ``self.events``.

    Args:
        window: Number of days to look back from the current UTC time.
            It is converted with ``float`` and handed to
            ``datetime.timedelta``.

    Yields:
        ``Event`` objects carrying the recording link, published agenda and
        minutes, the hosting organization, agenda items (with bills and
        per-item recordings), the people marked "Present" in the roll calls,
        and the API / web sources.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    self.retry_wait_seconds = 20

    # Notes containing any of these phrases mark the meeting as cancelled.
    cancel_phrases = (
        "rescheduled to", "postponed to", "reconvened to",
        "meeting recessed", "recessed meeting", "recessed until",
        "deferred", "time change", "date change",
        "recessed meeting - reconvene", "cancelled", "new date and time",
        "rescheduled indefinitely", "rescheduled for",
    )
    # Notes that are recognised but leave the API status untouched.
    unchanged_notices = (
        "meeting reconvened", "reconvened meeting", "recessed meeting",
        "reconvene meeting", "rescheduled hearing", "rescheduled meeting",
        "amended notice of meeting", "room change", "amended notice",
        "change of location", "revised - meeting date and time",
    )

    for api_event, event in self.events(n_days_ago):
        description = api_event["EventComment"]
        when = api_event["start"]
        location = api_event["EventLocation"]

        if location == "Council Chambers":
            location = ("Council Chambers, 5th Floor, City-County Building, "
                        "414 Grant Street, Pittsburgh, PA 15219")

        if not location:
            continue

        # Start from the API status so that every branch below leaves
        # ``status`` bound; previously the "room" branch did not assign it.
        status = api_event["status"]
        status_string = api_event["status"]
        # NOTE(review): if api_event["status"] is a plain string,
        # status_string[1] is just its second character -- confirm the
        # intended shape of this value.
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if (any(phrase in status_text for phrase in cancel_phrases)
                    or status_text in ("rescheduled", "recessed")):
                status = "cancelled"
            elif status_text in unchanged_notices:
                pass
            elif "room" in status_text:
                location = status_string[1] + ", " + location
            elif status_text in ("wrong meeting date",):
                continue
            else:
                print(status_text)

        if event["Meeting Name"] == "Post Agenda":
            event_name = "Agenda Announcement"
        elif event["Meeting Name"] == "City Council":
            event_name = "Regular meeting"
        else:
            event_name = event["Meeting Name"]

        if description:
            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event_name,
                      start_date=when,
                      location_name=location,
                      status=status)

        e.pupa_id = str(api_event["EventId"])

        # Resolve the meeting video link once; the agenda items below reuse
        # it instead of re-resolving it (and re-indexing "url") per item.
        video_url = None
        meeting_video = event["Meeting video"]
        if meeting_video != "Not\xa0available" and "url" in meeting_video:
            video_url = self.get_meeting_video_link(meeting_video["url"])
            e.add_media_link(note="Recording",
                             url=video_url,
                             type="recording",
                             media_type="text/html")

        self.addDocs(e, event, "Published agenda")
        self.addDocs(e, event, "Published minutes")

        participant = event["Meeting Name"]
        if participant == "City Council" or participant == "Post Agenda":
            participant = "Pittsburgh City Council"
        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            clean_title = self.clean_agenda_item_title(item["EventItemTitle"])
            agenda_item = e.add_agenda_item(clean_title)

            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

            # Only link an item recording when a meeting video was resolved.
            if item["EventItemVideo"] and video_url is not None:
                item_video_url = (video_url + '?view_id=2&meta_id='
                                  + str(item["EventItemVideo"]))
                agenda_item.add_media_link(note="Recording",
                                           url=item_video_url,
                                           type="recording",
                                           media_type="text/html")

        participants = set()
        for call in self.rollcalls(api_event):
            if call["RollCallValueName"] == "Present":
                participants.add(call["RollCallPersonName"])

        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + "/events/{EventId}".format(**api_event),
                     note="api")

        try:
            detail_url = event["Meeting Details"]["url"]
        except TypeError:
            # "Meeting Details" is not a mapping here, so fall back to the
            # events listing page.
            e.add_source(self.EVENTSPAGE, note="web")
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note="web")

        yield e
def scrape(self, follow_links=True):
    """Yield an ``Event`` for every row of the events table on each page.

    Args:
        follow_links: When true and the row has a "Meeting Details" link,
            the detail page is fetched and its agenda table is parsed into
            agenda items. Otherwise the event is emitted without agenda
            items and ``EVENTSPAGE`` is recorded as its source.

    Yields:
        ``Event`` objects with media link, documents, agenda items and the
        hosting organization attached.
    """
    cancel_phrases = ('rescheduled to', 'postponed to', 'reconvened to',
                      'recessed', 'cancelled', 'new date and time',
                      'rescheduled indefinitely', 'rescheduled for')

    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            # Defaults keep both names bound when no detail page is followed.
            detail_url = None
            agenda = []
            if follow_links and isinstance(events['Meeting\xa0Details'], dict):
                detail_url = events['Meeting\xa0Details']['url']
                meeting_details = self.lxmlize(detail_url)
                agenda_table = meeting_details.xpath(
                    "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0]
                agenda = self.parseDataTable(agenda_table)

            location_string = events[u'Meeting\xa0Location']
            location_list = location_string.split('--')
            location = ', '.join(location_list[0:2])

            when = events[u'Meeting\xa0Date']
            time_string = events[u'Meeting\xa0Time']
            event_time = datetime.datetime.strptime(time_string, "%I:%M %p")
            # Carry over the minutes as well as the hour.
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            # Text after 'Chicago, Illinois' in the last location fragment is
            # treated as a status note.
            status_string = location_list[-1].split('Chicago, Illinois')
            cancelled = False
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if (any(phrase in status_text for phrase in cancel_phrases)
                        or status_text == 'rescheduled'):
                    cancelled = True
                else:
                    # Unrecognised note: report it and fall back to the
                    # time-based status below.
                    print(status_text)

            if cancelled:
                status = 'cancelled'
            elif datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > when:
                # The start time is already behind us.
                status = 'passed'
            else:
                status = 'confirmed'

            e = Event(name=events["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location=location,
                      status=status)
            e.add_source(detail_url if detail_url else EVENTSPAGE)

            if events['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=events['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_type in ('Agenda', 'Notice', 'Transcript', 'Summary'):
                addDocs(e, events, doc_type)

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                # Some rows carry no record link; only attach a bill when
                # there is one.
                if item["Record #"]:
                    agenda_item.add_bill(item["Record #"]['label'])

            e.add_participant(name=events["Name"]["label"],
                              type="organization")

            yield e
def scrape(self):
    """Yield an ``Event`` for each meeting from ``self.events(since=2011)``.

    Meetings with an empty location are skipped, as are meetings whose
    (name, start time) pair matches one of the ten most recently emitted
    events.
    """
    recent_ids = deque(maxlen=10)

    for event, agenda in self.events(since=2011):
        joint_note = ''
        location_notes = []
        raw_location = event[u'Meeting Location']

        if '--em--' in raw_location:
            location_string, note = raw_location.split('--em--')[:2]
            # The emphasised note mixes "Jointly with ..." text with other
            # remarks about the location.
            for fragment in note.split(' - '):
                if fragment.startswith('Join'):
                    joint_note = fragment
                    continue
                location_notes.append(fragment)
        else:
            location_string = raw_location

        location_parts = location_string.split('-', 2)[0:2]
        location = ', '.join(part.strip() for part in location_parts)
        if not location:
            continue

        ical_start = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = self.toTime(event[u'Meeting Date']).replace(
            hour=ical_start.hour, minute=ical_start.minute)

        if event['Meeting Time'] == 'Deferred':
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        description = event['Meeting\xa0Topic']
        for placeholder in ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED'):
            if placeholder in description:
                description = ''
                break

        event_name = event['Name']
        event_id = (event_name, when)
        if event_id in recent_ids:
            continue
        recent_ids.append(event_id)

        e = Event(name=event_name,
                  start_time=when,
                  timezone=self.TIMEZONE,
                  description=description,
                  location_name=location,
                  status=status)

        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        multimedia = event['Multimedia']
        if multimedia != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=multimedia['url'],
                             type="recording",
                             media_type='text/html')

        for doc_type in ('Agenda', 'Minutes'):
            self.addDocs(e, event, doc_type)

        if event_name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event_name.lower():
            participating_orgs = [event_name]
        else:
            participating_orgs = []

        if joint_note:
            co_hosts = re.sub('Jointl*y with the ', '', joint_note)
            participating_orgs.extend(re.split(' and the |, the ', co_hosts))

        for org in participating_orgs:
            e.add_committee(name=org)

        if not agenda:
            e.add_source(self.EVENTSPAGE)
        else:
            e.add_source(event["Meeting Details"]['url'])
            for item, _, _ in agenda:
                if not item["Name"]:
                    continue
                agenda_item = e.add_agenda_item(item["Name"])
                matter = item["File\xa0#"]
                if matter:
                    # Use the recorded action as the note when there is one.
                    note = item['Action'] or 'consideration'
                    agenda_item.add_bill(matter['label'], note=note)

        yield e
def scrape(self):
    """Yield an ``Event`` for each meeting returned by ``self.events()``.

    The location string is split on ``'--'``; any text after
    ``'Chicago, Illinois'`` in its last fragment is read as a status note
    that can cancel the meeting, amend the location, drop the row, or become
    the description. People marked 'Present' in an item's roll call are
    added as participants.
    """
    cancel_phrases = (
        'rescheduled to', 'postponed to', 'reconvened to',
        'meeting recessed', 'recessed meeting', 'recessed until',
        'deferred', 'time change', 'date change',
        'recessed meeting - reconvene', 'cancelled', 'new date and time',
        'rescheduled indefinitely', 'rescheduled for',
    )
    # Recognised notes that do not change the time-based status.
    neutral_notices = (
        'meeting reconvened', 'reconvened meeting', 'recessed meeting',
        'reconvene meeting', 'rescheduled hearing', 'rescheduled meeting',
        'amended notice of meeting', 'room change', 'amended notice',
        'change of location', 'revised - meeting date and time',
    )

    for event, agenda in self.events():
        description = None

        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour, minute=event_time.minute)

        status_string = location_list[-1].split('Chicago, Illinois')
        cancelled = False
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if (any(phrase in status_text for phrase in cancel_phrases)
                    or status_text in ('rescheduled', 'recessed')):
                cancelled = True
            elif status_text in neutral_notices:
                pass
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()

        # Computed once here so every branch above (including the 'room'
        # branch, which used to leave it unset) ends with a bound status.
        status = 'cancelled' if cancelled else confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        for doc_type in ('Agenda', 'Notice', 'Transcript', 'Summary'):
            self.addDocs(e, event, doc_type)

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'
        e.add_participant(name=participant, type="organization")

        participants = set()
        if agenda:
            e.add_source(event['Meeting Details']['url'], note='web')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                if item["Record #"]:
                    identifier = item["Record #"]['label']
                    # Drop a leading 'S' from the record identifier.
                    if identifier.startswith('S'):
                        identifier = identifier[1:]
                    agenda_item.add_bill(identifier)
                elif ('label' in item['Action\xa0Details']
                      and item['Action\xa0Details']['label'] == 'Roll\xa0call'):
                    roll_call = self.extractRollCall(
                        item['Action\xa0Details']['url'])
                    for attendance, person in roll_call:
                        if attendance == 'Present':
                            participants.add(person)

            for person in participants:
                e.add_participant(name=person, type="person")
        else:
            e.add_source(self.EVENTSPAGE, note='web')

        yield e
def scrape(self):
    """Yield an ``Event`` for each row of the 'archive' and 'upcoming' tables.

    The meeting page is fetched once. For every row the agenda link, when
    present, is followed and its ``metaviewer.php`` links are attached as
    documents. Video, audio and minutes are only looked up for 'archive'
    rows.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # Attempt to map the cells across table types.
            # If the sizes mismatch, ignore this one (it's an "empty" message).
            try:
                cell_mapping = self._organize_cells(meeting_type,
                                                    meeting.cssselect('td'))
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title, when=meeting_date, location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # Detect agenda url, if present.
            meeting_agenda_url = None
            agenda_links = cell_mapping['agenda'].cssselect('a')
            if agenda_links:
                meeting_agenda_url = agenda_links[0].attrib.get('href')

            # Follow the agenda URL and attempt to extract associated documents.
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not link_url:
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text, url=link_url,
                                           mimetype='application/pdf')

            # Media cells are only read for the 'archive' table. The event
            # itself is still yielded for 'upcoming' rows; the former
            # ``continue`` here skipped the ``yield`` and dropped them.
            if meeting_type != 'upcoming':
                # Detect video.
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # Capture the URL without the closing quote of the
                    # onclick handler.
                    video_url_match = re.search(
                        r"(http://.*?)'",
                        video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video",
                                         url=video_url_match.group(1),
                                         mimetype='text/html')

                # Detect audio.
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio",
                                     url=audio_cell[0].attrib.get('href', ''),
                                     mimetype='audio/mpeg')

                # Detect minutes.
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes",
                                     url=minutes_cell[0].attrib.get('href', ''),
                                     mimetype='text/html')

            yield e
def scrape(self, window=None):
    """Yield an ``Event`` for each (API event, web event) pair.

    Args:
        window: Optional number of days to look back from the current UTC
            time. When falsy, ``None`` is passed to ``self.events``.

    Yields:
        ``Event`` objects with agenda items, the hosting organization,
        agenda/minutes documents, audio link, recap document and sources.
        An event whose audio redirect has no ``Location`` header is not
        yielded at all.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    status_by_name = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    for event, web_event in self.events(n_days_ago):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only, so a hyphen inside the
            # remainder does not break the two-name unpacking.
            body_name, event_name = [
                part.strip() for part in body_name.split('-', 1)
            ]
        else:
            event_name = body_name

        status = status_by_name.get(event['EventAgendaStatusName'],
                                    'tentative')

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=event["EventLocation"],
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        e.extras = {'guid': event['EventGuid']}

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the
                # agenda minutes.
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                     note='api')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        # Update 'e' with data from https://metro.legistar.com/Calendar.aspx,
        # if that data exists.
        if web_event['Audio'] != 'Not\xa0available':
            try:
                redirect_url = self.head(
                    web_event['Audio']['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain the
                # location of the audio file. Skip these events, and retry
                # on next scrape.
                continue

            e.add_media_link(note=web_event['Audio']['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if web_event['Meeting Details'] != 'Meeting\xa0details':
            details_url = web_event['Meeting Details']['url']
            # Only record the detail page when it answers with HTTP 200.
            if requests.head(details_url).status_code == 200:
                e.add_source(details_url, note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

        yield e
def scrape(self):
    """Yield an ``Event`` for each meeting from ``self.events(since=2017)``.

    The start time is taken from the meeting's iCalendar file, which is
    downloaded per event. Meetings with an empty location are skipped, as
    are meetings whose (name, start time) pair matches one of the ten most
    recently emitted events.
    """
    recent_ids = deque(maxlen=10)

    for event, agenda in self.events(since=2017):
        joint_note = ''
        location_notes = []
        raw_location = event[u'Meeting Location']

        if '--em--' in raw_location:
            location_string, note = raw_location.split('--em--')[:2]
            # The emphasised note mixes "Jointly with ..." text with other
            # remarks about the location.
            for fragment in note.split(' - '):
                if fragment.startswith('Join'):
                    joint_note = fragment
                    continue
                location_notes.append(fragment)
        else:
            location_string = raw_location

        location_parts = location_string.split('-', 2)[0:2]
        location = ', '.join(part.strip() for part in location_parts)
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])

        # NOTE(review): verify=False is passed through to self.get for the
        # iCalendar download -- confirm this is still required.
        response = self.get(event['iCalendar']['url'], verify=False)
        calendar = self.ical(response.text)
        ical_start = calendar.subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=ical_start.hour, minute=ical_start.minute)

        if event['Meeting Time'] == 'Deferred':
            status = 'cancelled'
        elif self.now() < when:
            status = 'confirmed'
        else:
            status = 'passed'

        description = event['Meeting\xa0Topic']
        for placeholder in ('Multiple meeting items', 'AGENDA TO BE ANNOUNCED'):
            if placeholder in description:
                description = ''
                break

        event_name = event['Name']
        event_id = (event_name, when)
        if event_id in recent_ids:
            continue
        recent_ids.append(event_id)

        e = Event(name=event_name,
                  start_date=when,
                  description=description,
                  location_name=location,
                  status=status)

        if location_notes:
            e.extras = {'location note': ' '.join(location_notes)}

        multimedia = event['Multimedia']
        if multimedia != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=multimedia['url'],
                             type="recording",
                             media_type='text/html')

        for doc_type in ('Agenda', 'Minutes'):
            self.addDocs(e, event, doc_type)

        if event_name == 'City Council Stated Meeting':
            participating_orgs = ['New York City Council']
        elif 'committee' in event_name.lower():
            participating_orgs = [event_name]
        else:
            participating_orgs = []

        if joint_note:
            co_hosts = re.sub('Jointl*y with the ', '', joint_note)
            participating_orgs.extend(re.split(' and the |, the ', co_hosts))

        for org in participating_orgs:
            e.add_committee(name=org)

        if not agenda:
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            e.add_source(event["Meeting Details"]['url'], note='web')
            for item, _, _ in agenda:
                if not item["Name"]:
                    continue
                agenda_item = e.add_agenda_item(item["Name"])
                matter = item["File\xa0#"]
                if matter:
                    # Use the recorded action as the note when there is one.
                    note = item['Action'] or 'consideration'
                    agenda_item.add_bill(matter['label'], note=note)

        yield e
def scrape(self):
    """Yield an ``Event`` for each meeting returned by ``self.events()``.

    The location string is split on ``'--'``; any text after
    ``'Chicago, Illinois'`` in its last fragment is read as a status note
    that can cancel the meeting, amend the location, drop the row, or become
    the description.
    """
    cancel_phrases = (
        'rescheduled to', 'postponed to', 'reconvened to',
        'meeting recessed', 'recessed meeting', 'recessed until',
        'deferred', 'time change', 'date change',
        'recessed meeting - reconvene', 'cancelled', 'new date and time',
        'rescheduled indefinitely', 'rescheduled for',
    )
    # Recognised notes that do not change the time-based status.
    neutral_notices = (
        'meeting reconvened', 'reconvened meeting', 'recessed meeting',
        'reconvene meeting', 'rescheduled hearing', 'rescheduled meeting',
        'amended notice of meeting', 'room change', 'amended notice',
        'change of location', 'revised - meeting date and time',
    )

    for event, agenda in self.events():
        description = None

        location_string = event[u'Meeting Location']
        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        when = self.toTime(event[u'Meeting Date'])
        event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
        when = when.replace(hour=event_time.hour, minute=event_time.minute)

        status_string = location_list[-1].split('Chicago, Illinois')
        cancelled = False
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if (any(phrase in status_text for phrase in cancel_phrases)
                    or status_text in ('rescheduled', 'recessed')):
                cancelled = True
            elif status_text in neutral_notices:
                pass
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                print(status_text)
                description = status_string[1]

        # Computed once here so every branch above (including the 'room'
        # branch, which used to leave it unset) ends with a bound status.
        status = 'cancelled' if cancelled else confirmedOrPassed(when)

        if description:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      description=description,
                      timezone='US/Central',
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status)

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        for doc_type in ('Agenda', 'Notice', 'Transcript', 'Summary'):
            self.addDocs(e, event, doc_type)

        e.add_participant(name=event["Name"]["label"], type="organization")

        if agenda:
            e.add_source(event['Name']['url'])

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                if item["Record #"]:
                    agenda_item.add_bill(item["Record #"]['label'])
        else:
            e.add_source(self.EVENTSPAGE)

        yield e
def scrape(self):
    """Yield an ``Event`` for each row of the 'archive' and 'upcoming' tables.

    The meeting page is fetched once. For every row the agenda link, when
    present, is followed and its ``metaviewer.php`` links are attached as
    documents. Video, audio and minutes are only looked up for 'archive'
    rows.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # Attempt to map the cells across table types.
            # If the sizes mismatch, ignore this one (it's an "empty" message).
            try:
                cell_mapping = self._organize_cells(
                    meeting_type, meeting.cssselect('td'))
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            meeting_date = datetime.datetime.fromtimestamp(
                int(cell_mapping['date'].cssselect('span')[0].text))

            e = Event(name=meeting_title,
                      when=meeting_date,
                      location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # Detect agenda url, if present.
            meeting_agenda_url = None
            agenda_links = cell_mapping['agenda'].cssselect('a')
            if agenda_links:
                meeting_agenda_url = agenda_links[0].attrib.get('href')

            # Follow the agenda URL and attempt to extract associated documents.
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda',
                               url=meeting_agenda_url,
                               mimetype='text/html')
                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(
                    meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if not link_url:
                        continue
                    if 'metaviewer.php' in link_url.lower():
                        # NOTE: application/pdf is a guess, may not always be correct
                        if link.text is not None:
                            e.add_document(name=link.text,
                                           url=link_url,
                                           mimetype='application/pdf')

            # Media cells are only read for the 'archive' table. The event
            # itself is still yielded for 'upcoming' rows; the former
            # ``continue`` here skipped the ``yield`` and dropped them.
            if meeting_type != 'upcoming':
                # Detect video.
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # Capture the URL without the closing quote of the
                    # onclick handler.
                    video_url_match = re.search(
                        r"(http://.*?)'",
                        video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video",
                                         url=video_url_match.group(1),
                                         mimetype='text/html')

                # Detect audio.
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio",
                                     url=audio_cell[0].attrib.get('href', ''),
                                     mimetype='audio/mpeg')

                # Detect minutes.
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes",
                                     url=minutes_cell[0].attrib.get('href', ''),
                                     mimetype='text/html')

            yield e
def scrape(self, window=None):
    """Yield an ``Event`` for each merged (API event, web event) pair.

    Args:
        window: Optional number of days to look back from the current UTC
            time. When falsy, ``None`` is passed to ``self.events``.

    Yields:
        ``Event`` objects with GUID extras, agenda items, the hosting
        organization, API sources, agenda/minutes documents, audio links,
        recap document and web sources.
    """
    if window:
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
    else:
        n_days_ago = None

    status_by_name = {
        'Draft': 'confirmed',
        'Final': 'passed',
        'Canceled': 'cancelled',
    }

    events = self.events(n_days_ago)

    for event, web_event in self._merge_events(events):
        body_name = event["EventBodyName"]

        if 'Board of Directors -' in body_name:
            # Split on the first hyphen only, so a hyphen inside the
            # remainder does not break the two-name unpacking.
            body_name, event_name = [
                part.strip() for part in body_name.split('-', 1)
            ]
        else:
            event_name = body_name

        status = status_by_name.get(event['EventAgendaStatusName'],
                                    'tentative')

        location = event["EventLocation"]
        if not location:
            # We expect some events to have no location. LA Metro would
            # like these displayed in the Councilmatic interface. However,
            # OCD requires a value for this field. Add a sane default.
            location = 'Not available'

        e = Event(event_name,
                  start_date=event["start"],
                  description='',
                  location_name=location,
                  status=status)

        e.pupa_id = str(event['EventId'])

        # Metro requires the EventGuid to build out MediaPlayer links.
        # Add both the English event GUID, and the Spanish event GUID if
        # it exists, to the extras dict.
        e.extras = {'guid': event['EventGuid']}
        if event.get('SAPEventGuid'):
            e.extras['sap_guid'] = event['SAPEventGuid']

        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the
                # agenda minutes.
                note = "Agenda number, {}".format(
                    item["EventItemAgendaNumber"])
                agenda_item['notes'].append(note)

        e.add_participant(name=body_name, type="organization")

        e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                     note='api')

        if event.get('SAPEventId'):
            e.add_source(
                self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                note='api (sap)')

        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf")

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf")

        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet contain the
                # location of the audio file. This audio entry is skipped
                # and picked up again on a later scrape.
                continue

            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html')

        if web_event['Recap/Minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Recap/Minutes']['label'],
                           url=web_event['Recap/Minutes']['url'],
                           media_type="application/pdf")

        if event['event_details']:
            for link in event['event_details']:
                e.add_source(**link)
        else:
            e.add_source('https://metro.legistar.com/Calendar.aspx',
                         note='web')

        yield e
def scrape(self, window=3):
    """Yield an ``Event`` for each (API event, web event) pair.

    Args:
        window: Number of days to look back from the current UTC time.
            It is converted with ``float`` and handed to
            ``datetime.timedelta``.

    Yields:
        ``Event`` objects with media link, documents, hosting organization,
        agenda items, the people marked 'Present' in the roll calls, and
        the API / web sources.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    cancel_phrases = (
        'rescheduled to', 'postponed to', 'reconvened to',
        'meeting recessed', 'recessed meeting', 'recessed until',
        'deferred', 'time change', 'date change',
        'recessed meeting - reconvene', 'cancelled', 'new date and time',
        'rescheduled indefinitely', 'rescheduled for',
    )
    # Recognised notes that leave the API status untouched.
    neutral_notices = (
        'meeting reconvened', 'reconvened meeting', 'recessed meeting',
        'reconvene meeting', 'rescheduled hearing', 'rescheduled meeting',
        'amended notice of meeting', 'room change', 'amended notice',
        'change of location', 'revised - meeting date and time',
    )

    for api_event, event in self.events(n_days_ago):
        description = None

        when = api_event['start']
        location_string = event[u'Meeting Location']

        location_list = location_string.split('--', 2)
        location = ', '.join(location_list[0:2])
        if not location:
            continue

        # Start from the API status so that every branch below leaves
        # ``status`` bound; previously the 'room' branch did not assign it.
        status = api_event['status']

        # Text after 'Chicago, Illinois' in the last location fragment is
        # treated as a status note.
        status_string = location_list[-1].split('Chicago, Illinois')
        if len(status_string) > 1 and status_string[1]:
            status_text = status_string[1].lower()
            if (any(phrase in status_text for phrase in cancel_phrases)
                    or status_text in ('rescheduled', 'recessed')):
                status = 'cancelled'
            elif status_text in neutral_notices:
                pass
            elif 'room' in status_text:
                location = status_string[1] + ', ' + location
            elif status_text in ('wrong meeting date',):
                continue
            else:
                print(status_text)
                description = status_string[1].replace('--em--', '').strip()

        if description:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)
        else:
            e = Event(name=event["Name"]["label"],
                      start_date=when,
                      location_name=location,
                      status=status)

        e.pupa_id = str(api_event['EventId'])

        if event['Video'] != 'Not\xa0available':
            e.add_media_link(note='Recording',
                             url=event['Video']['url'],
                             type="recording",
                             media_type='text/html')

        for doc_type in ('Agenda', 'Notice', 'Captions', 'Summary'):
            self.addDocs(e, event, doc_type)

        participant = event["Name"]["label"]
        if participant == 'City Council':
            participant = 'Chicago City Council'
        elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            participant = 'Committee on Energy, Environmental Protection and Public Utilities'

        e.add_participant(name=participant, type="organization")

        for item in self.agenda(api_event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])
            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

        participants = set()
        for call in self.rollcalls(api_event):
            if call['RollCallValueName'] == 'Present':
                participants.add(call['RollCallPersonName'])

        for person in participants:
            e.add_participant(name=person, type="person")

        e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                     note='api')

        try:
            detail_url = event['Meeting Details']['url']
        except TypeError:
            # "Meeting Details" is not a mapping here, so fall back to the
            # events listing page.
            e.add_source(self.EVENTSPAGE, note='web')
        else:
            if requests.head(detail_url).status_code == 200:
                e.add_source(detail_url, note='web')

        yield e