Beispiel #1
0
    def parse_div(self, row, chamber, com):
        """Turn one hearing-schedule row into an Event and yield it.

        Args:
            row: Node supporting ``xpath`` that holds a single hearing entry.
            chamber: Not used by this method.
            com: Committee name recorded as host unless the event title
                mentions a subcommittee.

        Yields:
            The Event assembled from the row.

        Raises:
            IndexError: If the row has no calendar-marker link, a linked
                agenda entry has no ``href``, or a bill row has no link text.
        """
        gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        title, location, start_date, end_date = self.parse_gcal(gcal_href)

        event = Event(
            name=title,
            location_name=location,
            start_date=start_date,
            end_date=end_date,
        )
        event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        # Free-text rows become bare agenda items.
        for text_row in row.xpath('.//div[@class="col-xs-12a Item"]'):
            event.add_agenda_item(
                description=text_row.xpath('string(.)').strip())

        # Links sitting directly inside an item container are listed on the
        # agenda and also attached as documents.
        for link in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            link_text = link.xpath('string(.)').strip()
            event.add_agenda_item(description=link_text)
            event.add_document(
                link_text,
                link.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # Containers holding a "col-xs-1 Item" cell carry a bill identifier.
        bill_rows = row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]')
        for bill_row in bill_rows:
            agenda_item = event.add_agenda_item(
                description=bill_row.xpath('string(.)').strip())
            bill_id = bill_row.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0]
            agenda_item.add_bill(bill_id.strip())

        video_links = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video_links:
            event.add_media_link(
                'Video of Hearing',
                video_links[0].xpath('@href')[0],
                'text/html',
            )

        # A title mentioning a subcommittee names the host before its first
        # hyphen; otherwise the committee passed in by the caller hosts.
        if 'subcommittee' in title.lower():
            host = title.split('-')[0].strip()
        else:
            host = com
        event.add_participant(host, type='committee', note='host')

        yield event
Beispiel #2
0
def test_full_event():
    """Import an all-day event that has one person and one media link.

    The test makes no explicit assertions; it passes when the import
    completes without raising.
    """
    Jurisdiction.objects.create(id='jid', division_id='did')

    scraped = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04",
        location="America",
        all_day=True,
    )
    scraped.add_person("George Washington")
    scraped.add_media_link("fireworks", "http://example.com/fireworks.mov")

    importer = EventImporter('jid')
    importer.import_data([scraped.as_dict()])
Beispiel #3
0
    def parse_div(self, row, chamber, com):
        """Turn one hearing-schedule row into an Event and yield it.

        Args:
            row: Node supporting ``xpath`` that holds a single hearing entry.
            chamber: Not used by this method.
            com: Committee name recorded as host unless the event title
                mentions a subcommittee.

        Yields:
            The Event assembled from the row.

        Raises:
            IndexError: If the row has no calendar-marker link, a linked
                agenda entry has no ``href``, or a bill row has no link text.
        """
        gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        title, location, start_date, end_date = self.parse_gcal(gcal_href)

        event = Event(
            name=title,
            location_name=location,
            start_date=start_date,
            end_date=end_date,
        )
        event.add_source(
            'http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        # Free-text rows become bare agenda items.
        for text_row in row.xpath('.//div[@class="col-xs-12a Item"]'):
            event.add_agenda_item(
                description=text_row.xpath('string(.)').strip())

        # Links sitting directly inside an item container are listed on the
        # agenda and also attached as documents.
        for link in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            link_text = link.xpath('string(.)').strip()
            event.add_agenda_item(description=link_text)
            event.add_document(
                link_text,
                link.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # Containers holding a "col-xs-1 Item" cell carry a bill identifier.
        bill_rows = row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]')
        for bill_row in bill_rows:
            agenda_item = event.add_agenda_item(
                description=bill_row.xpath('string(.)').strip())
            bill_id = bill_row.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0]
            agenda_item.add_bill(bill_id.strip())

        video_links = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video_links:
            event.add_media_link(
                'Video of Hearing',
                video_links[0].xpath('@href')[0],
                'text/html',
            )

        # A title mentioning a subcommittee names the host before its first
        # hyphen; otherwise the committee passed in by the caller hosts.
        if 'subcommittee' in title.lower():
            host = title.split('-')[0].strip()
        else:
            host = com
        event.add_participant(host, type='committee', note='host')

        yield event
Beispiel #4
0
 def scrape(self):
     """Yield one Event per entry of the module-level ``cal_list``.

     Each event is named after its committee, starts at the localized
     ``MeetingTime`` and, when the marked agenda is flagged as published,
     carries an "Agenda" media link built from ``AGENDA_BASE_URL``.
     """
     source_url = 'https://lims.minneapolismn.gov/Calendar/GetCalenderList?'

     for meeting in cal_list:
         naive_start = datetime.strptime(meeting['MeetingTime'],
                                         CAL_DATE_FORMAT)
         localized_start = tz.localize(naive_start)

         committee = meeting['CommitteeName']
         record = Event(name=committee,
                        start_date=localized_start,
                        location_name=meeting['Location'])
         record.add_committee(committee)
         record.add_source(source_url)

         # Explicit comparison against True is kept on purpose so that
         # other truthy values are still treated as "not published".
         if meeting['MarkedAgendaPublished'] == True:  # noqa: E712
             agenda_url = "{0}{1}/{2}".format(AGENDA_BASE_URL,
                                              meeting['Abbreviation'],
                                              meeting['AgendaId'])
             record.add_media_link(note="Agenda",
                                   url=agenda_url,
                                   media_type="link")

         yield record
    def scrape(self, window=3):
        """Yield an Event for each meeting ``self.events`` returns.

        Args:
            window: Number of days to look back from the current UTC time;
                passed through ``float`` before building the timedelta.

        Yields:
            Event objects carrying a recording link (when available),
            agenda/minutes documents, participating committees, agenda
            items, attendees recorded as 'Present', and source URLs.
        """
        since = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        placeholder_topics = ('Multiple meeting items',
                              'AGENDA TO BE ANNOUNCED')

        for api_event, event in self.events(since):
            when = api_event['start']
            location = api_event['EventLocation']

            # Placeholder topics are not useful as descriptions.
            description = event['Meeting\xa0Topic']
            if any(topic in description for topic in placeholder_topics):
                description = None

            # ``description`` is only passed to Event when there is one.
            event_kwargs = {}
            if description:
                event_kwargs['description'] = description
            meeting = Event(name=api_event["EventBodyName"],
                            start_date=when,
                            location_name=location,
                            status=api_event['status'],
                            **event_kwargs)

            meeting.pupa_id = str(api_event['EventId'])

            multimedia = event['Multimedia']
            if multimedia != 'Not\xa0available':
                meeting.add_media_link(note='Recording',
                                       url=multimedia['url'],
                                       type="recording",
                                       media_type='text/html')

            for doc_kind in ('Agenda', 'Minutes'):
                self.addDocs(meeting, event, doc_kind)

            location_notes, other_orgs = self._parse_location(
                event[u'Meeting Location'])
            if location_notes:
                meeting.extras = {'location note': ' '.join(location_notes)}

            # Work out which bodies take part in the meeting.
            if meeting.name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in meeting.name.lower():
                participating_orgs = [meeting.name]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs.extend(
                    re.split(' and the |, the ', other_orgs))

            for org in participating_orgs:
                meeting.add_committee(name=org)

            for item in self.agenda(api_event):
                agenda_item = meeting.add_agenda_item(item["EventItemTitle"])
                matter_file = item["EventItemMatterFile"]
                if matter_file:
                    agenda_item.add_bill(matter_file)

            # De-duplicate attendees before adding them as participants.
            attendees = {
                call['RollCallPersonName'].strip()
                for call in self.rollcalls(api_event)
                if call['RollCallValueName'] == 'Present'
            }
            for person in attendees:
                meeting.add_participant(name=person, type="person")

            api_url = self.BASE_URL + '/events/{EventId}'.format(**api_event)
            meeting.add_source(api_url, note='api')

            # 'Meeting Details' is not subscriptable by 'url' when there is
            # no detail link; fall back to the events page in that case.
            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                meeting.add_source(self.EVENTSPAGE, note='web')
            else:
                if requests.head(detail_url).status_code == 200:
                    meeting.add_source(detail_url, note='web')

            yield meeting
Beispiel #6
0
    def scrape(self, window=3):
        """Yield an Event for each meeting ``self.events`` returns.

        The trailing part of the web 'Meeting Location' string (everything
        after 'Chicago, Illinois') is treated as a free-text notice that may
        cancel the meeting, amend its location, mark it as bogus, or supply a
        description.

        Args:
            window: Number of days to look back from the current UTC time;
                passed through ``float`` before building the timedelta.

        Yields:
            Event objects with media links, documents, participants, agenda
            items and source URLs attached.
        """
        n_days_ago = (datetime.datetime.utcnow() -
                      datetime.timedelta(float(window)))

        # Notice fragments that mean the meeting will not happen as listed.
        cancelling_phrases = (
            'rescheduled to',
            'postponed to',
            'reconvened to',
            'meeting recessed',
            'recessed meeting',
            'recessed until',
            'deferred',
            'time change',
            'date change',
            'recessed meeting - reconvene',
            'cancelled',
            'new date and time',
            'rescheduled indefinitely',
            'rescheduled for',
        )
        # Exact notices that leave the API status untouched.  ('recessed
        # meeting' is not listed: it is already caught as a cancelling phrase.)
        informational_notices = (
            'meeting reconvened',
            'reconvened meeting',
            'reconvene meeting',
            'rescheduled hearing',
            'rescheduled meeting',
            'amended notice of meeting',
            'room change',
            'amended notice',
            'change of location',
            'revised - meeting date and time',
        )

        for api_event, event in self.events(n_days_ago):

            description = None

            when = api_event['start']
            location_string = event[u'Meeting Location']

            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location:
                continue

            # None means "no override": fall back to the API status below.
            # Starting from a known value fixes the 'room' branch, which
            # previously left ``status`` unassigned (or stale from the
            # previous event).
            status = None

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if any(phrase in status_text
                       for phrase in cancelling_phrases):
                    status = 'cancelled'
                elif status_text in ('rescheduled', 'recessed'):
                    status = 'cancelled'
                elif status_text in informational_notices:
                    pass
                elif 'room' in status_text:
                    # The notice names a room; prepend it to the location.
                    location = status_string[1] + ', ' + location
                elif status_text in ('wrong meeting date', ):
                    continue
                else:
                    # Unrecognised notice: surface it and keep it as the
                    # event description.
                    print(status_text)
                    description = status_string[1].replace('--em--',
                                                           '').strip()

            if status is None:
                status = api_event['status']

            # ``description`` is only passed to Event when there is one.
            event_kwargs = {}
            if description:
                event_kwargs['description'] = description
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone=self.TIMEZONE,
                      location_name=location,
                      status=status,
                      **event_kwargs)

            e.pupa_id = str(api_event['EventId'])

            if event['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Notice')
            self.addDocs(e, event, 'Transcript')
            self.addDocs(e, event, 'Summary')

            # Normalise body names before recording the participant.
            participant = event["Name"]["label"]
            if participant == 'City Council':
                participant = 'Chicago City Council'
            elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
                participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=participant, type="organization")

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            participants = set()
            for call in self.rollcalls(api_event):
                if call['RollCallValueName'] == 'Present':
                    participants.add(call['RollCallPersonName'])

            for person in participants:
                e.add_participant(name=person, type="person")

            # Build the API source from the API record: it is the dict known
            # to carry 'EventId' (it is used for ``pupa_id`` above).
            e.add_source(self.BASE_URL +
                         '/events/{EventId}'.format(**api_event),
                         note='api')

            # 'Meeting Details' is not subscriptable by 'url' when there is
            # no detail link; fall back to the events page in that case.
            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                if requests.head(detail_url).status_code == 200:
                    e.add_source(detail_url, note='web')

            yield e
Beispiel #7
0
    def scrape(self, window=None):
        """Yield an Event for each merged Legistar API/web event.

        Args:
            window: Optional number of days to look back from the current UTC
                time.  A falsy value passes ``since_datetime=None`` to
                ``self.events``.

        Yields:
            Event objects with sources, extras, agenda items, documents and
            media links attached.

        Raises:
            ValueError: If two agenda items of one event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(since_datetime=n_days_ago)

        service_councils = {
            sc['BodyId'] for sc in self.search(
                '/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75')
        }

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Unpacking into two names requires exactly one hyphen.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            elif event['EventBodyId'] in service_councils:
                # Don't scrape service council or service council public hearing events.
                self.info('Skipping event {0} for {1}'.format(
                    event['EventId'], event['EventBodyName']))
                continue
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if web_event.has_ecomment:
                self.info('Adding eComment link {0} from {1}'.format(
                    web_event['eComment'],
                    web_event['Meeting Details']['url']))
                e.extras['ecomment'] = web_event['eComment']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        agenda_number = item["EventItemAgendaNumber"]
                        note = "Agenda number, {}".format(agenda_number)
                        agenda_item['notes'].append(note)

                        agenda_item['extras']['agenda_number'] = agenda_number

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    # Adjacent literals instead of backslash continuations,
                    # which embedded the source indentation in the message.
                    error_msg = (
                        'An agenda has duplicate agenda items on the '
                        'Legistar grid: {event_name} on {event_date} '
                        '({legistar_api_url}). Contact Metro, and ask them '
                        'to remove the duplicate EventItemAgendaSequence.')

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(
                    note='Agenda',
                    url=event['EventAgendaFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventAgendaLastPublishedUTC']).date())

            # Minutes: prefer the API file, then the web calendar link, then
            # whatever find_approved_minutes locates.
            if event['EventMinutesFile']:
                e.add_document(
                    note='Minutes',
                    url=event['EventMinutesFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventMinutesLastPublishedUTC']).date())
            elif web_event['Published minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Published minutes']['label'],
                               url=web_event['Published minutes']['url'],
                               media_type="application/pdf")
            else:
                approved_minutes = self.find_approved_minutes(event)
                if approved_minutes:
                    e.add_document(
                        note=approved_minutes['MatterAttachmentName'],
                        url=approved_minutes['MatterAttachmentHyperlink'],
                        media_type="application/pdf",
                        date=self.to_utc_timestamp(
                            approved_minutes['MatterAttachmentLastModifiedUtc']
                        ).date())

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                # Sometimes if there is an issue getting the Spanish
                # audio created, Metro has the Spanish Audio link
                # go to the English Audio.
                #
                # Pupa does not allow duplicate media links,
                # so we'll ignore the second media link if it's
                # the same as the first media link.
                #
                # Because of the way that the event['audio'] is created
                # the first audio link is always English and the
                # second is always Spanish
                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html',
                                 on_duplicate='ignore')

            # ``.get`` keeps this consistent with the membership test above:
            # an event without an 'event_details' key falls back to the
            # calendar page instead of raising KeyError.
            event_details = event.get('event_details')
            if event_details:
                for link in event_details:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
    def scrape(self):
        """Yield an Event for each (event, agenda) pair from ``self.events``.

        The trailing part of the 'Meeting Location' string (everything after
        'Chicago, Illinois') is treated as a free-text notice that may cancel
        the meeting, amend its location, mark it as bogus, or supply a
        description.  When the notice does not cancel the meeting, the status
        comes from ``confirmedOrPassed(when)``.

        Yields:
            Event objects with media links, documents, a participating
            organization, agenda items and a web source attached.
        """
        # Notice fragments that mean the meeting will not happen as listed.
        cancelling_phrases = (
            'rescheduled to',
            'postponed to',
            'reconvened to',
            'meeting recessed',
            'recessed meeting',
            'recessed until',
            'deferred',
            'time change',
            'date change',
            'recessed meeting - reconvene',
            'cancelled',
            'new date and time',
            'rescheduled indefinitely',
            'rescheduled for',
        )
        # Exact notices that leave the default status untouched.  ('recessed
        # meeting' is not listed: it is already caught as a cancelling phrase.)
        informational_notices = (
            'meeting reconvened',
            'reconvened meeting',
            'reconvene meeting',
            'rescheduled hearing',
            'rescheduled meeting',
            'amended notice of meeting',
            'room change',
            'amended notice',
            'change of location',
            'revised - meeting date and time',
        )

        for event, agenda in self.events():

            description = None

            location_string = event[u'Meeting Location']

            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            # Take the time of day from the iCalendar entry's DTSTART.
            event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            # None means "no override": fall back to confirmedOrPassed below.
            # Starting from a known value fixes the 'room' branch, which
            # previously left ``status`` unassigned (or stale from the
            # previous event).
            status = None

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if any(phrase in status_text
                       for phrase in cancelling_phrases):
                    status = 'cancelled'
                elif status_text in ('rescheduled', 'recessed'):
                    status = 'cancelled'
                elif status_text in informational_notices:
                    pass
                elif 'room' in status_text:
                    # The notice names a room; prepend it to the location.
                    location = status_string[1] + ', ' + location
                elif status_text in ('wrong meeting date',):
                    continue
                else:
                    # Unrecognised notice: surface it and keep it as the
                    # event description.
                    print(status_text)
                    description = status_string[1].replace('--em--', '').strip()

            if status is None:
                status = confirmedOrPassed(when)

            # ``description`` is only passed to Event when there is one.
            event_kwargs = {}
            if description:
                event_kwargs['description'] = description
            e = Event(name=event["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location_name=location,
                      status=status,
                      **event_kwargs)

            if event['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Notice')
            self.addDocs(e, event, 'Transcript')
            self.addDocs(e, event, 'Summary')

            # Normalise body names before recording the participant.
            participant = event["Name"]["label"]
            if participant == 'City Council':
                participant = 'Chicago City Council'
            elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
                participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=participant,
                              type="organization")

            if agenda:
                e.add_source(event['Meeting Details']['url'], note='web')

                for item, _, _ in agenda:
                    agenda_item = e.add_agenda_item(item["Title"])
                    if item["Record #"]:
                        identifier = item["Record #"]['label']
                        # Drop a leading 'S' from the record number.
                        if identifier.startswith('S'):
                            identifier = identifier[1:]
                        agenda_item.add_bill(identifier)

            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e
    def scrape(self, window=None):
        """Yield an Event for each merged Legistar API/web event.

        Args:
            window: Optional number of days to look back from the current UTC
                time.  A falsy value passes ``None`` to ``self.events``.

        Yields:
            Event objects with sources, extras, agenda items, documents and
            media links attached.

        Raises:
            ValueError: If two agenda items of one event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Unpacking into two names requires exactly one hyphen.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
                if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                    # Adjacent literals instead of backslash continuations,
                    # which embedded the source indentation in the message.
                    error_msg = (
                        'An agenda has duplicate agenda items on the '
                        'Legistar grid: {event_name} on {event_date} '
                        '({legistar_api_url}). Contact Metro, and ask them '
                        'to remove the duplicate EventItemAgendaSequence.')

                    raise ValueError(error_msg.format(event_name=e.name,
                                                      event_date=e.start_date.strftime("%B %d, %Y"),
                                                      legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # ``.get`` keeps this consistent with the membership test above:
            # an event without an 'event_details' key falls back to the
            # calendar page instead of raising KeyError.
            event_details = event.get('event_details')
            if event_details:
                for link in event_details:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
Beispiel #10
0
    def scrape(self):
        """Yield an ``Event`` for every event returned by ``self.events()``.

        Each API event is matched against the output of
        ``self.scrapeWebCalendar()`` using a (body name, date, time) key.
        When no web entry matches, placeholder values are substituted so the
        web-only sections below are skipped.  Agenda items, the hosting
        body, sources, agenda/minutes documents and an audio link are
        attached to the event where the data is present.
        """
        web_results = self.scrapeWebCalendar()

        for event in self.events():
            # Create a key for lookups in the web_results dict.
            key = (event['EventBodyName'].strip(),
                   self.toTime(event['EventDate']).date(), event['EventTime'])

            # The defaults are the same placeholder strings that are tested
            # for further down, so an unmatched event adds no web data.
            web_event_dict = web_results.get(
                key, {
                    'Meeting Details': 'Meeting\xa0details',
                    'Audio': 'Not\xa0available',
                    'Recap/Minutes': 'Not\xa0available'
                })

            body_name = event["EventBodyName"]
            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a hyphen inside the
                # meeting name would otherwise produce more than two parts
                # and break the two-name unpacking.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = ''

            e = Event(event_name,
                      start_time=event["start"],
                      timezone=self.TIMEZONE,
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event_dict['Audio'] != 'Not\xa0available':
                audio_response = self.head(web_event_dict['Audio']['url'])

                try:
                    redirect_url = audio_response.headers['Location']
                except KeyError:
                    # No Location header on the response: there is no audio
                    # address to record, so leave the media link off
                    # instead of aborting the whole scrape.
                    redirect_url = None

                if redirect_url is not None:
                    e.add_media_link(note=web_event_dict['Audio']['label'],
                                     url=redirect_url,
                                     media_type='text/html')

            if web_event_dict['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event_dict['Recap/Minutes']['label'],
                               url=web_event_dict['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event_dict['Meeting Details'] != 'Meeting\xa0details':
                # Only cite the detail page when it answers with HTTP 200;
                # otherwise cite the calendar page.
                if requests.head(web_event_dict['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event_dict['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
Beispiel #11
0
    def scrape(self, window=3):
        """Yield an ``Event`` for each (API event, web event) pair.

        ``window`` is a number of days; the cutoff passed to
        ``self.events`` is the current UTC time minus that many days.
        Events whose comment is flagged invalid by ``self._parse_comment``
        are skipped.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

        for api_event, web_event in self.events(cutoff):
            start = api_event['start']
            place = api_event[u'EventLocation']

            description, room, status, invalid_event = self._parse_comment(
                api_event[u'EventComment'])

            if invalid_event:
                continue

            if room:
                place = room + ', ' + place

            # A status parsed from the comment wins over the API status.
            status = status or api_event['status']

            # Pass a description only when one was parsed from the comment.
            event_kwargs = {
                'name': api_event["EventBodyName"],
                'start_date': start,
                'location_name': place,
                'status': status,
            }
            if description:
                event_kwargs['description'] = description
            e = Event(**event_kwargs)

            e.pupa_id = str(api_event['EventId'])

            video = web_event['Meeting video']
            if video != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=video['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_label in ('Published agenda', 'Notice',
                              'Published summary'):
                self.addDocs(e, web_event, doc_label)
            if 'Captions' in web_event:
                self.addDocs(e, web_event, 'Captions')

            organization = api_event["EventBodyName"]
            if organization == 'City Council':
                organization = 'Seattle City Council'
            # elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
            #     participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=organization, type="organization")

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                matter_file = item["EventItemMatterFile"]
                if matter_file:
                    agenda_item.add_bill(matter_file)

            # Collect each person recorded as present once, however many
            # roll calls list them.
            present = {
                call['RollCallPersonName']
                for call in self.rollcalls(api_event)
                if call['RollCallValueName'] == 'Present'
            }
            for person in present:
                e.add_participant(name=person, type="person")

            api_url = self.BASE_URL + '/events/{EventId}'.format(**api_event)
            e.add_source(api_url, note='api')

            e.add_source(web_event['Meeting Name']['url'], note='web')

            yield e
    def scrape(self):
        """Yield an ``Event`` for each (API event, web event) pair.

        Pairs come from ``self.events()``.  Each event receives its agenda
        items (with agenda numbers recorded as notes), the hosting body,
        API and web sources, agenda/minutes documents and an audio link
        where the data is present.
        """
        for event, web_event in self.events():

            body_name = event["EventBodyName"]
            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a hyphen inside the
                # meeting name would otherwise produce more than two parts
                # and break the two-name unpacking.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = ''

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            e.pupa_id = str(event['EventId'])

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event['Audio'] != 'Not\xa0available':
                audio_response = self.head(web_event['Audio']['url'])

                try:
                    redirect_url = audio_response.headers['Location']
                except KeyError:
                    # No Location header on the response: there is no audio
                    # address to record, so leave the media link off
                    # instead of aborting the whole scrape.
                    redirect_url = None

                if redirect_url is not None:
                    e.add_media_link(note=web_event['Audio']['label'],
                                     url=redirect_url,
                                     media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event['Meeting Details'] != 'Meeting\xa0details':
                # Only cite the detail page when it answers with HTTP 200;
                # otherwise cite the calendar page.
                if requests.head(web_event['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
Beispiel #13
0
    def scrape(self, window=None):
        """Yield an ``Event`` for each merged (API event, web event) pair.

        ``window`` is an optional number of days.  When given, the cutoff
        passed to ``self.events`` is the current UTC time minus that many
        days; otherwise ``None`` is passed.

        Raises:
            ValueError: if two agenda items of one event share the same
                ``EventItemAgendaSequence``.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a hyphen inside the
                # meeting name would otherwise produce more than two parts
                # and break the two-name unpacking.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(
                            item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    # Adjacent literals keep the source indentation out of
                    # the message; a backslash continuation inside one
                    # literal would embed it.
                    error_msg = (
                        'An agenda has duplicate agenda items on the '
                        'Legistar grid: '
                        '{event_name} on {event_date} ({legistar_api_url}). '
                        'Contact Metro, and ask them to remove the '
                        'duplicate EventItemAgendaSequence.')

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # this audio link, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # The key is treated as optional above, so read it with .get()
            # here too; a missing or empty value falls back to the
            # calendar page.
            event_details = event.get('event_details')
            if event_details:
                for link in event_details:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
    def scrape(self, window=30):
        """Yield an ``Event`` for each (API event, web event) pair.

        ``window`` is a number of days; the cutoff passed to
        ``self.events`` is the current UTC time minus that many days.
        Events without a location, and events whose status text is
        "wrong meeting date", are skipped.  Also sets
        ``self.retry_wait_seconds`` to 20.
        """
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        self.retry_wait_seconds = 20

        for api_event, event in self.events(n_days_ago):

            description = api_event["EventComment"]
            when = api_event["start"]
            location = api_event["EventLocation"]

            if location == "Council Chambers":
                location = "Council Chambers, 5th Floor, City-County Building, " \
                            "414 Grant Street, Pittsburgh, PA 15219"

            if not location:
                continue

            status_string = api_event["status"]

            # Start from the API status and override it only for
            # cancellations.  This guarantees `status` is bound on every
            # path, including the "room" branch, which used to leave it
            # unset or carrying the previous event's value.
            status = api_event["status"]

            # NOTE(review): if api_event["status"] is a plain string,
            # status_string[1] is just its second character and none of
            # the phrases below can match -- confirm the intended input.
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if any(phrase in status_text
                       for phrase in ("rescheduled to",
                                      "postponed to",
                                      "reconvened to",
                                      "meeting recessed",
                                      "recessed meeting",
                                      "recessed until",
                                      "deferred",
                                      "time change",
                                      "date change",
                                      "recessed meeting - reconvene",
                                      "cancelled",
                                      "new date and time",
                                      "rescheduled indefinitely",
                                      "rescheduled for",)):
                    status = "cancelled"
                elif status_text in ("rescheduled", "recessed"):
                    status = "cancelled"
                elif status_text in ("meeting reconvened",
                                     "reconvened meeting",
                                     "recessed meeting",
                                     "reconvene meeting",
                                     "rescheduled hearing",
                                     "rescheduled meeting",
                                     "amended notice of meeting",
                                     "room change",
                                     "amended notice",
                                     "change of location",
                                     "revised - meeting date and time"):
                    # Recognised notices that keep the API status.
                    pass
                elif "room" in status_text:
                    location = status_string[1] + ", " + location
                elif status_text in ("wrong meeting date",):
                    continue
                else:
                    # Unrecognised status text: print it and keep the API
                    # status.
                    print(status_text)

            if event["Meeting Name"] == "Post Agenda":
                event_name = "Agenda Announcement"
            elif event["Meeting Name"] == "City Council":
                event_name = "Regular meeting"
            else:
                event_name = event["Meeting Name"]

            if description:
                e = Event(name=event_name,
                          start_date=when,
                          description=description,
                          location_name=location,
                          status=status)
            else:
                e = Event(name=event_name,
                          start_date=when,
                          location_name=location,
                          status=status)

            e.pupa_id = str(api_event["EventId"])

            # Resolve the meeting video link once.  The per-item links
            # below reuse it, so they share the same "url" guard and the
            # helper is not re-called for every agenda item.
            video_url = None
            meeting_video = event["Meeting video"]
            if meeting_video != "Not\xa0available" and "url" in meeting_video:
                video_url = self.get_meeting_video_link(meeting_video["url"])
                e.add_media_link(note="Recording",
                                 url=video_url,
                                 type="recording",
                                 media_type="text/html")

            self.addDocs(e, event, "Published agenda")
            self.addDocs(e, event, "Published minutes")

            participant = event["Meeting Name"]

            if participant == "City Council" or participant == "Post Agenda":
                participant = "Pittsburgh City Council"

            e.add_participant(name=participant,
                              type="organization")

            for item in self.agenda(api_event):
                clean_title = self.clean_agenda_item_title(item["EventItemTitle"])
                agenda_item = e.add_agenda_item(clean_title)
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)
                if item["EventItemVideo"] and video_url is not None:
                    item_video_url = video_url + \
                                     '?view_id=2&meta_id=' + str(item["EventItemVideo"])

                    agenda_item.add_media_link(note="Recording",
                                               url=item_video_url,
                                               type="recording",
                                               media_type="text/html")

            participants = set()

            for call in self.rollcalls(api_event):
                if call["RollCallValueName"] == "Present":
                    participants.add(call["RollCallPersonName"])

            for person in participants:
                e.add_participant(name=person,
                                  type="person")

            e.add_source(self.BASE_URL + "/events/{EventId}".format(**api_event),
                         note="api")

            try:
                detail_url = event["Meeting Details"]["url"]
            except TypeError:
                # "Meeting Details" does not support string-key indexing
                # here, so there is no detail link; cite the events page.
                e.add_source(self.EVENTSPAGE, note="web")
            else:
                if requests.head(detail_url).status_code == 200:
                    e.add_source(detail_url, note="web")

            yield e
Beispiel #15
0
    def scrape(self, follow_links=True):
        """Yield an ``Event`` for every row of every events page.

        When ``follow_links`` is true and a row links to a meeting detail
        page, that page is fetched and its agenda table is attached to the
        event as agenda items.  Otherwise the event has no agenda items
        and cites ``EVENTSPAGE`` as its source.
        """
        for page in self.eventPages(EVENTSPAGE):
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table):
                # Reset per row so a row without a followed detail link
                # neither fails on unbound names nor reuses the previous
                # row's agenda and URL.
                detail_url = None
                agenda = []

                details_cell = events['Meeting\xa0Details']
                if follow_links and isinstance(details_cell, dict):
                    detail_url = details_cell['url']
                    meeting_details = self.lxmlize(detail_url)

                    agenda_table = meeting_details.xpath(
                        "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0]
                    agenda = self.parseDataTable(agenda_table)

                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(time_string,
                                                        "%I:%M %p")
                # Carry the minutes over as well as the hour; replacing
                # only the hour would turn a 10:30 meeting into 10:00.
                when = when.replace(hour=event_time.hour,
                                    minute=event_time.minute)

                # Anything following 'Chicago, Illinois' in the last
                # location segment is treated as a status note.
                status = None
                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1]:
                    status_text = status_string[1].lower()
                    if any(phrase in status_text
                           for phrase in ('rescheduled to',
                                          'postponed to',
                                          'reconvened to',
                                          'recessed',
                                          'cancelled',
                                          'new date and time',
                                          'rescheduled indefinitely',
                                          'rescheduled for')):
                        status = 'cancelled'
                    elif status_text in ('rescheduled',):
                        # One-element tuple: without the trailing comma
                        # this is a substring test against the string.
                        status = 'cancelled'
                    else:
                        # Unrecognised note: print it and fall through to
                        # the time-based status below.
                        print(status_text)

                if status is None:
                    # Meetings still ahead are 'confirmed'; the rest have
                    # 'passed'.
                    now = datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
                    status = 'confirmed' if now < when else 'passed'

                e = Event(name=events["Name"]["label"],
                          start_time=when,
                          timezone='US/Central',
                          location=location,
                          status=status)
                # Cite the events page when no detail page was followed.
                e.add_source(detail_url if detail_url else EVENTSPAGE)
                if events['Video'] != 'Not\xa0available':
                    e.add_media_link(note='Recording',
                                     url=events['Video']['url'],
                                     type="recording",
                                     media_type='text/html')

                addDocs(e, events, 'Agenda')
                addDocs(e, events, 'Notice')
                addDocs(e, events, 'Transcript')
                addDocs(e, events, 'Summary')

                for item, _, _ in agenda:
                    agenda_item = e.add_agenda_item(item["Title"])
                    agenda_item.add_bill(item["Record #"]['label'])

                e.add_participant(name=events["Name"]["label"],
                                  type="organization")

                yield e
    def scrape(self):
        """Yield an ``Event`` for each (event, agenda) pair since 2011.

        Events with an empty location are skipped, as are events whose
        (name, start) pair matches one of the ten most recently seen
        events.
        """
        recent_ids = deque(maxlen=10)
        for event, agenda in self.events(since=2011):
            joint_note = ''
            location_notes = []

            raw_location = event[u'Meeting Location']
            if '--em--' in raw_location:
                # The text after the marker holds ' - ' separated notes;
                # one starting with 'Join' names co-hosts, the rest
                # annotate the location.
                location_string, note = raw_location.split('--em--')[:2]
                for fragment in note.split(' - '):
                    if fragment.startswith('Join'):
                        joint_note = fragment
                    else:
                        location_notes.append(fragment)
            else:
                location_string = raw_location

            leading_parts = location_string.split('-', 2)[0:2]
            location = ', '.join([part.strip() for part in leading_parts])
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            # Take the time of day from the iCalendar entry.
            ical_start = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=ical_start.hour,
                                minute=ical_start.minute)

            if event['Meeting Time'] == 'Deferred':
                status = 'cancelled'
            else:
                status = 'confirmed' if self.now() < when else 'passed'

            description = event['Meeting\xa0Topic']
            placeholder_topics = ('Multiple meeting items',
                                  'AGENDA TO BE ANNOUNCED')
            if any(marker in description for marker in placeholder_topics):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)
            if event_id in recent_ids:
                continue
            recent_ids.append(event_id)

            e = Event(name=event_name,
                      start_time=when,
                      timezone=self.TIMEZONE,
                      description=description,
                      location_name=location,
                      status=status)

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            multimedia = event['Multimedia']
            if multimedia != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=multimedia['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_label in ('Agenda', 'Minutes'):
                self.addDocs(e, event, doc_label)

            if event_name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event_name.lower():
                participating_orgs = [event_name]
            else:
                participating_orgs = []

            if joint_note:
                # Drop the "Jointly with the" lead-in, then split the
                # remainder into the individual co-hosting bodies.
                joint_note = re.sub('Jointl*y with the ', '', joint_note)
                participating_orgs.extend(
                    re.split(' and the |, the ', joint_note))

            for org in participating_orgs:
                e.add_committee(name=org)

            if not agenda:
                e.add_source(self.EVENTSPAGE)
            else:
                e.add_source(event["Meeting Details"]['url'])

                for item, _, _ in agenda:
                    if not item["Name"]:
                        continue
                    agenda_item = e.add_agenda_item(item["Name"])
                    file_number = item["File\xa0#"]
                    if file_number:
                        # Use the recorded action as the note, or
                        # 'consideration' when there is none.
                        agenda_item.add_bill(
                            file_number['label'],
                            note=item['Action'] or 'consideration')

            yield e
Beispiel #17
0
    def scrape(self):
        """Yield an ``Event`` for each meeting returned by ``self.events()``.

        The 'Meeting Location' string is split on ``'--'``: the first two
        pieces form the location, and any text following
        ``'Chicago, Illinois'`` in the last piece is treated as a notice
        (cancellation, room change, ...) that drives the event status.
        Meetings with an empty location or a 'wrong meeting date' notice
        are skipped.

        Exactly one 'web' source is attached to each event: the meeting
        details page when an agenda is available, otherwise
        ``self.EVENTSPAGE``.
        """
        # A notice *containing* any of these phrases marks the meeting as
        # cancelled.
        cancelled_phrases = (
            'rescheduled to',
            'postponed to',
            'reconvened to',
            'meeting recessed',
            'recessed meeting',
            'recessed until',
            'deferred',
            'time change',
            'date change',
            'recessed meeting - reconvene',
            'cancelled',
            'new date and time',
            'rescheduled indefinitely',
            'rescheduled for',
        )
        # A notice *equal to* one of these also marks it as cancelled.
        cancelled_notices = ('rescheduled', 'recessed')
        # A notice *equal to* one of these leaves the date-based status alone.
        unchanged_notices = (
            'meeting reconvened',
            'reconvened meeting',
            'recessed meeting',
            'reconvene meeting',
            'rescheduled hearing',
            'rescheduled meeting',
            'amended notice of meeting',
            'room change',
            'amended notice',
            'change of location',
            'revised - meeting date and time',
        )

        for event, agenda in self.events():
            description = None

            location_string = event[u'Meeting Location']
            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location:
                continue

            # The listing only carries the date; the time of day comes from
            # the iCalendar entry.
            when = self.toTime(event[u'Meeting Date'])
            event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if (any(phrase in status_text for phrase in cancelled_phrases)
                        or status_text in cancelled_notices):
                    status = 'cancelled'
                elif status_text in unchanged_notices:
                    status = confirmedOrPassed(when)
                elif 'room' in status_text:
                    location = status_string[1] + ', ' + location
                    # A room notice does not change whether the meeting
                    # happens; previously ``status`` was left unassigned here.
                    status = confirmedOrPassed(when)
                elif status_text in ('wrong meeting date',):
                    continue
                else:
                    # Unrecognised notice: surface it and keep it as the
                    # event description.
                    print(status_text)
                    description = status_string[1].replace('--em--', '').strip()
                    status = confirmedOrPassed(when)
            else:
                status = confirmedOrPassed(when)

            event_kwargs = dict(name=event["Name"]["label"],
                                start_time=when,
                                timezone='US/Central',
                                location_name=location,
                                status=status)
            if description:
                event_kwargs['description'] = description
            e = Event(**event_kwargs)

            if event['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_type in ('Agenda', 'Notice', 'Transcript', 'Summary'):
                self.addDocs(e, event, doc_type)

            participant = event["Name"]["label"]
            if participant == 'City Council':
                participant = 'Chicago City Council'
            elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
                participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=participant,
                              type="organization")

            participants = set()

            if agenda:
                e.add_source(event['Meeting Details']['url'], note='web')

                for item, _, _ in agenda:
                    agenda_item = e.add_agenda_item(item["Title"])
                    if item["Record #"]:
                        identifier = item["Record #"]['label']
                        if identifier.startswith('S'):
                            identifier = identifier[1:]
                        agenda_item.add_bill(identifier)
                    elif ('label' in item['Action\xa0Details'] and
                          item['Action\xa0Details']['label'] == 'Roll\xa0call'):
                        roll_call = self.extractRollCall(
                            item['Action\xa0Details']['url'])
                        for attendance, person in roll_call:
                            if attendance == 'Present':
                                participants.add(person)
            else:
                # Fallback source only when there is no agenda. This ``else``
                # used to hang off the participants loop below and therefore
                # ran for every event.
                e.add_source(self.EVENTSPAGE, note='web')

            for person in participants:
                e.add_participant(name=person,
                                  type="person")

            yield e
    def scrape(self):
        """Yield an ``Event`` for each row of the archive and upcoming tables.

        The page at ``self.ARLINGTON_MEETING_PAGE`` is fetched once. For each
        table row an event is built from the title and timestamp cells; if
        the row links to an agenda, that page is fetched too and every
        ``metaviewer.php`` link with text on it is attached as a document.
        Video, audio and minutes links are only looked up for the 'archive'
        table.

        Rows whose cells cannot be mapped by ``self._organize_cells`` are
        skipped.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    # Narrowed from a bare ``except`` so KeyboardInterrupt and
                    # SystemExit still propagate.
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_links = cell_mapping['agenda'].cssselect('a')
                if agenda_links:
                    meeting_agenda_url = agenda_links[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower() and link.text is not None:
                            # NOTE: application/pdf is a guess, may not always be correct
                            e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # Media cells are only read for the 'archive' table. The
                # original ``continue`` here also skipped ``yield e``, which
                # silently dropped every upcoming meeting.
                if meeting_type != 'upcoming':
                    # detect video
                    # TODO: extract actual mp4 files
                    video_cell = cell_mapping['video'].cssselect('a')
                    if video_cell:
                        # Capture the URL without the closing quote that
                        # terminates it inside the onclick handler.
                        video_url_match = re.search(r"(http://.*?)'",
                                                    video_cell[0].attrib.get('onclick', ''))
                        if video_url_match is not None:
                            e.add_media_link(name="Video", url=video_url_match.group(1), mimetype='text/html')

                    # detect audio
                    audio_cell = cell_mapping['audio'].cssselect('a')
                    if audio_cell:
                        e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                    # detect minutes
                    minutes_cell = cell_mapping['minutes'].cssselect('a')
                    if minutes_cell:
                        e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e
    def scrape(self, window=None):
        """Yield an ``Event`` for each (API event, web event) pair.

        Args:
            window: optional number of days. When truthy,
                ``self.events`` is called with ``utcnow()`` minus that many
                days; otherwise it is called with ``None``.

        Events whose audio link does not yet redirect to a file (no
        ``Location`` header) are skipped entirely so that they are retried
        on the next scrape.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        # Agenda status name -> event status; anything else is 'tentative'.
        status_by_agenda_status = {
            'Draft': 'confirmed',
            'Final': 'passed',
            'Canceled': 'cancelled',
        }

        for event, web_event in self.events(n_days_ago):

            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a meeting name that itself
                # contains a hyphen used to make the two-name unpacking raise
                # ValueError.
                body_name, event_name = [
                    part.strip() for part in body_name.split('-', 1)
                ]
            else:
                event_name = body_name

            status = status_by_agenda_status.get(
                event['EventAgendaStatusName'], 'tentative')

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links
            e.extras = {'guid': event['EventGuid']}

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event['Audio'] != 'Not\xa0available':

                try:
                    redirect_url = self.head(
                        web_event['Audio']['url']).headers['Location']

                except KeyError:

                    # In some cases, the redirect URL does not yet contain the
                    # location of the audio file. Skip these events, and retry
                    # on next scrape.

                    continue

                e.add_media_link(note=web_event['Audio']['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event['Meeting Details'] != 'Meeting\xa0details':
                # Only cite the detail page when it actually resolves; fall
                # back to the calendar page otherwise.
                if requests.head(web_event['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
    def scrape(self):
        """Yield an ``Event`` for each meeting from ``self.events(since=2017)``.

        The location text may carry an ``--em--`` separated note; note
        fragments starting with 'Join' name jointly meeting bodies, all
        other fragments are stored as a 'location note' extra. A meeting
        whose (name, start) pair matches one of the ten most recently seen
        pairs is skipped as a duplicate, as is any meeting with an empty
        location.
        """
        recently_seen = deque(maxlen=10)

        for event, agenda in self.events(since=2017):
            raw_location = event[u'Meeting Location']
            joint_note = ''
            location_notes = []

            if '--em--' in raw_location:
                location_string, note = raw_location.split('--em--')[:2]
                for fragment in note.split(' - '):
                    if fragment.startswith('Join'):
                        joint_note = fragment
                    else:
                        location_notes.append(fragment)
            else:
                location_string = raw_location

            location_parts = location_string.split('-', 2)[0:2]
            location = ', '.join(part.strip() for part in location_parts)
            if not location:
                continue

            # The listing gives the date; the time of day is taken from the
            # iCalendar feed.
            when = self.toTime(event[u'Meeting Date'])
            ical_response = self.get(event['iCalendar']['url'], verify=False)
            calendar = self.ical(ical_response.text)
            event_time = calendar.subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            if event['Meeting Time'] in ('Deferred', ):
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            # Placeholder topics carry no information, so blank them out.
            description = event['Meeting\xa0Topic']
            if ('Multiple meeting items' in description
                    or 'AGENDA TO BE ANNOUNCED' in description):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)
            if event_id in recently_seen:
                continue
            recently_seen.append(event_id)

            e = Event(name=event_name,
                      start_date=when,
                      description=description,
                      location_name=location,
                      status=status)

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_type in ('Agenda', 'Minutes'):
                self.addDocs(e, event, doc_type)

            if event_name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event_name.lower():
                participating_orgs = [event_name]
            else:
                participating_orgs = []

            if joint_note:
                joint_note = re.sub('Jointl*y with the ', '', joint_note)
                participating_orgs += re.split(' and the |, the ', joint_note)

            for org in participating_orgs:
                e.add_committee(name=org)

            if not agenda:
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                e.add_source(event["Meeting Details"]['url'], note='web')

                for item, _, _ in agenda:
                    if not item["Name"]:
                        continue
                    agenda_item = e.add_agenda_item(item["Name"])
                    if item["File\xa0#"]:
                        # Items with no recorded action are merely up for
                        # consideration.
                        note = item['Action'] or 'consideration'
                        agenda_item.add_bill(item["File\xa0#"]['label'],
                                             note=note)

            yield e
    def scrape(self):
        """Yield an ``Event`` for each meeting returned by ``self.events()``.

        The 'Meeting Location' string is split on ``'--'``: the first two
        pieces form the location, and any text following
        ``'Chicago, Illinois'`` in the last piece is treated as a notice
        that drives the event status. Meetings with an empty location or a
        'wrong meeting date' notice are skipped.
        """
        # A notice *containing* any of these phrases marks the meeting as
        # cancelled.
        cancelled_phrases = (
            'rescheduled to',
            'postponed to',
            'reconvened to',
            'meeting recessed',
            'recessed meeting',
            'recessed until',
            'deferred',
            'time change',
            'date change',
            'recessed meeting - reconvene',
            'cancelled',
            'new date and time',
            'rescheduled indefinitely',
            'rescheduled for',
        )
        # A notice *equal to* one of these also marks it as cancelled.
        cancelled_notices = ('rescheduled', 'recessed')
        # A notice *equal to* one of these leaves the date-based status alone.
        unchanged_notices = (
            'meeting reconvened',
            'reconvened meeting',
            'recessed meeting',
            'reconvene meeting',
            'rescheduled hearing',
            'rescheduled meeting',
            'amended notice of meeting',
            'room change',
            'amended notice',
            'change of location',
            'revised - meeting date and time',
        )

        for event, agenda in self.events():

            description = None

            location_string = event[u'Meeting Location']

            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location:
                continue

            # The listing only carries the date; the time of day comes from
            # the iCalendar entry.
            when = self.toTime(event[u'Meeting Date'])
            event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if (any(phrase in status_text for phrase in cancelled_phrases)
                        or status_text in cancelled_notices):
                    status = 'cancelled'
                elif status_text in unchanged_notices:
                    status = confirmedOrPassed(when)
                elif 'room' in status_text:
                    location = status_string[1] + ', ' + location
                    # A room notice does not change whether the meeting
                    # happens; previously ``status`` was left unassigned here,
                    # which raised UnboundLocalError or reused the previous
                    # event's status.
                    status = confirmedOrPassed(when)
                elif status_text in ('wrong meeting date', ):
                    continue
                else:
                    # Unrecognised notice: surface it and keep it as the
                    # event description.
                    print(status_text)
                    description = status_string[1]
                    status = confirmedOrPassed(when)
            else:
                status = confirmedOrPassed(when)

            event_kwargs = dict(name=event["Name"]["label"],
                                start_time=when,
                                timezone='US/Central',
                                location_name=location,
                                status=status)
            if description:
                event_kwargs['description'] = description
            e = Event(**event_kwargs)

            if event['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            for doc_type in ('Agenda', 'Notice', 'Transcript', 'Summary'):
                self.addDocs(e, event, doc_type)

            e.add_participant(name=event["Name"]["label"], type="organization")

            if agenda:
                e.add_source(event['Name']['url'])

                for item, _, _ in agenda:
                    agenda_item = e.add_agenda_item(item["Title"])
                    if item["Record #"]:
                        agenda_item.add_bill(item["Record #"]['label'])

            else:
                e.add_source(self.EVENTSPAGE)

            yield e
    def scrape(self):
        """Yield an ``Event`` for each row of the archive and upcoming tables.

        The page at ``self.ARLINGTON_MEETING_PAGE`` is fetched once. For each
        table row an event is built from the title and timestamp cells; if
        the row links to an agenda, that page is fetched too and every
        ``metaviewer.php`` link with text on it is attached as a document.
        Video, audio and minutes links are only looked up for the 'archive'
        table.

        Rows whose cells cannot be mapped by ``self._organize_cells`` are
        skipped.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' %
                                                   meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(
                        meeting_type, meeting.cssselect('td'))
                except Exception:
                    # Narrowed from a bare ``except`` so KeyboardInterrupt and
                    # SystemExit still propagate.
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title,
                          when=meeting_date,
                          location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_links = cell_mapping['agenda'].cssselect('a')
                if agenda_links:
                    meeting_agenda_url = agenda_links[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda',
                                   url=meeting_agenda_url,
                                   mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(
                        meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if ('metaviewer.php' in link_url.lower()
                                and link.text is not None):
                            # NOTE: application/pdf is a guess, may not always be correct
                            e.add_document(name=link.text,
                                           url=link_url,
                                           mimetype='application/pdf')

                # Media cells are only read for the 'archive' table. The
                # original ``continue`` here also skipped ``yield e``, which
                # silently dropped every upcoming meeting.
                if meeting_type != 'upcoming':
                    # detect video
                    # TODO: extract actual mp4 files
                    video_cell = cell_mapping['video'].cssselect('a')
                    if video_cell:
                        # Capture the URL without the closing quote that
                        # terminates it inside the onclick handler.
                        video_url_match = re.search(
                            r"(http://.*?)'",
                            video_cell[0].attrib.get('onclick', ''))
                        if video_url_match is not None:
                            e.add_media_link(name="Video",
                                             url=video_url_match.group(1),
                                             mimetype='text/html')

                    # detect audio
                    audio_cell = cell_mapping['audio'].cssselect('a')
                    if audio_cell:
                        e.add_media_link(name="Audio",
                                         url=audio_cell[0].attrib.get('href', ''),
                                         mimetype='audio/mpeg')

                    # detect minutes
                    minutes_cell = cell_mapping['minutes'].cssselect('a')
                    if minutes_cell:
                        e.add_media_link(name="Minutes",
                                         url=minutes_cell[0].attrib.get(
                                             'href', ''),
                                         mimetype='text/html')

                yield e
Beispiel #23
0
    def scrape(self, window=None):
        """Yield an ``Event`` for each pair from ``self._merge_events``.

        Args:
            window: optional number of days. When truthy,
                ``self.events`` is called with ``utcnow()`` minus that many
                days; otherwise it is called with ``None``.

        Each event gets an 'api' source (plus an 'api (sap)' source when a
        ``SAPEventId`` is present) and either the links in
        ``event['event_details']`` or the calendar page as a fallback.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        # Agenda status name -> event status; anything else is 'tentative'.
        status_by_agenda_status = {
            'Draft': 'confirmed',
            'Final': 'passed',
            'Canceled': 'cancelled',
        }

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only: a meeting name that itself
                # contains a hyphen used to make the two-name unpacking raise
                # ValueError.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-', 1)]
            else:
                event_name = body_name

            status = status_by_agenda_status.get(
                event['EventAgendaStatusName'], 'tentative')

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    agenda_item.add_bill(item["EventItemMatterFile"])

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name,
                              type="organization")

            e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                         note='api')

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # this audio link; the event itself is still yielded.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
    def scrape(self, window=3):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for api_event, event in self.events(n_days_ago):

            description = None

            when = api_event['start']
            location_string = event[u'Meeting Location']

            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location :
                continue

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1] :
                status_text = status_string[1].lower()
                if any(phrase in status_text
                       for phrase in ('rescheduled to',
                                      'postponed to',
                                      'reconvened to',
                                      'rescheduled to',
                                      'meeting recessed',
                                      'recessed meeting',
                                      'postponed to',
                                      'recessed until',
                                      'deferred',
                                      'time change',
                                      'date change',
                                      'recessed meeting - reconvene',
                                      'cancelled',
                                      'new date and time',
                                      'rescheduled indefinitely',
                                      'rescheduled for',)) :
                    status = 'cancelled'
                elif status_text in ('rescheduled', 'recessed') :
                    status = 'cancelled'
                elif status_text in ('meeting reconvened',
                                     'reconvened meeting',
                                     'recessed meeting',
                                     'reconvene meeting',
                                     'rescheduled hearing',
                                     'rescheduled meeting',) :
                    status = api_event['status']
                elif status_text in ('amended notice of meeting',
                                     'room change',
                                     'amended notice',
                                     'change of location',
                                     'revised - meeting date and time') :
                    status = api_event['status']
                elif 'room' in status_text :
                    location = status_string[1] + ', ' + location
                elif status_text in ('wrong meeting date',) :
                    continue
                else :
                    print(status_text)
                    description = status_string[1].replace('--em--', '').strip()
                    status = api_event['status']
            else :
                status = api_event['status']


            if description :
                e = Event(name=event["Name"]["label"],
                          start_date=when,
                          description=description,
                          location_name=location,
                          status=status)
            else :
                e = Event(name=event["Name"]["label"],
                          start_date=when,
                          location_name=location,
                          status=status)

            e.pupa_id = str(api_event['EventId'])

            if event['Video'] != 'Not\xa0available' :
                e.add_media_link(note='Recording',
                                 url = event['Video']['url'],
                                 type="recording",
                                 media_type = 'text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Notice')
            self.addDocs(e, event, 'Captions')
            self.addDocs(e, event, 'Summary')

            participant = event["Name"]["label"]
            if participant == 'City Council' :
                participant = 'Chicago City Council'
            elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)' :
                participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=participant,
                              type="organization")

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            participants = set()
            for call in self.rollcalls(api_event):
                if call['RollCallValueName'] == 'Present':
                    participants.add(call['RollCallPersonName'])

            for person in participants:
                e.add_participant(name=person,
                                  type="person")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event), 
                         note='api')

            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                if requests.head(detail_url).status_code == 200:
                    e.add_source(detail_url, note='web')

            yield e