Example #1
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=title,
            location_name=location,
        )

        event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore"
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link(
                'Video of Hearing',
                video[0].xpath('@href')[0],
                'text/html'
            )

        if 'subcommittee' in title.lower():
            subcom = title.split('-')[0].strip()
            event.add_participant(
                subcom,
                type='committee',
                note='host',
            )
        else:
            event.add_participant(
                com,
                type='committee',
                note='host',
            )
        yield event
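
Note: parse_gcal() is a helper on the scraper class that is not shown in this example. A minimal sketch of what such a helper might look like, assuming the link is a standard Google Calendar "render" URL whose query string carries text, location, and dates parameters:

    import datetime
    from urllib.parse import urlparse, parse_qs

    def parse_gcal(cal_link):
        # Hypothetical sketch; the scraper's real parse_gcal() is not shown.
        params = parse_qs(urlparse(cal_link).query)
        title = params.get('text', [''])[0]
        location = params.get('location', [''])[0]
        # `dates` looks like 20190315T100000/20190315T120000
        start_raw, _, end_raw = params.get('dates', [''])[0].partition('/')
        fmt = '%Y%m%dT%H%M%S'
        start_date = datetime.datetime.strptime(start_raw, fmt)
        end_date = datetime.datetime.strptime(end_raw, fmt)
        return title, location, start_date, end_date
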
Example #2
    def scrape_house_weekly_schedule(self):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                     '"PDF-AGENDA.png")]]/@href')[0]
                # self.logger.debug(guid)
                self.warning("logger.debug" + guid)
            except IndexError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None]*3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            # self.logger.debug(location)
            self.warning("logger.debug" + location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            # self.logger.debug(description)
            self.warning("logger.debug" + description)

            event = Event(name=description,
                          start_date=when,  # `when` is already localized above
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
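
The `([...] + [None] * 3)[:3]` pattern above pads the comma-split fields so the three-way unpacking never raises ValueError when the location (or time) is missing. For example:

    # Pad-and-slice unpacking: missing trailing fields become None
    # instead of breaking the unpacking.
    meeting_string = "Jun 04, 1:00 PM"  # no location field
    parts = [s.strip() for s in meeting_string.split(',') if s]
    date, time, location = (parts + [None] * 3)[:3]
    assert (date, time, location) == ("Jun 04", "1:00 PM", None)
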
Example #5
    def scrape(self, chamber=None):
        URL = "http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas"
        doc = self.lxmlize(URL)
        events = doc.xpath("//item")

        for info in events:
            title_and_date = info.xpath("title/text()")[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(
                name=title,
                start_date=self._tz.localize(
                    datetime.datetime.strptime(when, "%b %d, %Y")),
                location_name="State Capitol",
            )
            event.add_source(URL)

            url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name,
                                      type="committee",
                                      note="host")

            documents = doc.xpath(".//td")
            for document in documents:
                url = re.search(r"(http://.*?pdf)",
                                document.xpath("@onclick")[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(
                    note=document.xpath("text()")[0],
                    url=url,
                    media_type="application/pdf",
                )
                bills = document.xpath("@onclick")
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item("Bill up for discussion")
                        item.add_bill(bill_name)
            yield event
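
The onclick handlers scraped above are JavaScript snippets that embed the document URL; the non-greedy regex pulls out the first PDF link. With an illustrative handler value:

    import re

    # Illustrative onclick value; the real attribute comes from the
    # Granicus agenda page.
    onclick = "window.open('http://utahlegislature.granicus.com/docs/agenda.pdf')"
    match = re.search(r"(http://.*?pdf)", onclick)
    assert match.group(1) == "http://utahlegislature.granicus.com/docs/agenda.pdf"
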
Example #6
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr,
                          start_time=when,
                          classification='committee-meeting',
                          description=descr,
                          location_name=where,
                          timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    # Look up by the committee's short code, falling back to
                    # a stub mapping when the code is unknown.
                    committee = self.short_ids.get(committee, {
                        "chamber": "unknown",
                        "name": committee
                    })

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(bill['bill_id'], note=bill['type'])
            yield event
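
The short-code lookup (fixed above to key on the committee variable rather than the literal string "committee") falls back to a stub mapping for unknown codes. A minimal sketch, assuming short_ids maps codes to chamber/name dicts as built by get_short_codes():

    short_ids = {"JUD": {"chamber": "upper", "name": "Judiciary"}}

    for code in ("JUD", "XYZ"):
        committee = short_ids.get(code, {"chamber": "unknown", "name": code})
        print(committee["name"])  # -> Judiciary, then XYZ
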
Example #7
    def scrape(self):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            name = " ".join(
                x.strip()
                for x in doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()
            )

            # Skip events with no name
            if not name:
                continue

            event = Event(
                start_date=self._TZ.localize(
                    datetime.datetime.strptime(
                        info.xpath('span[@class="col02"]/text()')[0],
                        self._DATETIME_FORMAT,
                    )
                ),
                name=name,
                location_name=doc.xpath(
                    '//div[@class="heading-container"]/span/text()'
                )[0].title()
            )

            event.add_participant(
                info.xpath('span[@class="col01"]/text()')[0].title(),
                type='committee',
                note='host',
            )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                    document.xpath('text()')[0],
                    url=document.xpath('@href')[0]
                )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            yield event
Example #8
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
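
The joined `when` string has to line up exactly with the "%a %b %d, %Y, %I:%M %p" format. For example:

    import datetime as dt

    # Illustrative values for the date and time cells.
    when = ", ".join(["Wed Mar 06", "2019", "10:30 AM"])
    parsed = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")
    assert parsed == dt.datetime(2019, 3, 6, 10, 30)
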
Example #12
    def scrape(self):
        for event in self.events():
            e = Event(name=event["EventBodyName"],
                      start_time=event["start"],
                      timezone=self.TIMEZONE,
                      description='',
                      location_name=event["EventLocation"],
                      status=event["status"])
            
            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)


            e.add_participant(name=event["EventBodyName"],
                              type="organization")

            e.add_source('foo')

            meeting_detail_web = self.WEB_URL + '/MeetingDetail.aspx?ID={EventId}&GUID={EventGuid}'.format(**event)
            if requests.head(meeting_detail_web).status_code == 200:
                e.add_source(meeting_detail_web, note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")


            yield e
Example #13
    def scrape_event(self, row):
        date_td = row.xpath('td[1]')[0]
        info_td = row.xpath('td[2]')[0]

        date = date_td.xpath('b')[0].text.strip()
        time = date_td.xpath('b/following-sibling::text()')[0].strip()

        date_and_time = "{} {}".format(date, time)
        start_date = datetime.datetime.strptime(
            date_and_time, '%m/%d/%y %I:%M %p')

        title = info_td.xpath('font[1]/strong')[0].text.strip()

        all_text = info_td.xpath('descendant-or-self::*/text()')
        notes = (line.strip() for line in all_text if line.strip())
        notes = list(notes)
        # Skip the first line, which is the title
        notes = notes[1:]
        # Split out the address
        address = notes[0]
        notes = notes[1:]
        # The rest just becomes the description
        notes = "\n".join(notes)

        event = Event(
            start_date=self._TZ.localize(start_date),
            name=title,
            location_name=address,
            description=notes
        )

        event.add_source(self.URL)

        if info_td.xpath('a[contains(font/text(),"agenda")]'):
            agenda_url = info_td.xpath('a/@href')[0]
            event.add_document(
                "Agenda",
                url=agenda_url
            )

        yield event
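
The line-partitioning above treats the first non-blank line as the title (already captured separately), the second as the address, and the remainder as the description. With illustrative cell text:

    # Illustrative text lines from the info cell.
    all_text = ["Budget Hearing", "123 Main St, Anytown", "Public comment", "welcome"]
    notes = [line.strip() for line in all_text if line.strip()]
    address, *rest = notes[1:]
    description = "\n".join(rest)
    assert address == "123 Main St, Anytown"
    assert description == "Public comment\nwelcome"
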
Example #14
    def scrape(self, chamber=None, session=None):
        """
        Scrape event data for all dates from the SC State House meetings
        page, then create and yield Event objects from the data.
        :param chamber:
        :param session:
        :return: yielded Event objects
        """

        chambers = {
            'upper': {
                'name': 'Senate',
                'title': 'Senator'
            },
            'lower': {
                'name': 'House',
                'title': 'Representative'
            },
        }
        if chamber == 'other':
            return

        if chamber is None:
            self.info(
                'no chamber specified, using Joint Committee Meeting Schedule')
            events_url = 'http://www.scstatehouse.gov/meetings.php'
        else:
            events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % (
                chambers[chamber]['name'].upper()[0])

        page = self.get_page_from_url(events_url)

        meeting_year = page.xpath(
            '//h2[@class="barheader"]/span')[0].text_content()
        meeting_year = re.search(
            r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})',
            meeting_year).group(1)

        dates = page.xpath("//div[@id='contentsection']/ul")

        for date in dates:
            date_string = date.xpath('span')

            if len(date_string) == 1:
                date_string = date_string[0].text_content()
            else:
                continue

            # If an event is in the next calendar year, the date_string
            # will have a year in it
            if date_string.count(",") == 2:
                event_year = date_string[-4:]
                date_string = date_string[:-6]
            elif date_string.count(",") == 1:
                event_year = meeting_year
            else:
                raise AssertionError(
                    "This is not a valid date: '{}'".format(date_string))

            for meeting in date.xpath('li'):
                time_string = meeting.xpath('span')[0].text_content()

                if time_string == 'CANCELED' or len(
                        meeting.xpath(
                            './/span[contains(text(), "CANCELED")]')) > 0:
                    continue

                time_string = normalize_time(time_string)
                date_time = datetime.datetime.strptime(
                    event_year + ' ' + date_string + ' ' + time_string,
                    "%Y %A, %B %d %I:%M %p")

                date_time = self._tz.localize(date_time)
                meeting_info = meeting.xpath(
                    'br[1]/preceding-sibling::node()')[1]
                location, description = re.search(r'-- (.*?) -- (.*)',
                                                  meeting_info).groups()

                # if re.search(r'committee', description, re.I):
                #     meeting_type = 'committee:meeting'
                # else:
                #     meeting_type = 'other:meeting'

                event = Event(
                    name=description,  # Event Name
                    start_time=date_time,  # When the event will take place
                    timezone=self._tz.zone,  # the local timezone for the event
                    location_name=location)  # Where the event will be

                event.add_source(events_url)

                agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

                if agenda_url:
                    agenda_url = agenda_url[0].attrib['href']
                    event.add_source(agenda_url)
                    event.add_document(note="Agenda",
                                       url=agenda_url,
                                       media_type="application/pdf")

                    agenda_page = self.get_page_from_url(agenda_url)

                    for bill in agenda_page.xpath(
                            ".//a[contains(@href,'billsearch.php')]"):
                        # bill_url = bill.attrib['href']
                        bill_id = bill.text_content().replace('.', '').replace(
                            ' ', '')
                        # bill_description = self.get_bill_description(bill_url)

                        event.add_bill(bill_id)

                yield event
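
normalize_time() is a helper that is not shown in this example. A hypothetical sketch of what it might do, coercing strings like "10:30 am" or "2 PM" into the "%I:%M %p" shape that the strptime call above expects:

    import re

    def normalize_time(time_string):
        # Hypothetical sketch; the scraper's real normalize_time() is not shown.
        time_string = time_string.lower().strip()
        if "noon" in time_string:
            return "12:00 PM"
        match = re.match(r"(\d{1,2})(?::(\d{2}))?\s*([ap])\.?m\.?", time_string)
        if match:
            hour, minute, meridian = match.groups()
            return "{}:{} {}M".format(hour, minute or "00", meridian.upper())
        return time_string
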
Example #15
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(since_datetime=n_days_ago)

        service_councils = set(sc['BodyId'] for sc in self.search(
            '/bodies/', 'BodyId', 'BodyTypeId eq 70 or BodyTypeId eq 75'))

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            elif event['EventBodyId'] in service_councils:
                # Don't scrape service council or service council public hearing events.
                self.info('Skipping event {0} for {1}'.format(
                    event['EventId'], event['EventBodyName']))
                continue
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if web_event.has_ecomment:
                self.info('Adding eComment link {0} from {1}'.format(
                    web_event['eComment'],
                    web_event['Meeting Details']['url']))
                e.extras['ecomment'] = web_event['eComment']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        agenda_number = item["EventItemAgendaNumber"]
                        note = "Agenda number, {}".format(agenda_number)
                        agenda_item['notes'].append(note)

                        agenda_item['extras']['agenda_number'] = agenda_number

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(
                    note='Agenda',
                    url=event['EventAgendaFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventAgendaLastPublishedUTC']).date())

            if event['EventMinutesFile']:
                e.add_document(
                    note='Minutes',
                    url=event['EventMinutesFile'],
                    media_type="application/pdf",
                    date=self.to_utc_timestamp(
                        event['EventMinutesLastPublishedUTC']).date())
            elif web_event['Published minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Published minutes']['label'],
                               url=web_event['Published minutes']['url'],
                               media_type="application/pdf")
            else:
                approved_minutes = self.find_approved_minutes(event)
                if approved_minutes:
                    e.add_document(
                        note=approved_minutes['MatterAttachmentName'],
                        url=approved_minutes['MatterAttachmentHyperlink'],
                        media_type="application/pdf",
                        date=self.to_utc_timestamp(
                            approved_minutes['MatterAttachmentLastModifiedUtc']
                        ).date())

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                # Sometimes if there is an issue getting the Spanish
                # audio created, Metro has the Spanish Audio link
                # go to the English Audio.
                #
                # Pupa does not allow duplicate media links,
                # so we'll ignore the second media link if it's
                # the same as the first media link.
                #
                # Because of the way that the event['audio'] is created
                # the first audio link is always English and the
                # second is always Spanish
                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html',
                                 on_duplicate='ignore')

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
Example #16
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None
        for event, web_event in self.events(n_days_ago):

            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links
            e.extras = {'guid': event['EventGuid']}

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event['Audio'] != 'Not\xa0available':

                try:
                    redirect_url = self.head(
                        web_event['Audio']['url']).headers['Location']

                except KeyError:

                    # In some cases, the redirect URL does not yet contain the
                    # location of the audio file. Skip these events, and retry
                    # on next scrape.

                    continue

                e.add_media_link(note=web_event['Audio']['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event['Meeting Details'] != 'Meeting\xa0details':
                if requests.head(web_event['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
Example #17
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised", 
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides 
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
                if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(error_msg.format(event_name=e.name, 
                                                      event_date=e.start_date.strftime("%B %d, %Y"),
                                                      legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
Example #18
    def scrape(self, session=None, start=None, end=None):

        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # testimony url, we'll need it later in a loop

        # testimony query looks gnarly but breaks down to:
        # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
        # $orderby: LastName,FirstName,Organization
        # $expand: Request
        # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
        # PresentedDate,FileSize,Topic

        testimony_url_base = (
            "http://legislature.maine.gov/backend/"
            "breeze/data/CommitteeTestimony?"
            "$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and"
            "%20(Request%2FLegislature%20eq%20{})"
            "&$orderby=LastName%2CFirstName%2COrganization&"
            "$expand=Request&$select=Id%2CFileType%2CNamePrefix"
            "%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic"
        )

        if start is None:
            start_date = datetime.datetime.now().isoformat()
        else:
            start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
            start_date = start_date.isoformat()

        # default to 30 days if no end
        if end is None:
            dtdelta = datetime.timedelta(days=30)
            end_date = datetime.datetime.now() + dtdelta
            end_date = end_date.isoformat()
        else:
            end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
            end_date = end_date.isoformat()

        bills_by_event = {}

        bills_url = ("http://legislature.maine.gov/backend/breeze/data/"
                     "getCalendarEventsBills?startDate={}&endDate={}")
        bills_url = bills_url.format(start_date, end_date)
        page = json.loads(self.get(bills_url).content)

        for row in page:
            bills_by_event.setdefault(row["EventId"], [])
            bills_by_event[row["EventId"]].append(row)

        # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
        url = ("http://legislature.maine.gov/backend/breeze/data/"
               "getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true")
        url = url.format(start_date, end_date)

        page = json.loads(self.get(url).content)

        for row in page:
            if row["Cancelled"] is True or row["Postponed"] is True:
                continue

            start_date = self._TZ.localize(
                dateutil.parser.parse(row["FromDateTime"]))
            end_date = self._TZ.localize(
                dateutil.parser.parse(row["ToDateTime"]))

            name = row["CommitteeName"]

            if name is None:
                name = row["Host"]

            address = row["Location"]
            address = address.replace(
                "Cross Building",
                "Cross Office Building, 111 Sewall St, Augusta, ME 04330",
            )

            address = address.replace(
                "State House",
                "Maine State House, 210 State St, Augusta, ME 04330")

            event = Event(
                start_date=start_date,
                end_date=end_date,
                name=name,
                location_name=address,
            )

            event.add_source(
                "http://legislature.maine.gov/committee/#Committees/{}".format(
                    row["CommitteeCode"]))

            if bills_by_event.get(row["Id"]):
                for bill in bills_by_event[row["Id"]]:
                    description = "LD {}: {}".format(bill["LD"], bill["Title"])
                    agenda = event.add_agenda_item(description=description)
                    agenda.add_bill("LD {}".format(bill["LD"]))

                    if bill["TestimonyCount"] > 0:
                        test_url = testimony_url_base.format(
                            bill["PaperNumber"], session)
                        test_page = json.loads(self.get(test_url).content)
                        for test in test_page:
                            title = "{} {} - {}".format(
                                test["FirstName"],
                                test["LastName"],
                                test["Organization"],
                            )
                            if test["NamePrefix"] is not None:
                                title = "{} {}".format(test["NamePrefix"],
                                                       title)

                            test_url = (
                                "http://legislature.maine.gov/backend/app/services"
                                "/getDocument.aspx?doctype=test&documentId={}".
                                format(test["Id"]))

                            if test["FileType"] == "pdf":
                                media_type = "application/pdf"
                            else:
                                # Fall back to a generic type so media_type is
                                # always bound for add_document below.
                                media_type = "application/octet-stream"

                            event.add_document(note=title,
                                               url=test_url,
                                               media_type=media_type)
            yield event
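
The pre-encoded testimony_url_base above is easier to follow when built with urllib.parse.quote. A readability sketch; note that quote() escapes slightly more characters (e.g. parentheses) than the hand-built template, so treat it as illustrative rather than a drop-in replacement:

    from urllib.parse import quote

    def testimony_url(paper_number, legislature):
        filter_expr = ("(Request/PaperNumber eq '{}') and "
                       "(Request/Legislature eq {})").format(paper_number, legislature)
        select = ("Id,FileType,NamePrefix,FirstName,LastName,"
                  "Organization,PresentedDate,FileSize,Topic")
        return ("http://legislature.maine.gov/backend/breeze/data/"
                "CommitteeTestimony?"
                "$filter=" + quote(filter_expr, safe="") +
                "&$orderby=" + quote("LastName,FirstName,Organization", safe="") +
                "&$expand=Request"
                "&$select=" + quote(select, safe=""))
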
Example #19
    def scrape(self, start_time=None):

        if start_time is None:
            start_time = datetime.datetime(2017, 1, 1, 0, 0, tzinfo=pytz.utc)

        dupes = {}
        uniq = {}
        bad_ids = []

        for i, hearing in enumerate(self.congressional_hearings(start_time)):
            package_id = hearing['packageId']
            try:
                package_num, = re.findall(r'\d+$', package_id)
            except ValueError:
                bad_ids.append(package_id)
                continue
            # For appropriations hearings, the committees tend to
            # publish portions of the hearings as they are completed,
            # and then the final hearings are usually compiled,
            # printed, and added to the repository at the request of
            # the Committee.
            #
            # packages with 8 digits after hrg are the in-process
            # version
            #
            # There could be some time between the in-process and
            # final packages. Publication of hearings is the purview
            # of the committee.
            #
            # https://github.com/usgpo/api/issues/21#issuecomment-435926223
            if len(package_num) == 8:
                continue

            mods_link = hearing['download']['modsLink']
            response = self.get(mods_link)
            mods = xmltodict.parse(response.content)
            extension = collections.ChainMap(*mods['mods']['extension'])

            granule_class = extension.get('granuleClass', 'boo')
            if granule_class == 'ERRATA':
                continue

            meeting_type = self._meeting_type(extension)
            if meeting_type is None:
                continue

            held_date = extension['heldDate']
            if type(held_date) is list:
                start_date = min(held_date)
            else:
                start_date = held_date

            event = Event(name=self._title(mods),
                          start_date=start_date,
                          classification=meeting_type,
                          location_name='unknown')
            if not event.name:
                continue

            if 'number' in extension:
                hearing_number = '{docClass} {congress}-{number}'.format(
                    **extension)
                self.info(hearing_number)
                event.extras['hearing_number'] = hearing_number

            for committee_d in self._unique(extension.get('congCommittee',
                                                          [])):
                names = committee_d['name']
                committee_name = self._name_type(names, 'authority-standard')
                if committee_name is None:
                    committee_name = self._name_type(names, 'authority-short')

                if committee_d['@chamber'] == 'H':
                    committee_name = 'House ' + committee_name
                elif committee_d['@chamber'] == 'S':
                    committee_name = 'Senate ' + committee_name

                try:
                    thomas_id = committee_d['@authorityId'].upper()
                except KeyError:
                    thomas_id = None

                sub_committees = self._subcommittees(committee_d)
                if sub_committees:
                    for sub_committee_d in sub_committees:
                        sub_committee_name = sub_committee_d['name']['#text']
                        sub_committee_name = sub_committee_name.strip(
                            string.punctuation)
                        sub_committee_id = _make_pseudo_id(
                            name=sub_committee_name,
                            parent__identifiers__identifier=thomas_id)
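                        # the pseudo id encodes a query (subcommittee name
                        # plus the parent committee's THOMAS id) that pupa
                        # resolves to a real organization id at import time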
                        ret = {
                            "name": sub_committee_name,
                            "entity_type": 'organization',
                            "note": 'host',
                            "organization_id": sub_committee_id,
                        }
                        event.participants.append(ret)

                else:
                    if thomas_id:
                        ret = {
                            "name": committee_name,
                            "entity_type": 'organization',
                            "note": 'host',
                            "organization_id": _make_pseudo_id(
                                identifiers__identifier=thomas_id),
                        }
                        event.participants.append(ret)
                    else:
                        event.add_committee(committee_name, note='host')

            links = mods['mods']['location']['url']
            for link in self._unique(links):
                if link['@displayLabel'] == 'Content Detail':
                    event.add_source(link['#text'], note='web')
                elif link['@displayLabel'] == 'HTML rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='text/html')
                elif link['@displayLabel'] == 'PDF rendition':
                    event.add_document('transcript',
                                       link['#text'],
                                       media_type='application/pdf')

            event.add_source(mods_link, note='API')

            self._unique_event(uniq, event, dupes)

        self._house_docs(uniq)

        for event in uniq.values():
            yield event

        with open('bad_ids.txt', 'w') as f:
            for id in bad_ids:
                f.write(id + '\n')
Beispiel #20
0
    def _parse_house_floor_xml_legislative_activity(self, xml):
        """
        Parses an XML string of House floor updates and yields Event objects in a loop.

        @param xml: XML of floor update
        @type xml: string
        @return: complete Event object
        @rtype: Event
        """
        tree = self._xml_parser(xml)

        congress = tree.xpath('.//legislative_congress')[0].get('congress')

        house_committees = self._get_current_house_committee_names()
        for fa in tree.xpath('.//floor_action'):
            fa_text = fa.xpath('.//action_description')[0].xpath('string()')

            eastern = pytz.timezone('US/Eastern')
            dt = datetime.datetime.strptime(fa.xpath('action_time')[0].get('for-search'), '%Y%m%dT%H:%M:%S')
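            # 'for-search' timestamps look like '20170104T14:07:33'
            # (an illustrative value matching the format string above)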
            event = Event('House Floor Update on {0} at {1}.'.format(dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
                          eastern.localize(dt).astimezone(pytz.utc),
                          'US/Eastern',
                          '',
                          description=fa_text,
                          classification='floor_update')

            event.set_location("East Capitol Street Northeast & First St SE, Washington, DC 20004",
                               note='House Floor', url='http://www.house.gov',
                               coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

            event.add_source(self._house_floor_src_url(date_str=tree.xpath('.//legislative_day')[0].get('date')),
                             note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

            event.extras['act-id'] = fa.get('act-id')
            event.extras['unique-id'] = fa.get('unique-id')

            # bills
            ai_b = event.add_agenda_item(description='Bills referenced by this update.')
            for bill in fa.xpath(".//a[@rel='bill']"):
                bill_name = bill.xpath('string()')
                ai_b.add_bill(bill_name, id=make_pseudo_id(identifier=bill_code_to_id(bill_name), congress=congress),
                              note="Bill was referenced on the House floor.")

            # publaws
            ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
            for law in fa.xpath(".//a[@rel='publaw']"):
                detail_url = '/'.join(law.get('href').split('/')[0:-2]) + '/content-detail.html'
                ai_p.add_bill(law.xpath('string()'),
                              id=make_pseudo_id(**self._public_law_detail_scraper(url=detail_url)),
                              note='Law was referenced on the House floor.')

            # votes
            ai_v = event.add_agenda_item(description='Votes referenced by this update.')
            for vote in fa.xpath(".//a[@rel='vote']"):
                vote_name = vote.xpath('string()')
                ai_v.add_vote(vote_name,
                              id=make_pseudo_id(identifier=vote_code_to_id(vote_name), congress=congress),
                              note='Vote was referenced on the House floor.')

            # reports
            for report in fa.xpath(".//a[@rel='report']"):
                event.add_document('Document referenced by this update.', report.get('href'), media_type='text/html')

            for name in house_committees:
                if name.replace('House ', '') in fa_text:
                    event.add_committee(name, id=make_pseudo_id(name=name))

            # TODO identify legislators and add them as participants?


            yield event
Beispiel #21
0
    def scrape_agenda(self, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf["DATE:"]
        time = metainf["TIME:"]
        where = metainf["PLACE:"]

        # check for duration in time
        if " - " in time:
            start, end = time.split(" - ")
            am_pm_srch = re.search("(?i)(am|pm)", end)
            if am_pm_srch:
                time = " ".join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M"
        ]

        event_desc = "Meeting Notice"
        if "Rise" in time:
            datetime = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime.upper():
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])
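        # e.g. "Tuesday, January 1, 2019 2:00 P.M." (illustrative) becomes
        # "Tuesday, January 1, 2019 2:00 PM" after these replacements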

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(name=event_desc,
                      start_date=self._tz.localize(datetime),
                      location_name=where)
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib["href"]
            event.add_document(bill.text_content(),
                               bill_ft,
                               media_type="application/pdf")
            root = bill.xpath("../../*")
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = (bill.getparent().getparent().getparent().getnext().
                     getnext().text_content())

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            item.add_bill(bill.text_content())

        committee = page.xpath("//span[@id='lblSession']")[0].text_content()

        event.add_participant(committee, "committee", note="host")

        yield event
Beispiel #22
0
    def scrape(self):
        web_results = self.scrapeWebCalendar()

        for event in self.events():
            # Create a key for lookups in the web_results dict.
            key = (event['EventBodyName'].strip(),
                   self.toTime(event['EventDate']).date(), event['EventTime'])

            web_event_dict = web_results.get(
                key, {
                    'Meeting Details': 'Meeting\xa0details',
                    'Audio': 'Not\xa0available',
                    'Recap/Minutes': 'Not\xa0available'
                })
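            # the defaults match the placeholder strings on the Legistar
            # calendar page; '\xa0' is a non-breaking space, so the
            # 'Not\xa0available' comparisons below detect missing data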

            body_name = event["EventBodyName"]
            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = ''

            e = Event(event_name,
                      start_time=event["start"],
                      timezone=self.TIMEZONE,
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event_dict['Audio'] != 'Not\xa0available':

                redirect_url = self.head(
                    web_event_dict['Audio']['url']).headers['Location']

                e.add_media_link(note=web_event_dict['Audio']['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event_dict['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event_dict['Recap/Minutes']['label'],
                               url=web_event_dict['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event_dict['Meeting Details'] != 'Meeting\xa0details':
                if requests.head(web_event_dict['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event_dict['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
Beispiel #23
0
    def scrape(self):

        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()

            if self.short_ids.get(committee):
                descr = "{} {}".format(
                    self.chambers[self.short_ids[committee]["chamber"]],
                    self.short_ids[committee]["name"],
                )
            else:
                descr = [x.text_content() for x in tds[1].xpath(".//span")]
                if len(descr) != 1:
                    raise Exception(
                        "expected exactly one description span, got {}".format(len(descr)))
                descr = descr[0].replace(".", "").strip()

            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib["href"]
            notice_name = notice.text

            # the listing page shows the same hearing in multiple rows.
            # combine these -- get_related_bills() will take care of adding the bills
            # and descriptions
            if notice_href in self.seen_hearings:
                continue
            else:
                self.seen_hearings.append(notice_href)

            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = TIMEZONE.localize(when)
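            # e.g. "02/25/2019 1:30 PM" (illustrative) parses with
            # "%m/%d/%Y %I:%M %p" and is then localized to the site timezone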
            event = Event(
                name=descr,
                start_date=when,
                classification="committee-meeting",
                description=descr,
                location_name=where,
            )

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee and committee in self.short_ids:
                    committee = "{} {}".format(
                        self.chambers[self.short_ids[committee]["chamber"]],
                        self.short_ids[committee]["name"],
                    )
                event.add_committee(committee, note="host")

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type="text/html")
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill["descr"].strip())
                a.add_bill(bill["bill_id"], note=bill["type"])
            yield event
Beispiel #24
0
    def scrape(self):
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' %
                                                   meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(
                        meeting_type, meeting.cssselect('td'))
                except Exception:
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))
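                # the span's text is a Unix epoch timestamp, hence the
                # int(...) conversion before datetime.fromtimestamp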

                e = Event(name=meeting_title,
                          when=meeting_date,
                          location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                if len(cell_mapping['agenda'].cssselect('a')) > 0:
                    meeting_agenda_url = cell_mapping['agenda'].cssselect(
                        'a')[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda',
                                   url=meeting_agenda_url,
                                   mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(
                        meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not len(link_url):
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text,
                                               url=link_url,
                                               mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if len(video_cell) > 0:
                    video_url_match = re.search(
                        r"http://(.*?)'",
                        video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video",
                                         url=video_url_match.group(0),
                                         mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if len(audio_cell) > 0:
                    e.add_media_link(name="Audio",
                                     url=audio_cell[0].attrib.get('href', ''),
                                     mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if len(minutes_cell) > 0:
                    e.add_media_link(name="Minutes",
                                     url=minutes_cell[0].attrib.get(
                                         'href', ''),
                                     mimetype='text/html')

                yield e
Beispiel #25
0
    def scrape(self, session=None, start=None, end=None):

        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        # testimony url, we'll need it later in a loop

        # the testimony query looks gnarly but breaks down to:
        # $filter: (Request/PaperNumber eq 'SP0219') and (Request/Legislature eq 129)
        # $orderby: LastName,FirstName,Organization
        # $expand: Request
        # $select: Id,FileType,NamePrefix,FirstName,LastName,Organization,
        # PresentedDate,FileSize,Topic

        testimony_url_base = 'http://legislature.maine.gov/backend/' \
            'breeze/data/CommitteeTestimony?' \
            '$filter=(Request%2FPaperNumber%20eq%20%27{}%27)%20and' \
            '%20(Request%2FLegislature%20eq%20{})' \
            '&$orderby=LastName%2CFirstName%2COrganization&' \
            '$expand=Request&$select=Id%2CFileType%2CNamePrefix' \
            '%2CFirstName%2CLastName%2COrganization%2CPresentedDate%2CFileSize%2CTopic'
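        # e.g. testimony_url_base.format('SP0219', 129) yields the URL-encoded
        # form of the OData query sketched in the comment above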

        if start is None:
            start_date = datetime.datetime.now().isoformat()
        else:
            start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
            start_date = start_date.isoformat()

        # default to 30 days if no end
        if end is None:
            dtdelta = datetime.timedelta(days=30)
            end_date = datetime.datetime.now() + dtdelta
            end_date = end_date.isoformat()
        else:
            end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
            end_date = end_date.isoformat()

        bills_by_event = {}

        bills_url = 'http://legislature.maine.gov/backend/breeze/data/' \
            'getCalendarEventsBills?startDate={}&endDate={}'
        bills_url = bills_url.format(start_date, end_date)
        page = json.loads(self.get(bills_url).content)

        for row in page:
            bills_by_event.setdefault(row['EventId'], [])
            bills_by_event[row['EventId']].append(row)

        # http://legislature.maine.gov/backend/breeze/data/getCalendarEventsRaw?startDate=2019-03-01T05%3A00%3A00.000Z&endDate=2019-04-01T03%3A59%3A59.999Z&OnlyPHWS=false
        url = 'http://legislature.maine.gov/backend/breeze/data/' \
            'getCalendarEventsRaw?startDate={}&endDate={}&OnlyPHWS=true'
        url = url.format(start_date, end_date)

        page = json.loads(self.get(url).content)

        for row in page:
            if row['Cancelled'] is True or row['Postponed'] is True:
                continue

            start_date = self._TZ.localize(
                dateutil.parser.parse(
                    row['FromDateTime']
                )
            )
            end_date = self._TZ.localize(
                dateutil.parser.parse(
                    row['ToDateTime']
                )
            )

            name = row['CommitteeName']

            if name is None:
                name = row['Host']

            address = row['Location']
            address = address.replace(
                'Cross Building',
                'Cross Office Building, 111 Sewall St, Augusta, ME 04330')

            address = address.replace(
                'State House',
                'Maine State House, 210 State St, Augusta, ME 04330')

            event = Event(
                start_date=start_date,
                end_date=end_date,
                name=name,
                location_name=address,
            )

            event.add_source(
                'http://legislature.maine.gov/committee/#Committees/{}'
                .format(row['CommitteeCode'])
            )

            if bills_by_event.get(row['Id']):
                for bill in bills_by_event[row['Id']]:
                    description = 'LD {}: {}'.format(bill['LD'], bill['Title'])
                    agenda = event.add_agenda_item(description=description)
                    agenda.add_bill('LD {}'.format(bill['LD']))

                    if bill['TestimonyCount'] > 0:
                        test_url = testimony_url_base.format(bill['PaperNumber'], session)
                        test_page = json.loads(self.get(test_url).content)
                        for test in test_page:
                            title = '{} {} - {}'.format(
                                test['FirstName'],
                                test['LastName'],
                                test['Organization']
                            )
                            if test['NamePrefix'] is not None:
                                title = '{} {}'.format(test['NamePrefix'], title)

                            test_url = 'http://legislature.maine.gov/backend/app/services' \
                                '/getDocument.aspx?doctype=test&documentId={}'.format(test['Id'])

                            if test['FileType'] == 'pdf':
                                media_type = "application/pdf"
                            else:
                                # assumed fallback so media_type is never
                                # unbound for non-PDF testimony files
                                media_type = 'application/octet-stream'

                            event.add_document(
                                note=title,
                                url=test_url,
                                media_type=media_type
                            )
            yield event
Beispiel #26
0
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])
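            # pupa_id gives pupa a stable identifier for this event, so
            # re-scrapes update the existing record instead of duplicating it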

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(
                            item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    error_msg = ('An agenda has duplicate agenda items on the '
                                 'Legistar grid: {event_name} on {event_date} '
                                 '({legistar_api_url}). Contact Metro, and ask '
                                 'them to remove the duplicate '
                                 'EventItemAgendaSequence.')

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
Beispiel #27
0
    def scrape(self):
        method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
        self.events = self.api(method)
        seen = set()
        for event in self.events:
            begin = self._date_parse(event.pop('when'))
            end = self._date_parse(event.pop('end'))
            all_day = event.pop('all_day', False)

            e = Event(name=event.pop('description'),
                      classification=event.pop('type'),
                      location_name=event.pop('location'),
                      timezone=event.pop('timezone'),
                      start_time=begin,
                      end_time=end,
                      all_day=all_day,)
            if len(e.name) >= 300:
                e.name = e.name[:290]

            if len(e.location['name']) >= 100:
                e.location['name'] = e.location['name'][:90]

            composite_key = (e.name, e.description, e.start_time)
            if composite_key in seen:
                print("Duplicate found: %s/%s/%s" % (composite_key))
                continue

            seen.add(composite_key)

            for source in event.pop('sources'):
                if 'retrieved' in source:
                    source.pop('retrieved')
                e.add_source(**source)

            if e.sources == []:
                continue

            ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                      'notes', '+location_url', 'session', 'id', '+chamber',
                      '+agenda', '+cancelled', '+media_contact', '+contact',
                      '+details']
            # +agenda:
            #   Agenda on old (very old) OpenStates data is actually a string
            #   and not any sort of structured data we can use in the items
            #   schema, and is only present for a handful of events.

            for i in ignore:
                if i in event:
                    event.pop(i)

            for link in ['+link', 'link']:
                if link in event:
                    e.add_source(url=event.pop(link))

            for p in event.pop('participants', []):
                type_ = {
                    "committee": "organization",
                    "legislator": "person",
                    None: None,
                }[p.get('participant_type')]

                if type_ is None:
                    # Garbage data.
                    continue

                e.add_participant(name=p['participant'],
                                  note=p['type'],
                                  type=type_,)

            for b in event.pop('related_bills', []):
                item = e.add_agenda_item(
                    b.pop('description', b.pop('+description', None)))

                item.add_bill(bill=b['bill_id'],
                              note=b.pop('type', b.pop('+type', None)))

            seen_documents = set([])
            for document in event.pop('documents', []):
                if document['url'] in seen_documents:
                    print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                        document['url'], document['name']
                    ))
                    continue

                seen_documents.add(document['url'])
                e.add_document(url=document['url'],
                               note=document['name'])

            assert event == {}, "Unknown fields: %s" % (
                ", ".join(event.keys())
            )
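            # the assert above fires only if the upstream API exposes fields
            # this scraper does not pop() and handle explicitly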

            yield e
Beispiel #28
0
    def scrape(self):
        method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
        self.events = self.api(method)
        seen = set()
        for event in self.events:
            e = Event(name=event.pop('description'),
                      classification=event.pop('type'),
                      location_name=event.pop('location'),
                      timezone=event.pop('timezone'),
                      start_time=self._date_parse(event.pop('when')),
                      end_time=self._date_parse(event.pop('end')),)
            if len(e.name) >= 300:
                e.name = e.name[:290]

            if len(e.location['name']) >= 100:
                e.location['name'] = e.location['name'][:90]

            composite_key = (e.name, e.description, e.start_time)
            if composite_key in seen:
                print("Duplicate found: %s/%s/%s" % (composite_key))
                continue

            seen.add(composite_key)

            for source in event.pop('sources'):
                if 'retrieved' in source:
                    source.pop('retrieved')
                e.add_source(**source)

            if e.sources == []:
                continue

            ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                      'notes', '+location_url', 'session', 'id', '+chamber',
                      '+agenda', '+cancelled', '+media_contact', '+contact',
                      '+details']
            # +agenda:
            #   Agenda on old (very old) OpenStates data is actually a string
            #   and not any sort of structured data we can use in the items
            #   schema, and is only present for a handful of events.

            for i in ignore:
                if i in event:
                    event.pop(i)

            for link in ['+link', 'link']:
                if link in event:
                    e.add_source(url=event.pop(link))

            for p in event.pop('participants', []):
                type_ = {
                    "committee": "organization",
                    "legislator": "person",
                    None: None,
                }[p.get('participant_type')]

                if type_ is None:
                    # Garbage data.
                    continue

                e.add_participant(name=p['participant'],
                                  note=p['type'],
                                  type=type_,)

            for b in event.pop('related_bills', []):
                item = e.add_agenda_item(
                    b.pop('description', b.pop('+description', None)))

                item.add_bill(bill=b['bill_id'],
                              note=b.pop('type', b.pop('+type', None)))

            seen_documents = set([])
            for document in event.pop('documents', []):
                if document['url'] in seen_documents:
                    print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                        document['url'], document['name']
                    ))
                    continue

                seen_documents.add(document['url'])
                e.add_document(url=document['url'],
                               note=document['name'])

            assert event == {}, "Unknown fields: %s" % (
                ", ".join(event.keys())
            )

            yield e
Beispiel #29
0
    def scrape_agenda(self, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']

        # check for duration in time
        if ' - ' in time:
            start, end = time.split(' - ')
            am_pm_srch = re.search('(?i)(am|pm)', end)
            if am_pm_srch:
                time = ' '.join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        event_desc = "Meeting Notice"
        if 'Rise' in time:
            datetime = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime.upper():
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(
            name=event_desc,
            start_date=self._tz.localize(datetime),
            location_name=where,
        )
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(
                bill.text_content(), bill_ft,
                media_type="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext().text_content()

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            item.add_bill(bill.text_content())

        committee = page.xpath("//span[@id='lblSession']")[0].text_content()

        event.add_participant(committee, 'committee', note='host')

        yield event
Beispiel #30
0
    def scrape(self):
        for event, web_event in self.events():

            body_name = event["EventBodyName"]
            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = ''

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=event["EventLocation"],
                      status=status)

            e.pupa_id = str(event['EventId'])

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(
                        item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name, type="organization")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            # Update 'e' with data from https://metro.legistar.com/Calendar.aspx, if that data exists.
            if web_event['Audio'] != 'Not\xa0available':

                redirect_url = self.head(
                    web_event['Audio']['url']).headers['Location']

                e.add_media_link(note=web_event['Audio']['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if web_event['Meeting Details'] != 'Meeting\xa0details':
                if requests.head(web_event['Meeting Details']
                                 ['url']).status_code == 200:
                    e.add_source(web_event['Meeting Details']['url'],
                                 note='web')
                else:
                    e.add_source('https://metro.legistar.com/Calendar.aspx',
                                 note='web')

            yield e
Beispiel #31
0
    def scrape(self):
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)
        
        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
                
                # attempt to map the cells across table types. 
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)                

                # detect agenda url, if present
                meeting_agenda_url = None
                if len(cell_mapping['agenda'].cssselect('a')) > 0:
                    meeting_agenda_url = cell_mapping['agenda'].cssselect('a')[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')                    

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href','')
                        if not len(link_url):
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf') 

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if len(video_cell) > 0:
                    video_url_match = re.search(r"http://(.*?)'", video_cell[0].attrib.get('onclick',''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(0), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if len(audio_cell) > 0:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if len(minutes_cell) > 0:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e
Beispiel #32
0
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

                if item["EventItemAgendaNumber"]:
                    # To the notes field, add the item number as given in the agenda minutes
                    note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                    agenda_item['notes'].append(note)

            e.add_participant(name=body_name,
                              type="organization")

            e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                         note='api')

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
Beispiel #33
0
    def scrape(self, chamber=None, session=None):
        """
        Scrape event data for all dates from the SC State House meetings page,
        then create and yield Event objects from that data.
        :param chamber:
        :param session:
        :return: yielded Event objects
        """

        chambers = {
            'upper': {'name': 'Senate', 'title': 'Senator'},
            'lower': {'name': 'House', 'title': 'Representative'},
        }
        if chamber == 'other':
            return

        if chamber is None:
            self.info('no chamber specified, using Joint Committee Meeting Schedule')
            events_url = 'http://www.scstatehouse.gov/meetings.php'
        else:
            events_url = 'http://www.scstatehouse.gov/meetings.php?chamber=%s' % (
                chambers[chamber]['name'].upper()[0]
            )

        page = self.get_page_from_url(events_url)

        meeting_year = page.xpath(
            '//h2[@class="barheader"]/span')[0].text_content()
        meeting_year = re.search(
            r'Week of [A-Z][a-z]+\s+[0-9]{1,2}, ([0-9]{4})',
            meeting_year).group(1)

        dates = page.xpath("//div[@id='contentsection']/ul")

        for date in dates:
            date_string = date.xpath('span')

            if len(date_string) == 1:
                date_string = date_string[0].text_content()
            else:
                continue

            # If a event is in the next calendar year, the date_string
            # will have a year in it
            if date_string.count(",") == 2:
                event_year = date_string[-4:]
                date_string = date_string[:-6]
            elif date_string.count(",") == 1:
                event_year = meeting_year
            else:
                raise AssertionError(
                    "This is not a valid date: '{}'".format(date_string))

            for meeting in date.xpath('li'):
                time_string = meeting.xpath('span')[0].text_content()

                if time_string == 'CANCELED' or len(
                        meeting.xpath(
                            './/span[contains(text(), "CANCELED")]')) > 0:
                    continue

                time_string = normalize_time(time_string)
                date_time = datetime.datetime.strptime(
                    event_year + ' ' + date_string
                    + ' ' + time_string, "%Y %A, %B %d %I:%M %p")
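                # e.g. "2019 Tuesday, January 8 2:00 PM" (illustrative)
                # matches the "%Y %A, %B %d %I:%M %p" format above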

                date_time = self._tz.localize(date_time)
                meeting_info = meeting.xpath(
                    'br[1]/preceding-sibling::node()')[1]
                location, description = re.search(
                    r'-- (.*?) -- (.*)', meeting_info).groups()

                # if re.search(r'committee', description, re.I):
                #     meeting_type = 'committee:meeting'
                # else:
                #     meeting_type = 'other:meeting'

                event = Event(name=description,  # Event Name
                              start_date=date_time,  # When the event will take place
                              location_name=location)  # Where the event will be

                event.add_source(events_url)

                agenda_url = meeting.xpath(".//a[contains(@href,'agendas')]")

                if agenda_url:
                    agenda_url = agenda_url[0].attrib['href']
                    event.add_source(agenda_url)
                    event.add_document(note="Agenda",
                                       url=agenda_url,
                                       media_type="application/pdf")

                    agenda_page = self.get_page_from_url(agenda_url)

                    for bill in agenda_page.xpath(
                            ".//a[contains(@href,'billsearch.php')]"):
                        # bill_url = bill.attrib['href']
                        bill_id = bill.text_content().replace(
                            '.', '').replace(' ', '')
                        # bill_description = self.get_bill_description(bill_url)

                        event.add_bill(bill_id)

                yield event