Example #1
0
    def parse_div(self, row, chamber, com):
        """Turn one hearing-schedule row into an ``Event`` and yield it.

        Args:
            row: lxml element for a single hearing entry.
            chamber: not used by this method.
            com: committee name used as host unless the event title names a
                subcommittee.

        Yields:
            Event: the populated event (exactly one).
        """
        gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        title, location, start_date, end_date = self.parse_gcal(gcal_href)

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=title,
            location_name=location,
        )
        event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        # Free-text agenda entries.
        for node in row.xpath('.//div[@class="col-xs-12a Item"]'):
            event.add_agenda_item(description=node.xpath('string(.)').strip())

        # Linked entries become an agenda item plus an attached PDF document.
        for anchor in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            link_text = anchor.xpath('string(.)').strip()
            event.add_agenda_item(description=link_text)
            event.add_document(
                link_text,
                anchor.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore"
            )

        # Containers holding a bill-number cell: agenda item tied to that bill.
        bill_containers = row.xpath(
            './/div[contains(@class,"ItemContainer")][./div[@class="col-xs-1 Item"]]')
        for container in bill_containers:
            agenda = event.add_agenda_item(
                description=container.xpath('string(.)').strip())
            bill_id = container.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill_id)

        video_links = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video_links:
            event.add_media_link(
                'Video of Hearing',
                video_links[0].xpath('@href')[0],
                'text/html'
            )

        # A title mentioning a subcommittee names the host before the first '-'.
        if 'subcommittee' in title.lower():
            host = title.split('-')[0].strip()
        else:
            host = com
        event.add_participant(
            host,
            type='committee',
            note='host',
        )
        yield event
Example #2
0
    def scrape_house_weekly_schedule(self):
        """Scrape the House weekly meeting schedule table and yield events.

        Only rows that have text in the first cell, link a PDF agenda image
        and are not marked 'Not Meeting' are considered.  Rows whose second
        cell carries no usable date/time are skipped.

        Yields:
            Event: one per scheduled meeting, with the committee as host and
            the agenda PDF attached as a document.
        """
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [
            row for row in meeting_rows
            if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '')
            and row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]')
            and 'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
        ]

        for meeting in valid_meetings:
            # xpath() returns a list, so a dead/missing link shows up as an
            # empty result (IndexError on [0]), never as a KeyError.
            agenda_links = meeting.xpath(
                './td/a[descendant::img[contains(@src,"PDF-AGENDA.png")]]/@href')
            if not agenda_links:
                continue  # Sometimes we have a dead link on dead entries.
            guid = agenda_links[0]
            self.warning("logger.debug" + guid)

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.

            # Expected shape is "date, time, location"; pad with None so the
            # unpacking works when trailing parts are missing.
            parts = [s.strip() for s in meeting_string.split(',') if s]
            date_text, time_text, location = (parts + [None] * 3)[:3]
            if not date_text:
                continue  # Nothing to parse.

            # A missing comma leaves the time glued to the date, which also
            # shifts the location into the "time" slot.
            time_srch = re.search(r'\d{1,2}:\d{2} (AM|PM)', date_text)
            if time_srch:
                location = time_text
                time_text = time_srch.group()
                date_text = date_text.replace(time_text, '').strip()

            if not time_text:
                self.warning(
                    "No time found for meeting, skipping: " + meeting_string)
                continue

            # location can legitimately be None here; str() keeps the log
            # call from raising TypeError.
            self.warning("logger.debug" + str(location))

            # The schedule omits the year; the current year is assumed.
            year = datetime.datetime.now().year
            datetime_string = ' '.join((date_text, str(year), time_text))
            when = datetime.datetime.strptime(
                datetime_string, '%b %d %Y %I:%M %p')
            # Localize exactly once.  NOTE(review): assumes self._tz is a pytz
            # timezone, whose localize() rejects already-aware datetimes.
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.warning("logger.debug" + description)

            event = Event(name=description,
                          start_date=when,
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
Example #3
0
    def scrape_event_page(self, url, event_type):
        """Scrape a single event detail page and yield the resulting event.

        Args:
            url: address of the event detail page.
            event_type: when equal to ``'Hearing'`` the page title is also
                recorded as the hosting committee.

        Yields:
            Event: the populated event (exactly one).
        """
        page = self.lxmlize(url)
        page.make_links_absolute('https://malegislature.gov/')

        title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
        # Drop the page-type suffixes, then trim once at the end so that no
        # whitespace is left behind whichever suffix was present.
        for suffix in ('Hearing Details', 'Special Event Details'):
            title = title.replace(suffix, '')
        title = title.strip()

        info_xpath = 'string(//dl[contains(@class,"eventInformation")]/dd[{}]{})'
        start_day = page.xpath(info_xpath.format(2, '')).strip()
        start_time = page.xpath(info_xpath.format(3, '')).strip()
        location = page.xpath(info_xpath.format(4, '//a')).strip()
        description = page.xpath(info_xpath.format(5, '')).strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse(
                '{} {}'.format(start_day, start_time),
            )
        )

        event = Event(
            start_date=start_date,
            name=title,
            location_name=location,
            description=description
        )

        event.add_source(url)

        agenda_rows = page.xpath(
            '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
            '/div/div/div[contains(@class,"panel-default")]')

        for row in agenda_rows:
            # only select the text node, not the spans
            agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()

            # Headings without a link carry the text directly in the <h4>.
            if agenda_title == '':
                agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()

            agenda = event.add_agenda_item(description=agenda_title)

            for bill in row.xpath('.//tbody/tr/td[1]/a/text()'):
                # Bill numbers are written with a dot ("H.1234"); store them
                # with a space instead.
                agenda.add_bill(bill.strip().replace('.', ' '))

        if event_type == 'Hearing':
            event.add_participant(
                title,
                type='committee',
                note='host',
            )

        yield event
Example #4
0
    def scrape(self):
        """Scrape the council calendar page and yield one event per entry.

        Yields:
            Event: each calendar entry; entries whose description mentions a
            committee are classified ``'committee:meeting'`` and get that
            committee as host, everything else is classified ``'other'``.
        """
        calendar_url = "http://dccouncil.us/calendar"
        data = self.get(calendar_url).text
        doc = lxml.html.fromstring(data)

        # Captures "Committee ..." up to the word "will" in the description.
        committee_regex = re.compile("(Committee .*?)will")

        for event in doc.xpath("//div[@class='event-description-dev']"):
            place_and_time = event.xpath(
                ".//div[@class='event-description-dev-metabox']/p/text()")
            when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])

            # The third text node, when present, is the location.  Trim it
            # like the date/time parts and fall back when it is blank.
            location = "unknown"
            if len(place_and_time) > 2:
                location = place_and_time[2].strip() or "unknown"

            # when is now of the following format:
            # Wednesday, 2/25/2015 9:30am
            when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")

            description_content = event.xpath(
                ".//div[@class='event-description-content-dev']")[0]
            description_lines = description_content.xpath("./*")
            name = description_lines[0].text_content()
            desc_without_title = " ".join(
                d.text_content() for d in description_lines[1:])
            description = re.sub(
                r'\s+', " ", description_content.text_content()).strip()
            potential_bills = description_content.xpath(".//li")

            committee = committee_regex.search(desc_without_title)
            event_type = 'other'
            if committee is not None:
                committee = committee.group(1).strip()
                event_type = 'committee:meeting'

            e = Event(name=name,
                      description=description,
                      start_date=self._tz.localize(when),
                      location_name=location,
                      classification=event_type,
                      )

            for b in potential_bills:
                link_text = b.xpath("./a/text()")
                if not link_text:
                    continue
                bill = link_text[0]
                # Bill links read "<session>-<number>"; a link without a dash
                # is not a bill reference, so skip it instead of failing on
                # the unpacking.
                ses, dash, num = bill.partition("-")
                if not dash:
                    continue
                bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
                bill = ses.replace(" ", "") + "-" + num.zfill(4)
                item = e.add_agenda_item(bill_desc)
                item.add_bill(bill)

            e.add_source(calendar_url)

            if committee:
                e.add_participant(committee, type='organization', note='host')

            yield e
Example #5
0
    def scrape_upper(self):
        """Scrape the Senate hearing schedule listing and yield its events.

        Yields:
            Event: one per ``<hr />``-delimited fragment of the listing page,
            hosted by the committee named in the fragment.
        """
        listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
        capitol_address = '201 W Capitol Ave, Jefferson City, MO 65101'

        html = self.get(listing_url).text

        # Events are not wrapped in their own container, which makes xpath
        # awkward; cut the markup at each <hr> and parse the pieces one by
        # one.  The piece before the first <hr> is not an event.
        fragments = html.split('<hr />')[1:]
        for fragment in fragments:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, 'Date:')
            when_time = self.row_content(page, 'Time:')
            room = self.row_content(page, 'Room:')
            location = '{}, {}'.format(room, capitol_address)

            committee_links = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()')
            # Link text continues with ", Senator ..." after the committee name.
            com = committee_links[0].split(', Senator')[0].strip()

            when_text = '{} {}'.format(when_date, when_time)
            start_date = self._TZ.localize(dateutil.parser.parse(when_text))

            event = Event(
                start_date=start_date,
                name=com,
                location_name=location
            )
            event.add_source(listing_url)
            event.add_participant(
                com,
                type='committee',
                note='host',
            )

            for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
                bill_links = bill_table.xpath(self.bill_link_xpath)
                if bill_links:
                    # Tables with a bill link describe the item in row two.
                    agenda_line = bill_table.xpath('string(tr[2])').strip()
                    agenda_item = event.add_agenda_item(description=agenda_line)
                    agenda_item.add_bill(bill_links[0].strip())
                else:
                    agenda_line = bill_table.xpath('string(tr[1])').strip()
                    event.add_agenda_item(description=agenda_line)

            yield event
Example #6
0
    def scrape(self):
        """Scrape the meeting finder and yield one event per usable listing.

        Listings are skipped when their detail page has a schedule line
        starting with "No Meeting", when the first listing column mentions
        "session", or when the detail page yields no name text.

        Yields:
            Event: the meeting, hosted by the committee from the listing and
            carrying any documents linked from the detail page.
        """
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        listing = self.lxmlize(EVENTS_URL)

        for info in listing.xpath('//ul[@id="meetingResults"]/li'):
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            schedule_text = [
                piece.strip()
                for piece in doc.xpath('//div[@class="schedule"]//text()')
            ]

            # Skip events that are placeholders or tentative
            if any(piece.startswith("No Meeting") for piece in schedule_text):
                continue

            # Also skip whole-chamber events
            host = info.xpath('span[@class="col01"]/text()')[0]
            if "session" in host.lower():
                continue

            name = " ".join(piece for piece in schedule_text if piece)

            # Skip events with no name
            if not name:
                continue

            raw_when = info.xpath('span[@class="col02"]/text()')[0]
            start_date = self._TZ.localize(
                datetime.datetime.strptime(raw_when, self._DATETIME_FORMAT)
            )
            heading = doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0]

            event = Event(
                start_date=start_date,
                name=name,
                location_name=heading.title()
            )

            event.add_participant(
                host.title(),
                type='committee',
                note='host',
            )

            for link in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                    link.xpath('text()')[0],
                    url=link.xpath('@href')[0]
                )

            event.add_source(EVENTS_URL)
            # Detail URLs can contain raw spaces; percent-encode them.
            event.add_source(event_url.replace(" ", "%20"))

            yield event
Example #7
0
    def scrape(self, chamber=None):
        """Scrape the agenda RSS feed and yield one event per feed item.

        Args:
            chamber: accepted for interface compatibility; not used here.

        Yields:
            Event: the meeting with its agenda page as a source, the hosting
            committee (when the agenda links a committee page), attached PDF
            documents and any bills those documents point to.  Items whose
            agenda page is missing or cannot be located are skipped.
        """
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        feed = self.lxmlize(URL)

        for info in feed.xpath('//item'):
            # Item titles read "<meeting title> - <date>".
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]

            event = Event(name=title,
                          start_date=self._tz.localize(
                              datetime.datetime.strptime(when, '%b %d, %Y')),
                          location_name='State Capitol'
                          )
            event.add_source(URL)

            # The agenda address is the first http:// run in the item text.
            agenda_match = re.search(r'(http://.*?)\s', info.text_content())
            if agenda_match is None:
                self.logger.warning("No agenda link in feed item, skipping")
                continue
            agenda_url = agenda_match.group(1)
            try:
                agenda_doc = self.lxmlize(agenda_url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(agenda_url)

            committee = agenda_doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name, type='committee',
                                      note='host')

            for cell in agenda_doc.xpath('.//td'):
                # Documents are opened through an onclick handler; cells
                # without one are plain layout cells and must not be indexed.
                onclick = cell.xpath('@onclick')
                if not onclick:
                    continue
                pdf_match = re.search(r'(http://.*?pdf)', onclick[0])
                if pdf_match is None:
                    continue
                event.add_document(
                    note=cell.xpath('text()')[0],
                    url=pdf_match.group(1),
                    media_type='application/pdf'
                )
                for handler in onclick:
                    if "bills/static" in handler:
                        # File name without extension is the bill identifier.
                        bill_name = handler.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item('Bill up for discussion')
                        item.add_bill(bill_name)
            yield event
Example #8
0
    def scrape_page(self, url, session, chamber):
        """Scrape one hearing notice page and return it as an ``Event``.

        Args:
            url: address of the hearing notice page.
            session: not used by this method.
            chamber: not used by this method.

        Returns:
            Event: named "<heading>, <subject matter>", with the committee as
            a participant and one agenda item per listed bill.

        Raises:
            KeyError: if the page lacks the 'Location:', 'Subject Matter:' or
                'Scheduled Date:' rows.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")

        # First table: "label / value" rows describing the hearing.
        metainf = {}
        for row in tables[0].xpath(".//tr"):
            tds = row.xpath(".//td")
            if len(tds) < 2:
                continue  # not a label/value row
            metainf[tds[0].text_content().strip()] = tds[1].text_content().strip()

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        when_text = re.sub(r"\s+", " ", metainf['Scheduled Date:'])
        # Space shim: the page may glue AM/PM to the time ("10:00AM").
        for meridiem in ("AM", "PM"):
            when_text = when_text.replace(meridiem, " " + meridiem)
        when = self.localize(
            dt.datetime.strptime(when_text, "%b %d, %Y %I:%M %p"))

        event = Event(description,
                      start_date=when,
                      location_name=where)
        event.add_source(url)

        # str.replace returns a new string; the result has to be kept for
        # the prefix to actually be removed from the participant name.
        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant(ctty_name, 'organization')

        # Second table (when present): one bill per row after the header.
        if len(tables) > 1:
            for bill in tables[1].xpath(".//tr")[1:]:
                tds = bill.xpath(".//td")
                if len(tds) < 4:
                    continue
                # First, let's get the bill ID:
                bill_id = tds[0].text_content().strip()
                agenda_item = event.add_agenda_item(bill_id)
                agenda_item.add_bill(bill_id)

        return event
Example #9
0
    def parse_event(self, row, chamber):
        """Build an ``Event`` from one meeting element and yield it.

        Args:
            row: lxml element with Sponsor, Title, Location, Schedule and
                Agenda/Item children.
            chamber: key into ``self.COMMITTEES`` / ``self.COMMITTEES_PRETTY``.

        Yields:
            Event: the meeting, hosted by the sponsoring committee.

        Raises:
            KeyError: if the sponsor code is not in ``self.COMMITTEES[chamber]``.
        """
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath('string(Sponsor)').strip()
        chamber_label = self.COMMITTEES_PRETTY[chamber]
        committee_name = '{} {}'.format(
            chamber_label,
            self.COMMITTEES[chamber][committee_code]['name']
        )

        # If the title is missing, make the name "<CHAMBER> <COMMITTEE NAME>".
        # The test has to look at the raw title: once the chamber prefix is
        # formatted in, the name can never be empty.
        title = row.xpath('string(Title)').strip()
        if title:
            name = '{} {}'.format(chamber_label, title)
        else:
            name = committee_name

        location = row.xpath('string(Location)').strip()

        # events with no location all seem to be committee hearings
        if location == '':
            location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

        start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
        # todo: do i need to self._TZ.localize() ?

        event = Event(
            start_date=start_date,
            name=name,
            location_name=location
        )

        event.add_source('http://w3.akleg.gov/index.php#tab4')

        event.add_participant(
            committee_name,
            type='committee',
            note='host',
        )

        for item in row.xpath('Agenda/Item'):
            agenda_desc = item.xpath('string(Text)').strip()
            if agenda_desc == '':
                continue
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath('BillRoot'):
                # AK Bill ids have a bunch of extra spaces: collapse the
                # inner runs and trim the ends.
                bill_id = re.sub(
                    r'\s+', ' ', item.xpath('string(BillRoot)')).strip()
                agenda_item.add_bill(bill_id)

        yield event
Example #10
0
    def scrape_lower_item(self, page):
        """Build an ``Event`` from one House hearing table and yield it.

        Args:
            page: lxml element holding the hearing's 'Committee:', 'Date:',
                'Time:' and 'Location:' rows plus any bill links.

        Yields:
            Event: the hearing, hosted by the committee, with one agenda item
            per linked bill.
        """
        import re

        com = self.table_row_content(page, 'Committee:')
        when_date = self.table_row_content(page, 'Date:')
        when_time = self.table_row_content(page, 'Time:')
        location = self.table_row_content(page, 'Location:')

        if 'house hearing room' in location.lower():
            location = '{}, {}'.format(
                location,
                '201 W Capitol Ave, Jefferson City, MO 65101'
            )

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(' :', ':')

        # Some times have extra info after the AM/PM ("... or upon
        # adjournment").  Cut the text right AFTER the first AM/PM so the
        # meridiem survives; dropping it would turn afternoon hearings into
        # morning ones.
        if 'upon' in when_time:
            through_meridiem = re.match(r'.*?(?:AM|PM)', when_time, re.DOTALL)
            if through_meridiem:
                when_time = through_meridiem.group(0)

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )

        event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        # different from general MO link xpath due to the <b>
        house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
            'or contains(@href, "bill.aspx")]/b/text()'

        for bill_title in page.xpath(house_link_xpath):
            # Link text reads "<bill no> -- <title>"; drop the HCS marker.
            bill_no = bill_title.split('--')[0].strip()
            bill_no = bill_no.replace('HCS', '').strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
Example #11
0
    def parse_event(self, row, chamber):
        """Build an ``Event`` from one meeting element and yield it.

        Args:
            row: lxml element with Sponsor, Title, Location, Schedule and
                Agenda/Item children.
            chamber: key into ``self.COMMITTEES`` / ``self.COMMITTEES_PRETTY``.

        Yields:
            Event: the meeting.  The sponsoring committee is added as host
            only when its code is known for the chamber.
        """
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath("string(Sponsor)").strip()
        chamber_label = self.COMMITTEES_PRETTY[chamber]

        # Unknown sponsor codes are labelled MISCELLANEOUS and get no host.
        known_committee = committee_code in self.COMMITTEES[chamber]
        if known_committee:
            committee_label = self.COMMITTEES[chamber][committee_code]["name"]
        else:
            committee_label = 'MISCELLANEOUS'
        committee_name = "{} {}".format(chamber_label, committee_label)

        # If the title is missing, make the name "<CHAMBER> <COMMITTEE NAME>".
        # The test has to look at the raw title: once the chamber prefix is
        # formatted in, the name can never be empty.
        title = row.xpath("string(Title)").strip()
        if title:
            name = "{} {}".format(chamber_label, title)
        else:
            name = committee_name

        location = row.xpath("string(Location)").strip()

        # events with no location all seem to be committee hearings
        if location == "":
            location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

        start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
        # todo: do i need to self._TZ.localize() ?

        event = Event(start_date=start_date, name=name, location_name=location)

        event.add_source("http://w3.akleg.gov/index.php#tab4")

        if known_committee:
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")

        for item in row.xpath("Agenda/Item"):
            agenda_desc = item.xpath("string(Text)").strip()
            if agenda_desc == "":
                continue
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                # AK Bill ids have a bunch of extra spaces: collapse the
                # inner runs and trim the ends.
                bill_id = re.sub(
                    r"\s+", " ", item.xpath("string(BillRoot)")).strip()
                agenda_item.add_bill(bill_id)

        yield event
Example #12
0
    def scrape_page(self, url, session, chamber):
        """Scrape one hearing notice page and return it as an ``Event``.

        Args:
            url: address of the hearing notice page.
            session: not used by this method.
            chamber: not used by this method.

        Returns:
            Event: named "<heading>, <subject matter>", with the committee as
            a participant and one agenda item per listed bill.

        Raises:
            KeyError: if the page lacks the 'Location:', 'Subject Matter:' or
                'Scheduled Date:' rows.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")

        # First table: "label / value" rows describing the hearing.
        metainf = {}
        for row in tables[0].xpath(".//tr"):
            tds = row.xpath(".//td")
            if len(tds) < 2:
                continue  # not a label/value row
            metainf[tds[0].text_content().strip()] = tds[1].text_content().strip()

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        when_text = re.sub(r"\s+", " ", metainf['Scheduled Date:'])
        # Space shim: the page may glue AM/PM to the time ("10:00AM").
        for meridiem in ("AM", "PM"):
            when_text = when_text.replace(meridiem, " " + meridiem)
        when = self.localize(
            dt.datetime.strptime(when_text, "%b %d, %Y %I:%M %p"))

        event = Event(description, start_date=when, location_name=where)
        event.add_source(url)

        # str.replace returns a new string; the result has to be kept for
        # the prefix to actually be removed from the participant name.
        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant(ctty_name, 'organization')

        # Second table (when present): one bill per row after the header.
        if len(tables) > 1:
            for bill in tables[1].xpath(".//tr")[1:]:
                tds = bill.xpath(".//td")
                if len(tds) < 4:
                    continue
                # First, let's get the bill ID:
                bill_id = tds[0].text_content().strip()
                agenda_item = event.add_agenda_item(bill_id)
                agenda_item.add_bill(bill_id)

        return event
    def scrape_events(self, chamber, event_id):
        """Scrape the event page for ``event_id`` and yield its event(s).

        Args:
            chamber: not used by this method.
            event_id: numeric id appended to ``self.upper_url``.

        Yields:
            Event: the meeting with its participants as hosts.

        Raises:
            Exception: when the page has no event content and ``event_id`` is
                above 1700, which signals that the id range is exhausted.
        """
        url = '%s%s' % (self.upper_url, event_id)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")

        # some ids are empty
        if not rows:
            # hack so we dont fail on the first id numbers where there are
            # some gaps between the numbers that work and not.
            if event_id > 1700:
                raise Exception(
                    "Parsing is done we are on future ids that are not used yet."
                )
            return

        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            # NOTE(review): this xpath is absolute, so every iteration reads
            # the same document-wide cell list — confirm that is intended.
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            meeting_lead = td[28].text

            # %I (12-hour clock) is required for %p to take effect; with %H
            # the AM/PM marker is parsed but ignored.
            when = datetime.datetime.strptime(when, "%m/%d/%Y  %I:%M %p")
            when = self._tz.localize(when)

            if not where:
                where = 'State House'
            event = Event(name=description,
                          start_date=when,
                          location_name=where)

            # Without an explicit participant list the meeting lead is the
            # only participant; keep it a list so the loop below sees whole
            # names rather than single characters.
            if td[20].text is None:
                participants = [meeting_lead] if meeting_lead else []
            else:
                participants = td[20].text.split(';')

            for participant in participants:
                # Drop the honorific first, then trim what is left.
                name = participant.replace('HON.', '', 1).strip()
                if name != "":
                    event.add_participant(name,
                                          type='committee',
                                          note='host')

            event.add_source(url)
            yield event
Example #14
0
    def scrape(self, session=None, chamber=None):
        """Scrape the pipe-delimited scheduled-meetings file and yield events.

        Args:
            session: session identifier whose first four characters are the
                year; defaults to ``self.latest_session()``.
            chamber: not used by this method.

        Yields:
            Event: one per row whose description ends in "- HOUSE" or
            "- SENATE" and whose date falls in the session year.
        """
        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.get(url)
        page = csv.reader(StringIO(page.text), delimiter='|')

        # Number of fields in a complete record.
        LINE_LENGTH = 11

        for row in page:
            # Deal with embedded newline characters, which cause fake new
            # rows.  next() gets a default so that running out of input ends
            # the stitching instead of raising StopIteration, which inside a
            # generator would surface as RuntimeError (PEP 479).
            while len(row) < LINE_LENGTH:
                continuation = next(page, None)
                if continuation is None:
                    break
                row += continuation
            if len(row) < LINE_LENGTH:
                continue  # incomplete trailing record

            desc = row[7].strip()

            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if not match:
                continue

            comm = re.sub(r'\s+', ' ', match.group(1).strip())
            location = row[5].strip() or 'Unknown'
            when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
            when = self._tz.localize(when)

            # Only assign events to a session if they are in the same year
            # Given that session metadata have some overlap and
            # missing end dates, this is the best option available
            session_year = int(session[:4])
            if session_year != when.year:
                continue

            description = "%s MEETING" % comm
            # NOTE(review): start_time/timezone keywords are kept as found —
            # confirm they match the Event class version this scraper targets.
            event = Event(
                name=description,
                start_time=when,
                location_name=location,
                description=description,
                timezone=self._tz.zone
            )
            event.add_source(url)

            event.add_participant(comm, type='committee', note='host')

            yield event
Example #15
0
    def scrape_chamber(self, chamber, session, start, end):
        """Yield events for ``chamber`` from the committee-meeting XML.

        Args:
            chamber: chamber to keep; meetings whose agency maps to another
                chamber via ``self.chambers`` are skipped.
            session: not used by this method.
            start: passed to ``self.get_xml``.
            end: passed to ``self.get_xml``.

        Yields:
            Event: each non-cancelled meeting, hosted by its committee, after
            ``self.scrape_agenda_items`` has been called for it.
        """
        page = self.get_xml(start, end)

        for meeting in xpath(page, '//wa:CommitteeMeeting'):
            if xpath(meeting, 'string(wa:Cancelled)') == 'true':
                continue

            agency = xpath(meeting, 'string(wa:Agency)')
            if self.chambers[agency] != chamber:
                continue

            raw_date = xpath(meeting, 'string(wa:Date)')
            event_date = self._tz.localize(
                datetime.datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S"))

            event_com = xpath(
                meeting, 'string(wa:Committees/wa:Committee/wa:LongName)')
            agenda_id = xpath(meeting, 'string(wa:AgendaId)')
            notes = xpath(meeting, 'string(wa:Notes)')

            # XML has a wa:Address but it seems useless
            address_parts = (
                xpath(meeting, 'string(wa:Room)'),
                xpath(meeting, 'string(wa:Building)'),
                xpath(meeting, 'string(wa:City)'),
                xpath(meeting, 'string(wa:State)'),
            )
            location = '{}, {}, {} {}'.format(*address_parts)

            event = Event(name=event_com, start_date=event_date,
                          location_name=location,
                          description=notes)

            event.add_source(
                'https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}'.format(
                    agenda_id))
            event.add_participant(event_com, type='committee', note='host')
            event.extras['agendaId'] = agenda_id

            self.scrape_agenda_items(agenda_id, event)

            yield event
Example #16
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Scrape one hearing page and yield it as an ``Event``.

        Args:
            session: not used by this method.
            chamber: not used by this method.
            url: address of the hearing page.
            datetime: naive start time of the hearing; localized with
                ``self._tz`` before use.

        Yields:
            Event: hosted by the committee, with the chair (when given) as a
            participant, one agenda item per bill mentioned on the page and a
            final agenda item holding the page's full text.

        Raises:
            KeyError: if the page has no 'COMMITTEE' or 'PLACE' paragraph.
        """
        page = self.lxmlize(url)

        # Every "<KEY>: <value>" paragraph becomes a metadata entry; all
        # paragraph text is also kept for the bill search further down.
        metainfo = {}
        paragraphs = []
        for p in page.xpath("//p"):
            content = re.sub(r"\s+", " ", p.text_content())
            paragraphs.append(content)
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()

        committee = metainfo['COMMITTEE']
        where = metainfo['PLACE']
        # The chair is sometimes appended to the PLACE paragraph.  partition
        # copes with any number of markers, and the place is trimmed so the
        # event location carries no trailing whitespace.
        if "CHAIR:" in where:
            where, _, chair_text = where.partition("CHAIR:")
            where = where.strip()
            metainfo['PLACE'] = where
            metainfo['CHAIR'] = chair_text.strip()

        chair = metainfo.get('CHAIR')

        plaintext = re.sub(r"\s+", " ", "\n".join(paragraphs)).strip()
        # Bill references look like "SB 12", "HR 3", "JM 7", ...
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(
            name=committee,
            start_date=self._tz.localize(datetime),
            location_name=where
        )

        event.add_source(url)
        event.add_participant(committee, type='committee', note='host')
        if chair is not None:
            event.add_participant(chair, type='legislator', note='chair')

        for bill_chamber, bill_type, number in bills:
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            item = event.add_agenda_item('Bill up for discussion')
            item.add_bill(bill_id)

        event.add_agenda_item(plaintext)

        yield event
Example #17
0
    def scrape_event_page(self, session, chamber, url, datetime):
        """Scrape one hearing page and yield it as an ``Event``.

        Args:
            session: not used by this method.
            chamber: not used by this method.
            url: address of the hearing page.
            datetime: naive start time of the hearing; localized with
                ``self._tz`` before use.

        Yields:
            Event: hosted by the committee, with the chair (when given) as a
            participant, one agenda item per bill mentioned on the page and a
            final agenda item holding the page's full text.

        Raises:
            KeyError: if the page has no 'COMMITTEE' or 'PLACE' paragraph.
        """
        page = self.lxmlize(url)

        # Every "<KEY>: <value>" paragraph becomes a metadata entry; all
        # paragraph text is also kept for the bill search further down.
        # Patterns are raw strings so "\s" is not an invalid str escape.
        metainfo = {}
        paragraphs = []
        for p in page.xpath("//p"):
            content = re.sub(r"\s+", " ", p.text_content())
            paragraphs.append(content)
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()

        committee = metainfo['COMMITTEE']
        where = metainfo['PLACE']
        # The chair is sometimes appended to the PLACE paragraph.  partition
        # copes with any number of markers, and the place is trimmed so the
        # event location carries no trailing whitespace.
        if "CHAIR:" in where:
            where, _, chair_text = where.partition("CHAIR:")
            where = where.strip()
            metainfo['PLACE'] = where
            metainfo['CHAIR'] = chair_text.strip()

        chair = metainfo.get('CHAIR')

        plaintext = re.sub(r"\s+", " ", "\n".join(paragraphs)).strip()
        # Bill references look like "SB 12", "HR 3", "JM 7", ...
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(
            name=committee,
            start_date=self._tz.localize(datetime),
            location_name=where
        )

        event.add_source(url)
        event.add_participant(committee, type='committee', note='host')
        if chair is not None:
            event.add_participant(chair, type='legislator', note='chair')

        for bill_chamber, bill_type, number in bills:
            bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
            item = event.add_agenda_item('Bill up for discussion')
            item.add_bill(bill_id)

        event.add_agenda_item(plaintext)

        yield event
Example #18
0
    def scrape_chamber(self, chamber, session, start, end):
        """Yield one Event per non-cancelled committee meeting of ``chamber``.

        Meetings come from the XML returned by ``self.get_xml(start, end)``.
        A meeting is skipped when its ``wa:Cancelled`` text is ``"true"`` or
        when ``self.chambers`` maps its ``wa:Agency`` to a different chamber.
        Agenda items are attached through ``self.scrape_agenda_items``.
        """
        date_format = "%Y-%m-%dT%H:%M:%S"
        agenda_url = "https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}"

        meetings_xml = self.get_xml(start, end)

        for meeting in xpath(meetings_xml, "//wa:CommitteeMeeting"):
            if xpath(meeting, "string(wa:Cancelled)") == "true":
                continue

            agency = xpath(meeting, "string(wa:Agency)")
            if self.chambers[agency] != chamber:
                continue

            starts_at = self._tz.localize(
                datetime.datetime.strptime(
                    xpath(meeting, "string(wa:Date)"), date_format
                )
            )
            committee_name = xpath(
                meeting, "string(wa:Committees/wa:Committee/wa:LongName)"
            )
            agenda_id = xpath(meeting, "string(wa:AgendaId)")
            notes = xpath(meeting, "string(wa:Notes)")

            # XML has a wa:Address but it seems useless
            address_parts = (
                xpath(meeting, "string(wa:Room)"),
                xpath(meeting, "string(wa:Building)"),
                xpath(meeting, "string(wa:City)"),
                xpath(meeting, "string(wa:State)"),
            )
            location = "{}, {}, {} {}".format(*address_parts)

            event = Event(
                name=committee_name,
                start_date=starts_at,
                location_name=location,
                description=notes,
            )
            event.add_source(agenda_url.format(agenda_id))
            event.add_participant(committee_name, type="committee", note="host")
            event.extras["agendaId"] = agenda_id

            self.scrape_agenda_items(agenda_id, event)

            yield event
Example #19
0
    def scrape_events(self, chamber, event_id):
        """Fetch the event page for ``event_id`` and yield the Events on it.

        The page is ``self.upper_url`` + ``event_id``.  Field values are read
        by position from the ``td.ms-formbody`` cells.  When the page has no
        ``WebPartWPQ2`` container nothing is yielded; for ids above 1700 that
        is treated as "ran past the last used id" and an Exception is raised.

        :param chamber: not used by this method.
        :param event_id: numeric id appended to ``self.upper_url``.
        :raises Exception: when ``event_id`` > 1700 and the page is empty.
        """
        url = '%s%s' % (self.upper_url, event_id)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")

        # some ids are empty
        if not rows:
            # hack so we dont fail on the first id numbers where there are some gaps
            # between the numbers that work and not.
            if event_id > 1700:
                raise Exception("Parsing is done we are on future ids that are not used yet.")
            return

        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            # NOTE(review): this XPath is absolute, so it searches the whole
            # document rather than ``link`` -- confirm that is intended.
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            try:
                # %p only influences the parsed hour together with %I, so a
                # 12-hour parse is required for PM times to come out right.
                when = datetime.datetime.strptime(when, "%m/%d/%Y  %I:%M %p")
            except ValueError:
                # Keep accepting anything the previous 24-hour format parsed.
                when = datetime.datetime.strptime(when, "%m/%d/%Y  %H:%M %p")
            when = self._tz.localize(when)

            if where is None or where == "":
                where = 'State House'
            event = Event(name=description,
                          start_date=when,
                          location_name=where)

            if td[20].text is None:
                # Wrap the single lead in a list; iterating the bare string
                # would add one participant per character.
                participants = [meeting_lead] if meeting_lead else []
            else:
                participants = td[20].text.split(';')

            for participant in participants:
                # Strip again after removing the honorific so no leading
                # space is left on the name.
                name = participant.strip().replace('HON.', '', 1).strip()
                if name:
                    event.add_participant(name, type='committee',
                                          note='host')

            event.add_source(url)
            yield event
Example #20
0
    def scrape_lower_item(self, page):
        """Build and yield one Event from a single House hearing block.

        Committee, date, time and location are read with
        ``self.table_row_content``.  The time text is cleaned up before being
        parsed by ``dateutil`` and localized with ``self._TZ``.  Each bill
        link found in ``page`` becomes an agenda item with the bill attached.

        :param page: lxml element holding one hearing's table.
        """
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, 'Committee:')
        when_date = self.table_row_content(page, 'Date:')
        when_time = self.table_row_content(page, 'Time:')
        location = self.table_row_content(page, 'Location:')

        if 'house hearing room' in location.lower():
            location = '{}, {}'.format(
                location, '201 W Capitol Ave, Jefferson City, MO 65101')

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(' :', ':')

        # some times have extra info after the AM/PM; drop the extra text but
        # keep the AM/PM marker itself, otherwise PM times are parsed as AM.
        if 'upon' in when_time:
            marker_positions = [when_time.find(marker) for marker in ('AM', 'PM')]
            marker_positions = [pos for pos in marker_positions if pos != -1]
            if marker_positions:
                when_time = when_time[:min(marker_positions) + 2]

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time)))

        event = Event(start_date=start_date, name=com, location_name=location)

        event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        # different from general MO link xpath due to the <b>
        house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
            'or contains(@href, "bill.aspx")]/b/text()'

        for bill_title in page.xpath(house_link_xpath):
            # The bill number is the text before "--", minus any "HCS" marker.
            bill_no = bill_title.split('--')[0].strip()
            bill_no = bill_no.replace('HCS', '').strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
Example #21
0
    def scrape_chamber(self, chamber, session):
        """Yield an Event for every committee meeting in the next 30 days.

        The listing URL is built from the module-level ``event_page``
        template using a chamber code and a date window of now .. now + 30
        days.  Each committee agenda page linked from the listing is fetched
        and every ``partialagendaitems`` entry on it becomes one Event.

        :param chamber: ``"upper"``, ``"lower"`` or ``"other"``.
        :param session: not used by this method.
        :raises KeyError: for any other ``chamber`` value.
        """
        cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        print_format = "%m/%d/%Y"
        now = dt.datetime.now()

        start = now.strftime(print_format)
        end = (now + timedelta(days=30)).strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        committees = page.xpath("//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm in committees:
            comm_page = self.lxmlize(comm)
            meetings = comm_page.xpath("//li[contains(@class, 'partialagendaitems')]")
            for meeting in meetings:
                heading, content = meeting.xpath("./ul/li")
                # The timestamp is the last " - " separated field; splitting
                # from the right keeps committee names containing " - " whole.
                who, when = heading.text.rsplit(" - ", 1)
                meeting_title = "Scheduled meeting of %s" % who.strip()
                where_lines = content.text_content().split("\r\n")
                where = "\r\n".join([line.strip() for line in where_lines[6:9]])

                when = dt.datetime.strptime(when.strip(), "%m/%d/%Y %I:%M:%S %p")

                location = (where or '').strip() or "unknown"

                event = Event(name=meeting_title, start_time=self._tz.localize(when),
                              timezone=self._tz.zone, location_name=location,
                              description=meeting_title)

                event.add_participant(who.strip(), type='committee', note='host')
                event.add_source(url)

                # only scraping public hearing bills for now.
                bills = meeting.xpath(".//div[text() = 'Public Hearing']/following-sibling::li"
                                      "[contains(@class, 'visible-lg')]")
                for bill in bills:
                    # Split once: a description may itself contain " - ".
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ", 1)
                    item = event.add_agenda_item(descr.strip())
                    item.add_bill(bill_id.strip())

                yield event
Example #22
0
    def scrape(self):
        """Yield one Event per record returned by ``self.events()``.

        Each Event gets its agenda items (with the matter file attached as a
        bill when present), the body as an organization participant, a web
        source, an API source, and the agenda / minutes PDFs when the record
        provides them.
        """
        for event in self.events():
            e = Event(name=event["EventBodyName"],
                      start_time=event["start"],
                      timezone=self.TIMEZONE,
                      description='',
                      location_name=event["EventLocation"],
                      status=event["status"])

            for item in self.agenda(event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                identifier = item["EventItemMatterFile"]
                if identifier:
                    agenda_item.add_bill(identifier)

            e.add_participant(name=event["EventBodyName"],
                              type="organization")

            # The placeholder source 'foo' that used to be added here is gone:
            # the real web and API sources are attached just below.

            meeting_detail_web = (
                self.WEB_URL
                + '/MeetingDetail.aspx?ID={EventId}&GUID={EventGuid}'.format(**event)
            )
            # Link the meeting detail page only when it actually exists;
            # otherwise fall back to the general calendar page.
            if requests.head(meeting_detail_web).status_code == 200:
                e.add_source(meeting_detail_web, note='web')
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**event),
                         note='api')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            yield e
Example #23
0
    def scrape_lower_item(self, page):
        """Yield the Event described by one House hearing table.

        ``self.table_row_content`` supplies the committee, date, time and
        location.  After the time text is normalised, date and time are
        parsed together by ``dateutil`` and localized with ``self._TZ``.
        Bill links inside ``page`` are added as agenda items.

        :param page: lxml element for a single hearing.
        """
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, "Committee:")
        when_date = self.table_row_content(page, "Date:")
        when_time = self.table_row_content(page, "Time:")
        location = self.table_row_content(page, "Location:")

        if "house hearing room" in location.lower():
            location = "{}, {}".format(
                location, "201 W Capitol Ave, Jefferson City, MO 65101")

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(" :", ":")

        # some times have extra info after the AM/PM.  Cut right after the
        # first AM/PM marker; cutting before it would silently turn every
        # PM time into an AM one.
        if "upon" in when_time:
            cut_at = min(
                (pos for pos in (when_time.find("AM"), when_time.find("PM"))
                 if pos != -1),
                default=None,
            )
            if cut_at is not None:
                when_time = when_time[:cut_at + 2]

        start_date = self._TZ.localize(
            dateutil.parser.parse("{} {}".format(when_date, when_time)))

        event = Event(start_date=start_date, name=com, location_name=location)

        event.add_source("https://house.mo.gov/HearingsTimeOrder.aspx")

        event.add_participant(com, type="committee", note="host")

        # different from general MO link xpath due to the <b>
        house_link_xpath = ('.//a[contains(@href, "Bill.aspx") '
                            'or contains(@href, "bill.aspx")]/b/text()')

        for bill_title in page.xpath(house_link_xpath):
            # Bill number = text before "--" with the "HCS" marker removed.
            bill_no = bill_title.split("--")[0].strip()
            bill_no = bill_no.replace("HCS", "").strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
Example #24
0
    def scrape_lower(self):
        """Yield Events parsed from the House committee-schedule PDF.

        The PDF is downloaded, converted to text and split first by day
        heading ("<Weekday>, <Month> <day>, 20xx") and then by committee
        heading.  Each committee section that matches the meeting pattern
        (time, location, chair line, description) becomes one Event; lines
        of the description that mention a House bill or resolution are added
        as agenda items with the bill attached.
        """
        PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        try:
            text = convert_pdf(path, type='text-nolayout').decode()
        finally:
            # Always clean up the downloaded file, even if conversion fails.
            os.remove(path)

        meeting_re = re.compile(
            r'''(?mxs)
            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
            .*?,\s  # Potential extra text for meeting time
            (.*?),\s  # Location, usually a room
            .*?\n  # Chairman of committee holding event
            (.*)  # Description of event
            ''')
        bill_re = re.compile(r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$')

        # ``\w+day`` matches a full weekday name, as the '%A' directive used
        # below expects.  re.split with one capture group returns
        # [preamble, heading, body, heading, body, ...].
        days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
        for date, day_text in zip(days[1::2], days[2::2]):
            sections = re.split(r'\n((?:\w+\s?)+)\n', day_text)
            for comm, section in zip(sections[1::2], sections[2::2]):
                comm = comm.strip()

                match = meeting_re.search(section)
                if match is None:
                    # Section does not look like a meeting entry.
                    continue
                (time, location, description) = match.groups()

                # "10:00 a.m." -> "10:00 AM" so that %p can parse it.
                time = time.replace(".", "").upper()
                start = datetime.datetime.strptime(
                    time + "_" + date,
                    '%I:%M %p_%A, %B %d, %Y'
                )
                start = self._tz.localize(start)

                location = location.strip()

                # Drop blank lines and lines that start with a digit.
                description = '\n'.join([
                    x.strip() for x in
                    description.split('\n')
                    if x.strip() and not x.strip()[0].isdigit()
                ])

                if not description:
                    description = '[No description provided by state]'

                event = Event(
                    name=description,
                    start_date=start,
                    location_name=location,
                    description=description
                )
                event.add_source(PDF_URL)
                event.add_participant(comm, type='committee', note='host')
                for line in description.split('\n'):
                    related = bill_re.search(line)
                    if related:
                        (related_bill, relation) = related.groups()
                        relation = relation.strip()
                        related_bill = related_bill.replace(".", "")
                        item = event.add_agenda_item(relation)
                        item.add_bill(related_bill)

                yield event
Example #25
0
    def scrape(self, window=None) :
        """Yield one Event for every (API event, web event) pair.

        ``window`` is a number of days; when given, ``utcnow() - window days``
        is passed to ``self.events`` (``None`` is passed otherwise).  Pairs
        come from ``self._merge_events``.  For each pair the method sets the
        status from ``EventAgendaStatusName``, attaches sources, agenda items,
        documents and audio links, and raises when two agenda items share the
        same ``EventItemAgendaSequence``.

        :raises ValueError: when an agenda contains duplicate
            ``EventItemAgendaSequence`` values.
        """
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # NOTE(review): a body name containing more than one '-' would
                # make this two-way unpacking raise ValueError -- confirm such
                # names cannot occur.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised", 
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides 
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
                # A set drops repeats, so a length mismatch means duplicates.
                if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                    # NOTE(review): the backslash continuations keep the
                    # source indentation inside the message text.
                    error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
                        {event_name} on {event_date} ({legistar_api_url}). \
                        Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'

                    raise ValueError(error_msg.format(event_name=e.name, 
                                                      event_date=e.start_date.strftime("%B %d, %Y"),
                                                      legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note= 'Agenda',
                               url = event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note= 'Minutes',
                               url = event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    # The audio URL is expected to answer with a redirect; the
                    # redirect target is what gets stored as the media link.
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            # 'Not\xa0available' (with a non-breaking space) is the value
            # compared against to detect a missing recap.
            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            # NOTE(review): this indexes 'event_details' unconditionally while
            # the agenda section above guards with `'event_details' in event`;
            # confirm the key is always present, otherwise this raises KeyError.
            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
    def scrape(self):
        """Yield one Event per (event, agenda) pair from ``self.events()``.

        The ``Meeting Location`` field is split on ``--``: the first two parts
        form the location, and whatever follows "Chicago, Illinois" in the
        last part is read as a status note.  That note decides whether the
        event is cancelled, skipped, relocated, or simply confirmed/passed
        (via ``confirmedOrPassed``).  Events with an empty location are
        skipped.  Documents, the recording link, the organising body and the
        agenda items are then attached.
        """
        # Substrings of the status note that mark a meeting as cancelled.
        cancelled_phrases = (
            'rescheduled to',
            'postponed to',
            'reconvened to',
            'meeting recessed',
            'recessed meeting',
            'recessed until',
            'deferred',
            'time change',
            'date change',
            'recessed meeting - reconvene',
            'cancelled',
            'new date and time',
            'rescheduled indefinitely',
            'rescheduled for',
        )
        # Exact status notes that leave the meeting as scheduled.
        unchanged_notes = (
            'meeting reconvened',
            'reconvened meeting',
            'recessed meeting',
            'reconvene meeting',
            'rescheduled hearing',
            'rescheduled meeting',
            'amended notice of meeting',
            'room change',
            'amended notice',
            'change of location',
            'revised - meeting date and time',
        )

        for event, agenda in self.events():

            description = None

            location_string = event[u'Meeting Location']

            location_list = location_string.split('--', 2)
            location = ', '.join(location_list[0:2])
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            # The date comes from the listing; the time of day from the
            # iCalendar entry.
            event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                if any(phrase in status_text for phrase in cancelled_phrases):
                    status = 'cancelled'
                elif status_text in ('rescheduled', 'recessed'):
                    status = 'cancelled'
                elif status_text in unchanged_notes:
                    status = confirmedOrPassed(when)
                elif 'room' in status_text:
                    location = status_string[1] + ', ' + location
                    # A room note does not change the schedule.  Without this
                    # assignment ``status`` was unbound (or left over from the
                    # previous event) when the Event is built below.
                    status = confirmedOrPassed(when)
                elif status_text in ('wrong meeting date',):
                    continue
                else:
                    print(status_text)
                    description = status_string[1].replace('--em--', '').strip()
                    status = confirmedOrPassed(when)
            else:
                status = confirmedOrPassed(when)

            event_kwargs = dict(name=event["Name"]["label"],
                                start_time=when,
                                timezone='US/Central',
                                location_name=location,
                                status=status)
            # Only pass a description when there is one, so the Event default
            # applies otherwise.
            if description:
                event_kwargs['description'] = description
            e = Event(**event_kwargs)

            if event['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Notice')
            self.addDocs(e, event, 'Transcript')
            self.addDocs(e, event, 'Summary')

            participant = event["Name"]["label"]
            if participant == 'City Council':
                participant = 'Chicago City Council'
            elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)':
                participant = 'Committee on Energy, Environmental Protection and Public Utilities'

            e.add_participant(name=participant,
                              type="organization")

            if agenda:
                e.add_source(event['Meeting Details']['url'], note='web')

                for item, _, _ in agenda:
                    agenda_item = e.add_agenda_item(item["Title"])
                    if item["Record #"]:
                        identifier = item["Record #"]['label']
                        # Drop a leading "S" from the record number.
                        if identifier.startswith('S'):
                            identifier = identifier[1:]
                        agenda_item.add_bill(identifier)

            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e
    def scrape_events_range(self, start_date, end_date):
        """Yield Events, and first-seen agenda-item Bills, for a date range.

        Every day from ``start_date`` up to but excluding ``end_date`` is
        looked up with ``self.extract_events_by_day``.  For each meeting an
        Event is built; when its agenda is published, each agenda item is
        added to the Event and linked to its full identifier.  A Bill is
        yielded the first time a full identifier is seen (tracked in
        ``self.seen_agenda_items``).  Bills of a meeting are yielded before
        the meeting's Event.

        :param start_date: first day (a datetime), inclusive.
        :param end_date: last day (a datetime), exclusive.
        :raises ValueError: when an agenda item does not correspond to exactly
            one full identifier.
        """
        # Loop invariants, built once instead of per event / per agenda item.
        tz = pytz.timezone("America/Toronto")
        identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')

        def daterange(start_date, end_date):
            number_of_days = int((end_date - start_date).days)
            for n in range(number_of_days):
                yield start_date + dt.timedelta(n)

        def is_agenda_available(event):
            return event['publishing_status'] in ['Agenda Published', 'Minutes Published']

        def is_council(event):
            return event['meeting'] == self.jurisdiction.name

        def normalize_wards(raw):
            # A missing value means the item applies to all wards.
            if not raw or raw == 'All':
                return 'all'
            return raw.split(', ')

        def matches_item(full_id, short_id):
            # Identifiers that do not fit the pattern simply do not match;
            # they must not abort the scrape with an AttributeError.
            match = identifier_regex.match(full_id)
            return match is not None and match.group(1) == short_id

        for date in daterange(start_date, end_date):
            events = self.extract_events_by_day(date)
            for event in events:
                time = dt.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
                org_name = event['meeting']
                e = Event(
                    name=org_name,
                    start_time=start,
                    timezone=tz.zone,
                    location_name=event['location'],
                    status=STATUS_DICT.get(event['meeting_status'])
                )
                e.add_source(source_url)
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_participant(
                    name=org_name,
                    type='organization',
                )

                if is_agenda_available(event):
                    council = is_council(event)
                    template = AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = template.format(event['meeting_id'])
                    full_identifiers = list(self.full_identifiers(event['meeting_id'], council))

                    e.add_source(agenda_url)
                    agenda_items = self.agenda_from_url(agenda_url)
                    for i, item in enumerate(agenda_items):

                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        wards = normalize_wards(item['wards'])
                        # Exactly one full identifier must match this item;
                        # the one-element unpacking enforces that.
                        [full_identifier] = [
                            full_id for full_id in full_identifiers
                            if matches_item(full_id, item['identifier'])
                        ]
                        a.add_bill(full_identifier)
                        if full_identifier not in self.seen_agenda_items:
                            b = Bill(
                                # TODO: Fix this hardcode
                                legislative_session='2014-2018',
                                identifier=full_identifier,
                                title=item['title'],
                                from_organization={'name': self.jurisdiction.name},
                            )
                            b.add_source(agenda_url)
                            b.add_document_link(note='canonical', media_type='text/html', url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                            b.extras = {
                                'wards': wards,
                            }

                            self.seen_agenda_items.append(full_identifier)

                            yield b

                yield e
    def transform_parse(self, parsed_form, response):
        """Build the scraper objects described by one parsed post-employment form.

        Reads ``office_name``, ``restriction_period`` and ``name`` from
        ``parsed_form`` and ``url`` from ``response``.

        Yields, in this order: the post-employment ``Event``, the office
        ``Organization``, the registrant ``Person`` and the ``Disclosure``
        that references them.  The same source entry is attached to each
        object immediately before it is yielded.
        """
        # Provenance shared by every yielded object; the note is a key-sorted
        # JSON dump of the form sections that identify the filing.
        note_fields = ('office_name', 'restriction_period', 'name')
        _source = {
            "url": response.url,
            "note": json.dumps(
                {field: parsed_form[field] for field in note_fields},
                sort_keys=True),
        }

        office_name = parsed_form['office_name']
        period = parsed_form['restriction_period']
        begin_text = period['restriction_period_begin_date']
        # The restriction begin date doubles as the disclosure's effective and
        # submitted date and as the event start, so parse it once.
        period_begin = datetime.strptime(
            begin_text, '%Y-%m-%d').replace(tzinfo=UTC)

        _disclosure = Disclosure(
            effective_date=period_begin,
            timezone='America/New_York',
            submitted_date=period_begin,
            classification="post_employment",
        )

        authority = self.authority
        _disclosure.add_authority(name=authority.name,
                                  type=authority._type,
                                  id=authority._id)

        _disclosure.extras['office_name'] = office_name

        # Join whichever name parts are present (None entries are dropped).
        name_section = parsed_form['name']
        name_parts = [name_section[key]
                      for key in ('name_first', 'name_middle', 'name_last')]
        registrant_name = ' '.join(
            part for part in name_parts if part is not None)

        _registrant = Person(
            name=registrant_name,
            source_identified=True
        )

        _office = Organization(
            name=office_name,
            classification='office',
            parent_id=self.jurisdiction._senate,
            source_identified=True
        )

        # Employment at the office ends on the day the restriction begins.
        membership_label = 'employee for {n}'.format(n=_office.name)
        _office.add_member(
            _registrant,
            role='employee',
            label=membership_label,
            end_date=begin_text,
        )

        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        event_name = "{rn} - {rt}, {o} (via {a})".format(
            rn=_registrant.name,
            rt="post-employment period",
            o=_office.name,
            a="Senate Office of Public Record")
        _post_employment_event = Event(
            name=event_name,
            timezone='America/New_York',
            location='United States',
            start_time=period_begin,
            end_time=datetime.strptime(
                period['restriction_period_end_date'],
                '%Y-%m-%d').replace(tzinfo=UTC),
            classification='post_employment'
        )

        _post_employment_event.add_participant(type=_registrant._type,
                                               id=_registrant._id,
                                               name=_registrant.name,
                                               note="registrant")

        _post_employment_event.extras['office_name'] = office_name

        _disclosure.add_disclosed_event(
            name=_post_employment_event.name,
            type=_post_employment_event._type,
            classification=_post_employment_event.classification,
            id=_post_employment_event._id
        )

        # Attach the source right before handing each object to the caller.
        for scraped in (_post_employment_event, _office,
                        _registrant, _disclosure):
            scraped.add_source(**_source)
            yield scraped
Example #29
0
    def scrape_event_page(self, url, chamber):
        """Scrape one committee-meeting page and yield the resulting Event.

        The rows of the ``frg_committeemeeting_MeetingTable`` table are read
        as label/value pairs.  The ``Date`` and ``Time`` values are combined
        and normalised into a localized start time; ``Location`` and
        ``Committee`` supply the event's location and name; ``Chair`` (when
        present and non-empty) is added as a participant; every bill link
        inside the ``Agenda`` cell becomes an agenda item.

        Args:
            url: address of the meeting page; also recorded as a source.
            chamber: accepted for interface compatibility; it does not
                influence the event that is built.

        Yields:
            A single ``Event``.  Nothing is yielded when the table has no
            label/value rows or the date/time text contains "Cancelled".

        Raises:
            KeyError: if the ``Date``, ``Time``, ``Location``, ``Committee``
                or ``Agenda`` row is missing.
            ValueError: if the cleaned date/time text matches neither
                supported format.
        """
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        trs = page.xpath("//table[@id='frg_committeemeeting_MeetingTable']/tr")
        metainf = {}
        for tr in trs:
            tds = tr.xpath(".//td")
            if len(tds) <= 1:
                continue
            key = tds[0].text_content().strip()
            val = tds[1]
            metainf[key] = {
                "txt": val.text_content().strip(),
                "obj": val
            }

        if not metainf:
            return

        # Wednesday, 5/16/2012 3:00 pm
        when = "%s %s" % (
            metainf['Date']['txt'],
            metainf['Time']['txt'].replace(".", "")
        )
        if "Cancelled" in when:
            return

        translate = {
            "noon": " PM",
            "a.m.": " AM",
            "am": " AM",  # This is due to a nasty line they had.
            "a.m": "AM"  # another weird one

        }

        # Applied in declaration order; replace() is a no-op when absent.
        for needle, replacement in translate.items():
            when = when.replace(needle, replacement)

        when = re.sub(r"\s+", " ", when)

        # Drop trailing scheduling caveats that follow the actual time.
        for text_to_remove in [
                "or after committees are given leave",
                "or later immediately after committees are given leave",
                "or later after committees are given leave by the House to meet",
                "**Please note time**"]:
            when = when.split(text_to_remove)[0].strip()

        when = when.replace('p.m.', 'pm')
        when = when.replace('Noon', "pm")
        try:
            start = dt.datetime.strptime(when, "%A, %m/%d/%Y %I:%M %p")
        except ValueError:
            # Some listings give only the hour, e.g. "3 pm".
            start = dt.datetime.strptime(when, "%A, %m/%d/%Y %I %p")
        where = metainf['Location']['txt']
        title = metainf['Committee']['txt']  # XXX: Find a better title

        event = Event(
            name=title,
            start_date=self._tz.localize(start),
            location_name=where,
        )
        event.add_source(url)
        event.add_source(mi_events)

        # A page without a Chair row is treated like an empty Chair value.
        chair_name = metainf.get('Chair', {}).get('txt', '').strip()
        if chair_name:
            event.add_participant(chair_name, type='legislator', note='chair')
        else:
            self.warning("No chair found for event '{}'".format(title))

        event.add_participant(title, type='committee', note='host')

        agenda = metainf['Agenda']['obj']
        agenda_text = agenda.text_content()
        agenda_lines = agenda_text.split("\r")

        # The leading "." keeps the search inside the agenda cell; a bare
        # "//a" would match links anywhere in the document.
        related_bills = agenda.xpath(".//a[contains(@href, 'getObject')]")
        for bill in related_bills:
            bill_id = bill.text_content()
            # Fall back to the whole agenda text (a string, not the element)
            # when no single agenda line mentions the bill.
            description = agenda_text.strip()
            for line in agenda_lines:
                if bill_id in line:
                    description = line

            item = event.add_agenda_item(description)
            item.add_bill(bill_id)

        yield event
    def transform_parse(self, parsed_form, response):

        _source = {
            "url": response.url,
            "note": "LDA Form LD-1"
        }

        # basic disclosure fields
        _disclosure = Disclosure(
            effective_date=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            timezone='America/New_York',
            submitted_date=datetime.strptime(
                parsed_form['datetimes']['signature_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification="lobbying"
        )

        _disclosure.add_authority(name=self.authority.name,
                                  type=self.authority._type,
                                  id=self.authority._id)

        _disclosure.add_identifier(
            identifier=parsed_form['_meta']['document_id'],
            scheme="urn:sopr:filing"
        )

        # disclosure extras
        _disclosure.extras = {}
        _disclosure.extras['registrant'] = {
            'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
            'general_description': parsed_form['registrant']['registrant_general_description'],
            'signature': {
                "signature_date": parsed_form['datetimes']['signature_date'],
                "signature": parsed_form['signature']
            }
        }

        _disclosure.extras['client'] = {
            'same_as_registrant':
                parsed_form['client']['client_self'],
            'general_description':
                parsed_form['client']['client_general_description']
        }

        _disclosure.extras['registration_type'] = {
            'is_amendment':
                parsed_form['registration_type']['is_amendment'],
            'new_registrant':
                parsed_form['registration_type']['new_registrant'],
            'new_client_for_existing_registrant':
                parsed_form['registration_type'][
                    'new_client_for_existing_registrant'],
        }

        # # Registrant
        # build registrant
        _registrant_self_employment = None

        if parsed_form['registrant']['self_employed_individual']:
            n = ' '.join([p for p in [
                parsed_form['registrant']['registrant_individual_prefix'],
                parsed_form['registrant']['registrant_individual_firstname'],
                parsed_form['registrant']['registrant_individual_lastname']
            ] if len(p) > 0]).strip()

            _registrant = Person(
                name=n,
                source_identified=True
            )

            _registrant_self_employment = Organization(
                name='SELF-EMPLOYMENT of {n}'.format(n=n),
                classification='company',
                source_identified=True
            )

            _registrant.add_membership(
                organization=_registrant_self_employment,
                role='self_employed',
                label='self-employment of {n}'.format(n=n),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant = Organization(
                name=parsed_form['registrant']['registrant_org_name'],
                classification='company',
                source_identified=True
            )

        if len(parsed_form['registrant']['registrant_house_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_house_id'],
                scheme='urn:house_clerk:registrant'
            )

        if len(parsed_form['registrant']['registrant_senate_id']) > 0:
            _registrant.add_identifier(
                identifier=parsed_form['registrant']['registrant_senate_id'],
                scheme='urn:sopr:registrant'
            )

        registrant_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['registrant']['registrant_address_one'],
                        parsed_form['registrant']['registrant_address_two'],
                        parsed_form['registrant']['registrant_city'],
                        parsed_form['registrant']['registrant_state'],
                        parsed_form['registrant']['registrant_zip'],
                        parsed_form['registrant']['registrant_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            },
        ]

        registrant_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_ppb_city'],
                    parsed_form['registrant']['registrant_ppb_state'],
                    parsed_form['registrant']['registrant_ppb_zip'],
                    parsed_form['registrant']['registrant_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if registrant_contact_ppb["value"]:
            registrant_contact_details.append(registrant_contact_ppb)

        for cd in registrant_contact_details:
            _registrant.add_contact_detail(**cd)

        _registrant.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address_one",
                            "value": parsed_form['registrant'][
                                'registrant_address_one'],
                        },
                        {
                            "note": "address_two",
                            "value": parsed_form['registrant'][
                                'registrant_address_two'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['registrant'][
                                'registrant_ppb_country'],
                        }
                    ],
                },
            ]
        }

        # # People
        # build contact
        _main_contact = Person(
            name=parsed_form['registrant']['registrant_contact_name'],
            source_identified=True
        )

        main_contact_contact_details = [
            {
                "type": "voice",
                "note": "contact phone",
                "value": parsed_form['registrant']['registrant_contact_phone'],
            },
            {
                "type": "email",
                "note": "contact email",
                "value": parsed_form['registrant']['registrant_contact_email'],
            }
        ]

        for cd in main_contact_contact_details:
            _main_contact.add_contact_detail(**cd)

        if _registrant._type == 'organization':
            _registrant.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
        else:
            _registrant_self_employment.add_member(
                name_or_person=_main_contact,
                role='main_contact',
                label='main contact for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

        # # Client
        # build client
        _client = Organization(
            name=parsed_form['client']['client_name'],
            classification='company',
            source_identified=True
        )

        client_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        parsed_form['client']['client_address'],
                        parsed_form['client']['client_city'],
                        parsed_form['client']['client_state'],
                        parsed_form['client']['client_zip'],
                        parsed_form['client']['client_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        client_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_ppb_city'],
                    parsed_form['client']['client_ppb_state'],
                    parsed_form['client']['client_ppb_zip'],
                    parsed_form['client']['client_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if client_contact_ppb["value"]:
            client_contact_details.append(client_contact_ppb)

        for cd in client_contact_details:
            _client.add_contact_detail(**cd)

        _client.extras = {
            "contact_details_structured": [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": parsed_form['client']['client_address'],
                        },
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client']['client_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value": parsed_form['client']['client_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value": parsed_form['client']['client_ppb_state'],
                        },
                        {
                            "note": "zip",
                            "value": parsed_form['client']['client_ppb_zip'],
                        },
                        {
                            "note": "country",
                            "value": parsed_form['client'][
                                'client_ppb_country'],
                        }
                    ],
                },
            ],
        }

        # Collect Foreign Entities
        _foreign_entities = []
        _foreign_entities_by_name = {}
        for fe in parsed_form['foreign_entities']:
            fe_extras = {}
            fe_name = fe['foreign_entity_name']

            # check for name-based duplicates
            if fe_name in _foreign_entities_by_name:
                _foreign_entity = _foreign_entities_by_name[fe_name]
            else:
                _foreign_entity = Organization(
                    name=fe_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            foreign_entity_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_address'],
                            fe['foreign_entity_city'],
                            fe['foreign_entity_state'],
                            fe['foreign_entity_country']]
                        if len(p) > 0]).strip(),
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "value": '; '.join([
                        p for p in [
                            fe['foreign_entity_ppb_state'],
                            fe['foreign_entity_ppb_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            foreign_entity_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_city'],
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]),
            }

            if foreign_entity_contact_ppb["value"]:
                foreign_entity_contact_details.append(
                    foreign_entity_contact_ppb)

            # add contact details
            for cd in foreign_entity_contact_details:
                if cd['value'] != '':
                    _foreign_entity.add_contact_detail(**cd)

            # add extras
            fe_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": fe['foreign_entity_address'],
                        },
                        {
                            "note": "city",
                            "value": fe['foreign_entity_city'],
                        },
                        {
                            "note": "state",
                            "value": fe['foreign_entity_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "state",
                            "value": fe['foreign_entity_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value": fe['foreign_entity_ppb_country'],
                        }
                    ],
                },
            ]

            _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                                   fe_extras)

            _foreign_entities_by_name[fe_name] = _foreign_entity

        for unique_foreign_entity in _foreign_entities_by_name.values():
            _foreign_entities.append(unique_foreign_entity)

            # TODO: add a variant on memberships to represent inter-org
            # relationships (associations, ownership, etc)
            #
            # _client['memberships'].append({
            #     "id": _foreign_entity['id'],
            #     "classification": "organization",
            #     "name": _foreign_entity['name'],
            #     "extras": {
            #         "ownership_percentage":
            #             fe['foreign_entity_amount']
            #     }
            # })

        # Collect Lobbyists
        # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
        _lobbyists_by_name = {}

        for l in parsed_form['lobbyists']:
            l_extras = {}
            l_name = ' '.join([l['lobbyist_first_name'],
                               l['lobbyist_last_name'],
                               l['lobbyist_suffix']
                               ]).strip()

            if l_name in _lobbyists_by_name:
                _lobbyist = _lobbyists_by_name[l_name]
            else:
                _lobbyist = Person(
                    name=l_name,
                    source_identified=True
                )

            if l['lobbyist_covered_official_position']:
                l_extras['lda_covered_official_positions'] = [
                    {
                        'date_reported':
                            parsed_form['datetimes']['effective_date'],
                        'covered_official_position':
                            l['lobbyist_covered_official_position']
                    },
                ]

            _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

            _lobbyists_by_name[l_name] = _lobbyist

        _lobbyists = []
        for unique_lobbyist in _lobbyists_by_name.values():
            _lobbyists.append(unique_lobbyist)

        if _registrant._type == 'organization':
            for l in _lobbyists:
                _registrant.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )
        else:
            for l in _lobbyists:
                _registrant_self_employment.add_member(
                    l,
                    role='lobbyist',
                    label='lobbyist for {n}'.format(n=_registrant.name),
                    start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
                )

        # # Document
        # build document
        _disclosure.add_document(
            note='submitted filing',
            date=parsed_form['datetimes']['effective_date'][:10],
            url=response.url
        )

        # Collect Affiliated orgs
        _affiliated_organizations = []
        _affiliated_organizations_by_name = {}
        for ao in parsed_form['affiliated_organizations']:
            ao_extras = {}
            ao_name = ao['affiliated_organization_name']
            if ao_name in _affiliated_organizations_by_name:
                # There's already one by this name
                _affiliated_organization = _affiliated_organizations_by_name[ao_name]
            else:
                # New affiliated org
                _affiliated_organization = Organization(
                    name=ao_name,
                    classification='company',
                    source_identified=True
                )

            # collect contact details
            affiliated_organization_contact_details = [
                {
                    "type": "address",
                    "note": "contact address",
                    "value": '; '.join([
                        p for p in [
                            ao['affiliated_organization_address'],
                            ao['affiliated_organization_city'],
                            ao['affiliated_organization_state'],
                            ao['affiliated_organization_zip'],
                            ao['affiliated_organization_country']]
                        if len(p) > 0]).strip(),
                },
            ]

            affiliated_organization_contact_ppb = {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_ppb_city'],
                        ao['affiliated_organization_ppb_state'],
                        ao['affiliated_organization_ppb_country']]
                    if len(p) > 0]).strip(),
            }

            if affiliated_organization_contact_ppb["value"]:
                affiliated_organization_contact_details.append(
                    affiliated_organization_contact_ppb)

            # add contact details
            for cd in affiliated_organization_contact_details:
                _affiliated_organization.add_contact_detail(**cd)

            ao_extras["contact_details_structured"] = [
                {
                    "type": "address",
                    "note": "contact address",
                    "parts": [
                        {
                            "note": "address",
                            "value": ao['affiliated_organization_address'],
                        },
                        {
                            "note": "city",
                            "value": ao['affiliated_organization_city'],
                        },
                        {
                            "note": "state",
                            "value": ao['affiliated_organization_state'],
                        },
                        {
                            "note": "zip",
                            "value": ao['affiliated_organization_zip'],
                        },
                        {
                            "note": "country",
                            "value": ao['affiliated_organization_country'],
                        }
                    ],
                },
                {
                    "type": "address",
                    "note": "principal place of business",
                    "parts": [
                        {
                            "note": "city",
                            "value":
                                ao['affiliated_organization_ppb_city'],
                        },
                        {
                            "note": "state",
                            "value":
                                ao['affiliated_organization_ppb_state'],
                        },
                        {
                            "note": "country",
                            "value":
                                ao['affiliated_organization_ppb_country'],
                        }
                    ],
                },
            ],

            _affiliated_organization.extras = combine_dicts(
                _affiliated_organization.extras, ao_extras)

        for unique_affiliated_organization in _affiliated_organizations_by_name.values():
            _affiliated_organizations.append(unique_affiliated_organization)

        # # Events & Agendas
        # name
        if parsed_form['registration_type']['new_registrant']:
            registration_type = 'New Client, New Registrant'
        elif parsed_form['registration_type']['is_amendment']:
            registration_type = 'Amended Registration'
        else:
            registration_type = 'New Client for Existing Registrant'

        # Create registration event
        _event = Event(
            name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                            rt=registration_type,
                                            cn=_client.name),
            timezone='America/New_York',
            location='United States',
            start_time=datetime.strptime(
                parsed_form['datetimes']['effective_date'],
                '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
            classification='registration'
        )

        # add participants
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

        if _registrant._type == 'person':
            _event.add_participant(type=_registrant._type,
                                   id=_registrant._id,
                                   name=_registrant.name,
                                   note="registrant")

        _event.add_participant(type=_client._type,
                               id=_client._id,
                               name=_client.name,
                               note="client")

        for l in _lobbyists:
            _event.add_participant(type=l._type,
                                   id=l._id,
                                   name=l.name,
                                   note='lobbyist')

        for fe in _foreign_entities:
            _event.add_participant(type=fe._type,
                                   id=fe._id,
                                   name=fe.name,
                                   note='foreign_entity')

        for ao in _affiliated_organizations:
            _event.add_participant(type=ao._type,
                                   id=ao._id,
                                   name=ao.name,
                                   note='affiliated_organization')

        # add agenda item
        _agenda = _event.add_agenda_item(
            description='issues lobbied on',
        )

        _agenda['notes'].append(
            parsed_form['lobbying_issues_detail']
        )

        for li in parsed_form['lobbying_issues']:
            if li['general_issue_area'] != '':
                _agenda.add_subject(li['general_issue_area'])

        _disclosure.add_disclosed_event(
            name=_event.name,
            type=_event._type,
            classification=_event.classification,
            id=_event._id
        )

        # add registrant to disclosure's _related and related_entities fields
        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        _registrant.add_source(
            url=_source['url'],
            note='registrant'
        )
        yield _registrant

        if _registrant_self_employment is not None:
            _registrant_self_employment.add_source(
                url=_source['url'],
                note='registrant_self_employment'
            )

            yield _registrant_self_employment

        _client.add_source(
            url=_source['url'],
            note='client'
        )
        yield _client

        _main_contact.add_source(
            url=_source['url'],
            note='main_contact'
        )
        yield _main_contact

        for ao in _affiliated_organizations:
            ao.add_source(
                url=_source['url'],
                note='affiliated_organization'
            )
            yield ao
        for fe in _foreign_entities:
            fe.add_source(
                url=_source['url'],
                note='foreign_entity'
            )
            yield fe
        for l in _lobbyists:
            l.add_source(
                url=_source['url'],
                note='lobbyist'
            )
            yield l

        _event.add_source(**_source)
        yield _event
        _disclosure.add_source(**_source)
        yield _disclosure
    def scrape_events_range(self, start_date, end_date):
        """Yield one ``Event`` per calendar entry between two dates.

        Every day in the half-open range ``[start_date, end_date)`` is looked
        up through ``self.extract_events_by_url`` and each entry returned is
        turned into an ``Event``.  When an entry's agenda or minutes have been
        published, the agenda page is added as a source and every agenda item
        is attached together with its matching full identifier.

        Args:
            start_date: First day to scrape.  ``.replace(hour=...)`` is called
                on the generated days, so this is presumably a
                ``datetime.datetime`` rather than a ``datetime.date`` --
                TODO confirm against callers.
            end_date: Day after the last day to scrape (exclusive).

        Yields:
            Event: one per calendar entry.

        Raises:
            ValueError: if an agenda item does not correspond to exactly one
                of the meeting's full identifiers.
        """
        # Loop invariants: built once per call instead of once per agenda item.
        identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
        published_statuses = ('Agenda Published', 'Minutes Published')

        def daterange(first_day, last_day):
            # Yields first_day, first_day + 1 day, ... and stops before last_day.
            number_of_days = int((last_day - first_day).days)
            for n in range(number_of_days):
                yield first_day + datetime.timedelta(n)

        def is_agenda_available(event):
            return event['publishing_status'] in published_statuses

        def is_council(event):
            return event['meeting'] == self.jurisdiction.name

        def matches_item(full_identifier, item_identifier):
            # A full identifier that does not follow the expected pattern can
            # never match an item; treat it as "no match" instead of crashing
            # on ``None.group``.
            match = identifier_regex.match(full_identifier)
            return match is not None and match.group(1) == item_identifier

        for date in daterange(start_date, end_date):
            # NOTE(review): ``month - 1`` suggests the calendar URL expects a
            # zero-based month -- confirm against CALENDAR_DAY_TEMPLATE.
            calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
            tz = pytz.timezone("America/Toronto")
            for event in self.extract_events_by_url(calendar_day_url):
                time = datetime.datetime.strptime(event['time'], '%I:%M %p')
                start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0))
                org_name = event['meeting']
                e = Event(
                    name=org_name,
                    start_time=start,
                    timezone=tz.zone,
                    location_name=event['location'],
                    status=STATUS_DICT.get(event['meeting_status'])
                )
                e.extras = {
                    'meeting_number': event['no'],
                    'tmmis_meeting_id': event['meeting_id'],
                }
                e.add_source(calendar_day_url)
                e.add_participant(
                    name=org_name,
                    type='organization',
                )

                if is_agenda_available(event):
                    council = is_council(event)
                    agenda_url_template = AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                    agenda_url = agenda_url_template.format(event['meeting_id'])
                    full_identifiers = list(self.full_identifiers(event['meeting_id'], council))

                    e.add_source(agenda_url)
                    for i, item in enumerate(self.agenda_from_url(agenda_url)):
                        a = e.add_agenda_item(item['title'])
                        a.add_classification(item['type'].lower())
                        a['order'] = str(i)

                        # Exactly one full identifier must correspond to the
                        # item; the single-element unpack enforces that.
                        [full_identifier] = [
                            candidate for candidate in full_identifiers
                            if matches_item(candidate, item['identifier'])
                        ]
                        a.add_bill(full_identifier)

                yield e
Example #32
0
    def scrape_chamber(self, chamber, session):
        """Yield committee-hearing ``Event`` objects for one chamber.

        Fetches the legis.iowa.gov meeting list covering ten days either side
        of today and emits one event per visible table row whose timestamp
        can be parsed.  Rows marked cancelled, "upon ...", or "To Be
        Determined" are skipped.

        Args:
            chamber: ``'upper'`` selects chamber code ``S``; any other value
                selects ``H``.
            session: Not used by this method; kept so the signature stays
                compatible with callers.

        Yields:
            Event: a hearing with the committee added as host participant.
        """
        today = datetime.date.today()
        start_date = today - datetime.timedelta(days=10)
        end_date = today + datetime.timedelta(days=10)

        chamber_abbrev = 'S' if chamber == 'upper' else 'H'

        url = ("http://www.legis.iowa.gov/committees/meetings/meetingsList"
               "Chamber?chamber=%s&bDate=%02d/%02d/"
               "%d&eDate=%02d/%02d/%d" % (chamber_abbrev,
                                          start_date.month,
                                          start_date.day,
                                          start_date.year,
                                          end_date.month,
                                          end_date.day,
                                          end_date.year))

        # Formats tried in order for a fully specified timestamp.
        timestamp_formats = ("%m/%d/%Y %I:%M %p", "%m/%d/%Y %I %p")
        junk = ['Reception']

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)
        for link in page.xpath("//div[contains(@class, 'meetings')]/table[1]/"
                               "tbody/tr[not(contains(@class, 'hidden'))]"):
            comm = link.xpath("string(./td[2]/a[1]/text())").strip()
            desc = comm + " Committee Hearing"

            location = link.xpath("string(./td[3]/text())").strip()

            when = link.xpath("string(./td[1]/span[1]/text())").strip()

            lowered = when.lower()
            if 'cancelled' in lowered or "upon" in lowered:
                continue
            if "To Be Determined" in when:
                continue

            # Drop anything trailing the AM/PM marker; text with no "AM" is
            # treated as PM.
            if 'AM' in when:
                when = when.split('AM')[0] + " AM"
            else:
                when = when.split('PM')[0] + " PM"

            for key in junk:
                when = when.replace(key, '')

            # Raw string: "\s" in a plain literal is an invalid escape sequence.
            when = re.sub(r"\s+", " ", when).strip()
            if "tbd" in when.lower():
                # OK. This is a partial date of some sort.
                when = datetime.datetime.strptime(
                    when,
                    "%m/%d/%Y TIME - TBD %p"
                )
            else:
                for fmt in timestamp_formats:
                    try:
                        parsed = datetime.datetime.strptime(when, fmt)
                    except ValueError:
                        continue
                    when = parsed
                    break
                else:
                    # No format matched: report the row and move on to the
                    # next one.
                    self.warning('error parsing timestamp %s', when)
                    continue

            event = Event(
                    name=desc,
                    description=desc,
                    start_date=self._tz.localize(when),
                    location_name=location)

            event.add_source(url)
            event.add_participant(comm, note='host', type='committee')

            yield event
Example #33
0
    def scrape_chamber(self, chamber):
        """Scrape upper or lower committee agendas.

        Reads the azleg.gov committee agenda table for ``chamber`` and yields
        one ``Event`` per row that is neither "NOT MEETING" nor "CANCELLED".
        The agenda linked from each row is parsed with ``self.parse_agenda``
        to obtain the event description and member list.

        Args:
            chamber: ``'upper'`` or another key of ``self._chamber_short`` /
                ``self._chamber_long``; the upper chamber's table has one
                column fewer than the other layout.

        Yields:
            Event: a committee meeting with the committee as host, the
            agenda's members appended as participants, and both the listing
            page and the agenda link as sources.
        """
        # could use &ShowAll=ON doesn't seem to work though
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % self._chamber_short[chamber]
        html_ = self.get(url).text
        doc = html.fromstring(html_)
        if chamber == 'upper':
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/'
                                    'tr/td/table/tr/td/table')[0]
        else:
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                    '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
            # HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            agenda_date, committee = text[0:2]
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue

            # The regex keeps only "H:MM A" / "H:MM P"; the "M" is re-added so
            # strptime's %p can read it.
            time_match = re.match(r'(\d+:\d+ (A|P))', time)
            if time_match:
                when = datetime.datetime.strptime(
                    "%s %sM" % (agenda_date, time_match.group(0)),
                    '%m/%d/%Y %I:%M %p')
            else:
                # No recognisable time: fall back to the date alone.
                when = datetime.datetime.strptime(agenda_date, '%m/%d/%Y')

            title = "Committee Meeting:\n%s %s %s\n" % (
                                              self._chamber_long[chamber],
                                              committee, room)
            agenda_info = self.parse_agenda(chamber, link)

            description = agenda_info['description']
            member_list = agenda_info['member_list']
            # NOTE: agenda_info['related_bills'] is not attached to the event
            # here; it was previously only printed to stdout for debugging.

            event = Event(location_name=room,
                          start_date=self._tz.localize(when),
                          name=title,
                          description=description,
                          )
            event.add_participant(committee, type='committee', note='host')

            event.participants.extend(member_list)
            event.add_source(url)
            event.add_source(link)
            yield event
Example #34
0
    def scrape_upper(self):
        """Yield ``Event`` objects parsed from the Ohio Senate calendar PDF.

        The PDF is downloaded, converted to text and split first by day
        heading ("<Weekday>, <Month> <day>") and then by committee heading.
        Each committee section that contains a recognisable meeting time
        becomes one event; lines of its description that mention a Senate
        bill or resolution are added as agenda items with the bill attached.

        Yields:
            Event: one per committee meeting found in the calendar.
        """
        PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        try:
            text = convert_pdf(path, type='text').decode()
        finally:
            # Always clean up the downloaded file, even if conversion fails.
            os.remove(path)

        meeting_pattern = re.compile(
            r'''(?mxs)
            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
            .*?,\s  # Potential extra text for meeting time
            (.*?)\n  # Location, usually a room
            .*?\n  # Chairman of committee holding event
            (.*)  # Description of event
            ''')
        bill_pattern = re.compile(r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$')

        # re.split with one capture group returns
        # [preamble, heading, body, heading, body, ...]; pair them up.
        days = re.split(r'(\w+day, \w+ \d{1,2})', text)
        for day_heading, day_body in zip(days[1::2], days[2::2]):
            # Calendar is put out for the current week, so use that year
            date = day_heading + ", " + str(datetime.datetime.now().year)

            sections = re.split(r'\n\n((?:\w+\s?)+),\s', day_body)
            for comm_heading, section in zip(sections[1::2], sections[2::2]):
                comm = comm_heading.strip()

                found = meeting_pattern.search(section)
                if found is None:
                    # Section does not look like a meeting entry.
                    continue
                (time, location, description) = found.groups()

                time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                        )
                time = self._tz.localize(time)

                location = location.strip()

                # Drop blank lines and page furniture from the description.
                kept_lines = []
                for raw_line in description.split('\n'):
                    line = raw_line.strip()
                    if not line:
                        continue
                    if line.startswith("Page ") or line.startswith("*Possible Vote"):
                        continue
                    if line == "NO OTHER COMMITTEES WILL MEET":
                        continue
                    kept_lines.append(line)
                description = '\n'.join(kept_lines)

                if not description:
                    description = '[No description provided by state]'

                event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                )

                event.add_source(PDF_URL)
                event.add_participant(comm, type='committee', note='host')
                for line in description.split('\n'):
                    related = bill_pattern.search(line)
                    if related:
                        (related_bill, relation) = related.groups()
                        relation = relation.strip()
                        related_bill = related_bill.replace(".", "")
                        item = event.add_agenda_item(relation)
                        item.add_bill(related_bill)

                yield event
Example #35
0
    def scrape_meeting(self, url):
        """Yield a single ``Event`` for the committee meeting page at ``url``.

        Title, date, time and location are read from fixed element ids on the
        page.  Times containing "UPON ADJ" or "TBA" produce an all-day event
        on the given date.  Nothing is yielded when the title mentions none
        of "house", "senate" or "joint".

        Args:
            url: Address of the meeting page; also recorded as the event's
                source.

        Yields:
            Event: the meeting, with the title as host committee and one
            agenda item per table row whose bill cell contains "S" or "H".
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        title = page.xpath("//a[@id='linkTitle']//text()")[0]
        date = page.xpath("//span[@id='lDate']/text()")[0]
        time = page.xpath("//span[@id='lTime']/text()")[0]
        location = page.xpath("//span[@id='lLocation']/text()")[0]

        # Normalise the AM/PM spellings seen on the site so strptime's %p
        # can read them.
        substs = {
            "AM": ["A.M.", "a.m."],
            "PM": ["P.M.", "p.m.", "Noon"],
        }

        for key, values in substs.items():
            for value in values:
                time = time.replace(value, key)

        # Make sure there's a space between the time's minutes and its AM/PM
        if re.search(r'(?i)\d[AP]M$', time):
            time = time[:-2] + " " + time[-2:]

        if re.search("UPON ADJ|TBA", ' '.join(time.split()).upper()):
            all_day = True
            when = datetime.datetime.strptime(date, "%B %d, %Y")
        else:
            all_day = False
            when = datetime.datetime.strptime("%s %s" % (
                date, time
            ), "%B %d, %Y %I:%M %p")

        description = "Meeting on %s of the %s" % (date, title)

        # Only meetings whose title names a chamber are emitted.
        lowered_title = title.lower()
        if not any(word in lowered_title for word in ("house", "senate", "joint")):
            return

        event = Event(name=description,
                      start_date=self._tz.localize(when),
                      location_name=location,
                      all_day=all_day)
        event.add_source(url)

        event.add_participant(title, note='host', type='committee')

        # Skip the first matching row.  Slicing (rather than a bare next())
        # keeps an empty table from raising StopIteration inside this
        # generator, which Python turns into RuntimeError.
        for tr in page.xpath("//tr[@valign='top']")[1:]:
            try:
                _, _, bill, whom, descr = tr.xpath("./td")
            except ValueError:
                # Row does not have exactly five cells.
                continue

            bill_title = bill.text_content()

            if "S" in bill_title or "H" in bill_title:
                item = event.add_agenda_item(descr.text_content())
                item.add_bill(bill_title)

        yield event
Example #36
0
    def scrape_agenda(self, url):
        """Yield a single ``Event`` for the meeting notice at ``url``.

        The date, time and place come from the page's ``time_place`` table.
        Nothing is yielded when that table is missing or when the date/time
        text contains "CANCELLED".  Every ``<b><a>`` link is recorded as a PDF
        document and, unless its surrounding text contains "SCHEDULED FOR",
        as an agenda item with the bill attached.

        Args:
            url: Address of the agenda page; also recorded as the event's
                source.

        Yields:
            Event: the meeting notice, hosted by the committee named in the
            ``lblSession`` element.

        Raises:
            ValueError: if the cleaned date/time text matches none of the
                known formats.
        """
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if not date_time:
            return

        # Each row is a "LABEL:" cell followed by its value cell.
        metainf = {}
        for line in date_time[0].xpath("./tr"):
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']

        # check for duration in time
        if ' - ' in time:
            start, end = time.split(' - ')
            am_pm_srch = re.search('(?i)(am|pm)', end)
            if am_pm_srch:
                # Borrow the AM/PM marker from the end of the range.
                time = ' '.join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        event_desc = "Meeting Notice"
        if 'Rise' in time:
            # The time is relative ("... Rise ..."), so keep only the date and
            # carry the wording in the event name.
            when_text = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            when_text = "%s %s" % (date, time)
        if "CANCELLED" in when_text.upper():
            return

        # Applied in this order: "P.M" must run before "PM." so that "P.M."
        # collapses all the way to "PM".
        replacements = (
            ("P.M", "PM"),
            ("PM.", "PM"),
            ("P.M.", "PM"),
            ("A.M.", "AM"),
            ("POSTPONED", ""),
            ("RESCHEDULED", ""),
            ("and Rise of the Senate", ""),
        )
        for old, new in replacements:
            when_text = when_text.replace(old, new)

        when_text = when_text.strip()

        for fmt in fmts:
            try:
                when = dt.datetime.strptime(when_text, fmt)
                break
            except ValueError:
                continue
        else:
            # Fail with a clear message instead of passing a str on to
            # tz.localize().
            raise ValueError(
                "unrecognised meeting date/time %r at %s" % (when_text, url))

        event = Event(
            name=event_desc,
            start_date=self._tz.localize(when),
            location_name=where,
        )
        event.add_source(url)
        # aight. Let's get us some bills!
        for bill in page.xpath("//b/a"):
            bill_name = bill.text_content()
            event.add_document(
                bill_name, bill.attrib['href'],
                media_type="application/pdf")

            surrounding_text = "".join(
                x.text_content() for x in bill.xpath('../../*'))
            if "SCHEDULED FOR" in surrounding_text:
                continue

            # The description is two siblings after the link's
            # great-grandparent element.
            descr = bill.getparent().getparent().getparent().getnext().getnext().text_content()

            item = event.add_agenda_item(descr)
            item.add_bill(bill_name)

        committee = page.xpath("//span[@id='lblSession']")[0].text_content()

        event.add_participant(committee, 'committee', note='host')

        yield event
Example #37
0
    def scrape(self):
        """Convert every event returned by the API into an ``Event``.

        Records are fetched once via ``self.api`` and stored on
        ``self.events``.  Duplicates (same name, description and start time)
        and records that end up without any source are dropped.  Each record
        is consumed field by field; anything left over afterwards trips the
        final assertion.

        Yields:
            Event: one per distinct record that has at least one source.
        """
        participant_kinds = {
            "committee": "organization",
            "legislator": "person",
            None: None,
        }
        # +agenda:
        #   Agenda on old (very old) OpenStates data is actually a string
        #   and not any sort of structured data we can use in the items
        #   schema, and is only present for a handful of events.
        ignored_fields = ['country', 'level', 'state', 'created_at', 'updated_at',
                          'notes', '+location_url', 'session', 'id', '+chamber',
                          '+agenda', '+cancelled', '+media_contact', '+contact',
                          '+details']

        method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
        self.events = self.api(method)
        already_seen = set()
        for record in self.events:
            begin = self._date_parse(record.pop('when'))
            end = self._date_parse(record.pop('end'))
            all_day = record.pop('all_day', False)

            name = record.pop('description')
            classification = record.pop('type')
            location_name = record.pop('location')
            timezone = record.pop('timezone')
            e = Event(name=name,
                      classification=classification,
                      location_name=location_name,
                      timezone=timezone,
                      start_time=begin,
                      end_time=end,
                      all_day=all_day,)

            # Trim over-long names and location names.
            if len(e.name) >= 300:
                e.name = e.name[:290]

            if len(e.location['name']) >= 100:
                e.location['name'] = e.location['name'][:90]

            composite_key = (e.name, e.description, e.start_time)
            if composite_key in already_seen:
                print("Duplicate found: %s/%s/%s" % (composite_key))
                continue

            already_seen.add(composite_key)

            for source in record.pop('sources'):
                source.pop('retrieved', None)
                e.add_source(**source)

            if e.sources == []:
                continue

            for field in ignored_fields:
                record.pop(field, None)

            for link_key in ('+link', 'link'):
                if link_key in record:
                    e.add_source(url=record.pop(link_key))

            for participant in record.pop('participants', []):
                kind = participant_kinds[participant.get('participant_type')]
                if kind is None:
                    # Garbage data.
                    continue

                e.add_participant(name=participant['participant'],
                                  note=participant['type'],
                                  type=kind,)

            for related in record.pop('related_bills', []):
                # The "+"-prefixed key is only a fallback, but it is always
                # consumed so it cannot linger in the record.
                fallback_description = related.pop('+description', None)
                item = e.add_agenda_item(
                    related.pop('description', fallback_description))

                bill_id = related['bill_id']
                fallback_type = related.pop('+type', None)
                item.add_bill(bill=bill_id,
                              note=related.pop('type', fallback_type))

            document_urls = set()
            for document in record.pop('documents', []):
                doc_url = document['url']
                if doc_url in document_urls:
                    print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                        document['url'], document['name']
                    ))
                    continue

                document_urls.add(doc_url)
                e.add_document(url=doc_url,
                               note=document['name'])

            # Every known field has been popped by now; leftovers mean the
            # API returned something this method does not handle.
            assert record == {}, "Unknown fields: %s" % (
                ", ".join(record.keys())
            )

            yield e
    def transform_parse(self, parsed_form, response):
        """Turn one parsed post-employment form into the objects to save.

        Builds a ``Disclosure``, the employee as a ``Person``, the office as
        an ``Organization`` and an ``Event`` spanning the post-employment
        period, links them together, attaches the same source to each, and
        yields them.

        Args:
            parsed_form: Mapping read for the keys ``office_name``,
                ``employee_name``, ``termination_date`` and
                ``lobbying_eligibility_date``; both dates are parsed with
                the ``%Y-%m-%d`` format.
            response: Object whose ``url`` attribute is used as the source
                URL.

        Yields:
            The event, the office, the registrant and the disclosure, in
            that order.
        """
        # One source record shared by every yielded object; the note is a
        # JSON dump (sorted keys) of the identifying form fields.
        _source = {
            "url": response.url,
            "note": json.dumps({'office_name': parsed_form['office_name'],
                                'termination_date': parsed_form['termination_date'],
                                'lobbying_eligibility_date': parsed_form['lobbying_eligibility_date'],
                                'name': parsed_form['employee_name']},
                               sort_keys=True)
        }

        # NOTE(review): submitted_date is set from termination_date, the same
        # value as effective_date -- confirm that is intended.
        _disclosure = Disclosure(
            effective_date=datetime.strptime(
                parsed_form['termination_date'],
                '%Y-%m-%d').replace(tzinfo=UTC),
            timezone='America/New_York',
            submitted_date=datetime.strptime(
                parsed_form['termination_date'],
                '%Y-%m-%d').replace(tzinfo=UTC),
            classification="post_employment",
        )

        _disclosure.add_authority(name=self.authority.name,
                                  type=self.authority._type,
                                  id=self.authority._id)

        _disclosure.extras['office_name'] = parsed_form["office_name"]

        # The former employee is the registrant of this disclosure.
        _registrant = Person(
            name=parsed_form['employee_name'],
            source_identified=True
        )

        _office = Organization(
            name=parsed_form['office_name'],
            classification='office',
            parent_id=self.jurisdiction._house,
            source_identified=True
        )

        # Record the employment, ending on the termination date (passed
        # through as the unparsed string from the form).
        _office.add_member(
            _registrant,
            role='employee',
            label='employee for {n}'.format(n=_office.name),
            end_date=parsed_form['termination_date'],
        )

        _disclosure.add_registrant(name=_registrant.name,
                                   type=_registrant._type,
                                   id=_registrant._id)

        # Event running from the termination date to the lobbying
        # eligibility date.
        _post_employment_event = Event(
            name="{rn} - {rt}, {o} (via {a})".format(rn=_registrant.name,
                                                     rt="post-employment period",
                                                     o=_office.name,
                                                     a="House Clerk"),
            timezone='America/New_York',
            location='United States',
            start_time=datetime.strptime(
                parsed_form['termination_date'],
                '%Y-%m-%d').replace(tzinfo=UTC),
            end_time=datetime.strptime(
                parsed_form['lobbying_eligibility_date'],
                '%Y-%m-%d').replace(tzinfo=UTC),
            classification='post_employment'
        )

        _post_employment_event.add_participant(type=_registrant._type,
                                               id=_registrant._id,
                                               name=_registrant.name,
                                               note="registrant")

        _post_employment_event.extras['office_name'] = parsed_form["office_name"]

        # Link the event to the disclosure that reports it.
        _disclosure.add_disclosed_event(
            name=_post_employment_event.name,
            type=_post_employment_event._type,
            classification=_post_employment_event.classification,
            id=_post_employment_event._id
        )

        # Attach the shared source to each object just before yielding it.
        _post_employment_event.add_source(**_source)
        yield _post_employment_event

        _office.add_source(**_source)
        yield _office

        _registrant.add_source(**_source)
        yield _registrant

        _disclosure.add_source(**_source)
        yield _disclosure