Example #1
def scrape(self):
        for page in self.eventPages(EVENTSPAGE):
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table):
                print(events)
                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1]:
                    status = status_string[1].lower()
                    if status not in ['cancelled', 'tentative', 'confirmed', 'passed']:
                        print(status)
                        status = 'confirmed'
                else:
                    status = 'confirmed'

                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(time_string,
                                                        "%I:%M %p")
                when = when.replace(hour=event_time.hour)

                e = Event(name=events["Name"]["label"],
                          when=when,
                          location=location,
                          status=status)
                e.add_source(EVENTSPAGE)
                if events['Video'] != u'Not\xa0available' :
                    print(events['Video'])

                yield e
Example #2
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            if when and description and location:
                event = Event(name=description, start_date=when.replace(tzinfo=self.tz),
                              description=description,
                              location_name=location)
                agenda = self.get_agenda(meeting)
                if agenda:
                    event.add_agenda_item(agenda)
                event.add_source(url)
                yield event
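A note on `when.replace(tzinfo=self.tz)` above: if `self.tz` is a pytz timezone, attaching it with `replace()` uses the zone's first recorded (LMT) offset rather than the offset actually in effect on that date; pytz's `localize()` is the safe way to make a naive datetime aware. A minimal sketch, assuming pytz:

import datetime

import pytz

tz = pytz.timezone('America/Chicago')
naive = datetime.datetime(2015, 2, 25, 9, 30)

# replace() attaches the zone's first (LMT) offset: -05:51 for Chicago.
print(naive.replace(tzinfo=tz).isoformat())  # 2015-02-25T09:30:00-05:51
# localize() resolves the offset in effect on that date: -06:00 (CST).
print(tz.localize(naive).isoformat())        # 2015-02-25T09:30:00-06:00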
Example #3
    def scrape(self):
        for page in self.eventPages(EVENTSPAGE):
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table):
                print(events)
                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1]:
                    status = status_string[1].lower()
                    if status not in [
                            'cancelled', 'tentative', 'confirmed', 'passed'
                    ]:
                        print(status)
                        status = 'confirmed'
                else:
                    status = 'confirmed'

                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(
                    time_string, "%I:%M %p")
                when = when.replace(hour=event_time.hour)

                e = Event(name=events["Name"]["label"],
                          when=when,
                          location=location,
                          status=status)
                e.add_source(EVENTSPAGE)
                if events['Video'] != u'Not\xa0available':
                    print(events['Video'])

                yield e
Example #4
    def scrape(self):

        EVENTS_URL = 'http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx'
        rows = self.lxmlize(EVENTS_URL).xpath(
            '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
        for row in rows[1:]:
            date = row.xpath('td')[0].text_content().strip()
            time = row.xpath('td')[1].text_content().strip()

            date_with_time = '{} {}'.format(date, time)

            location = row.xpath('td')[2].text_content().strip()

            # 11 South Union Street, Montgomery, Alabama, United States
            # TODO: if location is "room (X)", add the state house
            # TODO: replace "state house" with the street address

            # 32°22′37.294″N 86°17′57.991″W

            # host = row.xpath('td')[3].text_content().strip()
            name = row.xpath('td')[3].text_content().strip()
            details = row.xpath('td')[4].text_content().strip()

            event = Event(start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    date_with_time,
                    self._DATETIME_FORMAT,
                )),
                          name=name,
                          location_name=location,
                          description=details)

            event.add_source(EVENTS_URL)

            yield event
Example #5
    def scrape_committee_events(self, code, name):
        events_url = \
                'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
                'comm_code={}'.format(code)
        events_data = self.get(events_url).text
        events = json.loads(events_data)

        DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:

            if info['title'] is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            elif info['title'].startswith('CANCELLED:'):
                self.info(
                    "Cancelled event found; it will be skipped: {}".format(
                        info['title']))
                continue

            when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
            # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
            where = "{0} {1}".format(info['building'].strip(),
                                     info['location'].strip())
            # end_time=self._tz.localize(end),
            event = Event(
                start_time=self._tz.localize(when),
                timezone=self._tz.zone,
                location_name=where,
                name=info['title'],
                description=info['title'],
            )
            event.add_source(events_url)

            yield event
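Note that the format string above consumes the trailing 'Z' as a literal character, and the naive result is then localized into the committee's own timezone. If the feed really emits UTC (as the 'Z' suggests), converting rather than localizing would be the fix; a sketch under that assumption:

import datetime

import pytz

raw = '2017-03-01T10:00:00Z'
when = datetime.datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ')  # naive

eastern = pytz.timezone('US/Eastern')
print(eastern.localize(when).isoformat())                       # what the scraper does
print(pytz.utc.localize(when).astimezone(eastern).isoformat())  # if 'Z' really means UTC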
Example #6
    def scrape_meeting_notice(self, chamber, item, url):
        # Event Name is not provided for all meetings, so fall back to
        # the committee name.
        event_name = str(item['CommitteeName'])
        # e.g. 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'
                      .format(item['CommitteeMeetingStatusName'])
                      )

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId'])
                    )

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        for item in page_data:
            event.add_agenda_item(description=str(item['ItemDescription']))
            event.add_person(name=str(item['PrimarySponsorShortName']),
                             id=str(item['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
Example #7
    def scrape_upper(self):
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = match.group()
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y")

            # Use a list (not a filter iterator) so lines[0] works below.
            lines = [x.strip() for x in data.splitlines() if x.strip()]
            time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
            time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
            time_ = time.strptime(time_, '%I:%M %p')
            when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
            where = where.strip()

            event = Event(name=title,
                          start_date=self._tz.localize(when),
                          location_name=where)
            event.add_source(url)

            yield event
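The `\x96` terminating the TIME: regex above is the Windows-1252 en dash, which shows up as that raw character when the notice page is decoded with a Latin-style codec. A quick check:

# 0x96 maps to U+2013 (EN DASH) in Windows-1252.
print('\x96'.encode('latin-1').decode('windows-1252'))  # –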
Example #8
    def scrape_meeting_notice(self, chamber, item, url):
        # Event Name is not provided for all meetings, so fall back to
        # the committee name.
        event_name = str(item['CommitteeName'])
        # e.g. 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'.format(
                          item['CommitteeMeetingStatusName']))

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']),
                            id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId']))

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        for item in page_data:
            event.add_agenda_item(description=str(item['ItemDescription']))
            event.add_person(name=str(item['PrimarySponsorShortName']),
                             id=str(item['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
Example #9
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            if when and description and location:
                event = Event(name=description,
                              start_date=when.replace(tzinfo=self.tz),
                              description=description,
                              location_name=location)
                agenda = self.get_agenda(meeting)
                if agenda:
                    event.add_agenda_item(agenda)
                event.add_source(url)
                yield event
Example #10
def test_no_location():
    e = Event(
        name="get-together",
        start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z',
    )
    e.add_source(url='http://example.com/foobar')
    e.validate()
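The `start_date` here is a plain ISO-8601 string: `utcnow().isoformat()` is split at the decimal point to drop microseconds, then a literal 'Z' is appended. An equivalent, more direct spelling with `strftime`:

import datetime

# Same result as utcnow().isoformat().split('.')[0] + 'Z'
print(datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'))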
Example #11
    def scrape_upper(self):
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = match.group()
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y")

            # Use a list (not a filter iterator) so lines[0] works below.
            lines = [x.strip() for x in data.splitlines() if x.strip()]
            time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
            time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
            time_ = time.strptime(time_, '%I:%M %p')
            when += datetime.timedelta(hours=time_.tm_hour,
                                       minutes=time_.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
            where = where.strip()

            event = Event(name=title,
                          start_date=self._tz.localize(when),
                          location_name=where)
            event.add_source(url)

            yield event
Example #12
    def scrape_committee_events(self, code, name):
        events_url = \
                'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
                'comm_code={}'.format(code)
        events_data = self.get(events_url).text
        events = json.loads(events_data)

        DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:

            if info['title'] is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            elif info['title'].startswith('CANCELLED:'):
                self.info("Cancelled event found; it will be skipped: {}".
                          format(info['title']))
                continue

            when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
            # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
            where = "{0} {1}".format(info['building'].strip(), info['location'].strip())
            # end_time=self._tz.localize(end),
            event = Event(start_time=self._tz.localize(when),
                          timezone=self._tz.zone,
                          location_name=where,
                          name=info['title'],
                          description=info['title'],)
            event.add_source(events_url)

            yield event
Example #13
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=title,
            location_name=location,
        )

        event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore"
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link(
                'Video of Hearing',
                video[0].xpath('@href')[0],
                'text/html'
            )

        if 'subcommittee' in title.lower():
            subcom = title.split('-')[0].strip()
            event.add_participant(
                subcom,
                type='committee',
                note='host',
            )
        else:
            event.add_participant(
                com,
                type='committee',
                note='host',
            )
        yield event
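`self.parse_gcal` is a helper from the surrounding scraper and isn't shown here. Assuming the calendar link is a standard Google Calendar "render" URL carrying `text=`, `location=`, and `dates=START/END` query parameters (an assumption, not confirmed by this example), a minimal sketch could look like:

import datetime
import urllib.parse

def parse_gcal(cal_link):
    # Hypothetical field names; real links may differ.
    qs = urllib.parse.parse_qs(urllib.parse.urlparse(cal_link).query)
    title = qs['text'][0]
    location = qs['location'][0]
    start_raw, end_raw = qs['dates'][0].split('/')
    fmt = '%Y%m%dT%H%M%SZ'
    start = datetime.datetime.strptime(start_raw, fmt)
    end = datetime.datetime.strptime(end_raw, fmt)
    return title, location, start, end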
Example #14
    def scrape(self):
        page = self.lxmlize(PAGE)
        events = page.xpath("//div[@class='col-middle']//ul/li")
        when = None
        for event in events:
            h3 = event.xpath("./a/h2")
            h3 = h3[0] if h3 else None
            if h3 is not None:
                when = h3.text
            else:
                if when is None:
                    self.warning("Ungrok!")
                    continue

                b, _, i = event.xpath("./p/*")
                title = b.text_content()
                event = i.text_content()

                if "NO MEETING" in event:
                    continue

                day, title = (x.strip() for x in title.split("-", 1))

                where = "Council Chambers"

                for subevent in (x.strip() for x in event.split(";")):
                    if " in " in subevent:
                        subevent, where = subevent.rsplit(" in ", 1)
                    subevent = subevent.replace(u'\xa0', ' ')

                    if "NO" in subevent and "MEETING" in subevent:
                        continue

                    if "to follow" in subevent:
                        continue

                    info = EVENT_RE.match(subevent).groupdict()
                    event, time = [info[x] for x in ['event', 'time']]

                    ampm = {
                        "a.m.": "AM",
                        "p.m.": "PM",
                    }

                    for old, new in ampm.items():
                        time = time.replace(old, new)

                    dtstring = ", ".join([day, time])

                    try:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I:%M %p")
                    except ValueError:
                        etime = dt.datetime.strptime(dtstring,
                                                     "%m/%d/%Y, %I%p")

                    e = Event(name=event, when=etime, location=where)
                    e.add_source(PAGE)
                    yield e
Example #15
def event_obj():
    e = Event(
        name="get-together",
        start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z',
        location_name="Joe's Place",
    )
    e.add_source(url='http://example.com/foobar')
    return e
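This fixture builds a minimal valid Event for tests: a name, a start date, a location name, and one source. A hypothetical test using it:

def test_basic_event():
    e = event_obj()
    e.add_participant('Committee on Finance', type='committee', note='host')
    e.validate()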
Example #16
def event_obj():
    e = Event(
        name="get-together",
        start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z',
        location_name="Joe's Place",
    )
    e.add_source(url='http://example.com/foobar')
    return e
Example #17
    def scrape_event_page(self, url, event_type):
        page = self.lxmlize(url)
        page.make_links_absolute('https://malegislature.gov/')

        title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
        title = title.replace('Hearing Details', '').strip()
        title = title.replace('Special Event Details', '')

        start_day = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
        start_time = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()

        location = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[4]//a)'
        ).strip()

        description = page.xpath(
            'string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(start_day, start_time), ))

        event = Event(start_date=start_date,
                      name=title,
                      location_name=location,
                      description=description)

        event.add_source(url)

        agenda_rows = page.xpath(
            '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
            '/div/div/div[contains(@class,"panel-default")]')

        for row in agenda_rows:
            # only select the text node, not the spans
            agenda_title = row.xpath(
                'string(.//h4/a/text()[normalize-space()])').strip()

            if agenda_title == '':
                agenda_title = row.xpath(
                    'string(.//h4/text()[normalize-space()])').strip()

            agenda = event.add_agenda_item(description=agenda_title)

            bills = row.xpath('.//tbody/tr/td[1]/a/text()')
            for bill in bills:
                bill = bill.strip().replace('.', ' ')
                agenda.add_bill(bill)

        if event_type == 'Hearing':
            event.add_participant(
                title,
                type='committee',
                note='host',
            )

        yield event
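The `text()[normalize-space()]` step above selects only the element's own non-blank text nodes, skipping the child `<span>`s; wrapping the selection in `string(...)` collapses it to the first matching node. A standalone check with lxml:

import lxml.html

frag = lxml.html.fromstring('<h4><a>H 123 <span>Hearing</span></a></h4>')
print(frag.xpath('string(.//a/text()[normalize-space()])').strip())  # H 123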
Example #18
    def scrape_house_weekly_schedule(self):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                     '"PDF-AGENDA.png")]]/@href')[0]
                # self.logger.debug(guid)
                self.warning("logger.debug" + guid)
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None]*3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            # self.logger.debug(location)
            self.warning("logger.debug" + location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            # self.logger.debug(description)
            self.warning("logger.debug" + description)

            event = Event(name=description,
                          start_date=when,  # `when` was already localized above
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
Example #19
    def scrape_house_weekly_schedule(self):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                     '"PDF-AGENDA.png")]]/@href')[0]
                # self.logger.debug(guid)
                self.warning("logger.debug" + guid)
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                # dead entries.

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None]*3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            # self.logger.debug(location)
            self.warning("logger.debug" + location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            # self.logger.debug(description)
            self.warning("logger.debug" + description)

            event = Event(name=description,
                          start_date=when,  # `when` was already localized above
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
Example #20
    def scrape(self):
        local_timezone = pytz.timezone("US/Eastern")
        base_calendar_url = "http://www.miamidade.gov/cob/county-commission-calendar.asp"
        # Things get messy more than a few months out,
        # so we're just pulling 3 months. If we want three
        # more, they are called "nxx", "nxy" and "nxz".
        months = ["cur", "nex", "nxw"]
        for m in months:
            doc = self.lxmlize(base_calendar_url + "?next={}".format(m))
            events = doc.xpath("//table[contains(@style,'dotted #ccc')]")
            for event in events:
                rows = event.xpath(".//tr")
                for row in rows:
                    heading, data = row.xpath(".//td")
                    h = heading.text_content().lower().replace(":","").strip()
                    if h == "event":
                        title = data.text_content()
                        link = data.xpath(".//a")[0].attrib["href"]
                    elif h == "event date":
                        # %I (12-hour clock) so that %p is honored
                        when = datetime.strptime(data.text, '%m/%d/%y %I:%M%p')
                        when = local_timezone.localize(when)
                    elif h == "location":
                        where = data.text
                    elif h == "description":
                        description = data.text

                if link in DUPLICATE_EVENT_URLS:
                    continue

                if title == "Mayor's FY 2016-17 Proposed Budget Public Meeting":
                    continue

                if not description:
                    description = ""

                status = "confirmed"
                if "cancelled" in title.lower():
                    status = "cancelled"

                e = Event(name=title,
                          start_time=when,
                          timezone="US/Eastern",
                          location_name=where,
                          description=description,
                          status=status)

                e.add_source(link)
                yield e

            e = Event(name="Mayor's FY 2016-17 Proposed Budget Public Meeting",
                      start_time=local_timezone.localize(datetime.strptime('08/08/16 06:00PM', '%m/%d/%y %I:%M%p')),
                      timezone="US/Eastern",
                      location_name='111 NW 1st Street',
                      description='Pursuant to Section 2-1800A of the County Code, a Public Meeting has been scheduled by the Honorable Carlos A. Gimenez, Mayor, Miami-Dade County, to discuss the FY 2016-17 budget, tax rates, and fee changes.',
                      status='confirmed')
            e.add_source('http://miamidade.gov/wps/Events/EventDetail.jsp?eventID=447192')
            yield e
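The format strings in this example (corrected above to use `%I`) are worth a caution: with `%H`, strptime parses the AM/PM field but never applies it, since the 12-hour adjustment is only triggered by `%I`, so '06:00PM' silently parses as 6 AM. A quick demonstration:

from datetime import datetime

print(datetime.strptime('08/08/16 06:00PM', '%m/%d/%y %H:%M%p').hour)  # 6
print(datetime.strptime('08/08/16 06:00PM', '%m/%d/%y %I:%M%p').hour)  # 18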
Example #21
def event_obj():
    e = Event(
        name="get-together",
        start_time=datetime.datetime.utcnow(),
        location_name="Joe's Place",
        timezone="America/New_York",
    )
    e.add_source(url='foobar')
    return e
Example #22
    def scrape_upper(self):
        listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split('<hr />')[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, 'Date:')
            when_time = self.row_content(page, 'Time:')
            location = self.row_content(page, 'Room:')

            location = '{}, {}'.format(
                location, '201 W Capitol Ave, Jefferson City, MO 65101')

            # com = self.row_content(page, 'Committee:')
            com = page.xpath(
                '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
            )[0]
            com = com.split(', Senator')[0].strip()

            start_date = self._TZ.localize(
                dateutil.parser.parse('{} {}'.format(when_date, when_time)))

            event = Event(start_date=start_date,
                          name=com,
                          location_name=location)

            event.add_source(listing_url)

            event.add_participant(
                com,
                type='committee',
                note='host',
            )

            for bill_table in page.xpath(
                    '//table[@width="85%" and @border="0"]'):
                bill_link = ''
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath('string(tr[2])').strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

                    bill_link = bill_table.xpath(
                        self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath('string(tr[1])').strip()
                    agenda_item = event.add_agenda_item(
                        description=agenda_line)

            yield event
Example #23
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=title,
            location_name=location,
        )

        event.add_source(
            'http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(description,
                               item.xpath('@href')[0],
                               media_type="application/pdf",
                               on_duplicate="ignore")

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath(
                './/div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link('Video of Hearing',
                                 video[0].xpath('@href')[0], 'text/html')

        if 'subcommittee' in title.lower():
            subcom = title.split('-')[0].strip()
            event.add_participant(
                subcom,
                type='committee',
                note='host',
            )
        else:
            event.add_participant(
                com,
                type='committee',
                note='host',
            )
        yield event
Example #24
    def scrape_event_page(self, url, event_type):
        page = self.lxmlize(url)
        page.make_links_absolute('https://malegislature.gov/')

        title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
        title = title.replace('Hearing Details', '').strip()
        title = title.replace('Special Event Details', '')

        start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
        start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()

        location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()

        description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()

        start_date = self._TZ.localize(
            dateutil.parser.parse(
                '{} {}'.format(start_day, start_time),
            )
        )

        event = Event(
            start_date=start_date,
            name=title,
            location_name=location,
            description=description
        )

        event.add_source(url)

        agenda_rows = page.xpath(
            '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
            '/div/div/div[contains(@class,"panel-default")]')

        for row in agenda_rows:
            # only select the text node, not the spans
            agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()

            if agenda_title == '':
                agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()

            agenda = event.add_agenda_item(description=agenda_title)

            bills = row.xpath('.//tbody/tr/td[1]/a/text()')
            for bill in bills:
                bill = bill.strip().replace('.', ' ')
                agenda.add_bill(bill)

        if event_type == 'Hearing':
            event.add_participant(
                title,
                type='committee',
                note='host',
            )

        yield event
Example #25
    def scrape(self, chamber=None):
        URL = "http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas"
        doc = self.lxmlize(URL)
        events = doc.xpath("//item")

        for info in events:
            title_and_date = info.xpath("title/text()")[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(
                name=title,
                start_date=self._tz.localize(
                    datetime.datetime.strptime(when, "%b %d, %Y")),
                location_name="State Capitol",
            )
            event.add_source(URL)

            url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name,
                                      type="committee",
                                      note="host")

            documents = doc.xpath(".//td")
            for document in documents:
                url = re.search(r"(http://.*?pdf)",
                                document.xpath("@onclick")[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(
                    note=document.xpath("text()")[0],
                    url=url,
                    media_type="application/pdf",
                )
                bills = document.xpath("@onclick")
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item("Bill up for discussion")
                        item.add_bill(bill_name)
            yield event
Example #26
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr,
                          start_time=when,
                          classification='committee-meeting',
                          description=descr,
                          location_name=where,
                          timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get("committee", {
                        "chamber": "unknown",
                        "name": committee
                    })

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(bill['bill_id'], note=bill['type'])
            yield event
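One thing to watch here: the naive `when` is localized as UTC (`pytz.utc.localize`) even though the event is stamped with the US/Eastern zone. If the site publishes local wall-clock times, `tz.localize(when)` is presumably the intent; the two readings differ by the UTC offset:

import datetime

import pytz

tz = pytz.timezone('US/Eastern')
when = datetime.datetime(2017, 3, 1, 10, 0)

print(pytz.utc.localize(when).isoformat())  # 2017-03-01T10:00:00+00:00
print(tz.localize(when).isoformat())        # 2017-03-01T10:00:00-05:00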
Example #27
    def scrape(self):
        calendar_url = "http://dccouncil.us/calendar"
        data = self.get(calendar_url).text
        doc = lxml.html.fromstring(data)

        committee_regex = re.compile("(Committee .*?)will")

        event_list = doc.xpath("//div[@class='event-description-dev']")
        for event in event_list:
            place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
            when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])
            if len(place_and_time) > 2:
                location = place_and_time[2]
            else:
                location = "unknown"
            # `when` is now in the following format:
            # Wednesday, 2/25/2015 9:30am
            when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
            description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
            description_lines = description_content.xpath("./*")
            name = description_lines[0].text_content()
            desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
            description = re.sub(r'\s+', " ", description_content.text_content()).strip()
            potential_bills = description_content.xpath(".//li")

            committee = committee_regex.search(desc_without_title)
            event_type = 'other'
            if committee is not None:
                committee = committee.group(1).strip()
                event_type = 'committee:meeting'

            e = Event(name=name,
                      description=description,
                      start_date=self._tz.localize(when),
                      location_name=location,
                      classification=event_type,
                      )

            for b in potential_bills:
                bill = b.xpath("./a/text()")
                if len(bill) == 0:
                    continue
                bill = bill[0]
                bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
                ses, num = bill.split("-")
                bill = ses.replace(" ", "") + "-" + num.zfill(4)
                item = e.add_agenda_item(bill_desc)
                item.add_bill(bill)

            e.add_source(calendar_url)

            if committee:
                e.add_participant(committee, type='organization', note='host')

            yield e
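The bill-number cleanup above strips spaces from the session prefix and zero-pads the number to four digits, e.g.:

ses, num = 'B 21-123'.split('-')
print(ses.replace(' ', '') + '-' + num.zfill(4))  # B21-0123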
Example #28
    def scrape_upper(self):
        listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split('<hr />')[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, 'Date:')
            when_time = self.row_content(page, 'Time:')
            location = self.row_content(page, 'Room:')

            location = '{}, {}'.format(
                location,
                '201 W Capitol Ave, Jefferson City, MO 65101'
            )

            # com = self.row_content(page, 'Committee:')
            com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
            com = com.split(', Senator')[0].strip()

            start_date = self._TZ.localize(
                dateutil.parser.parse('{} {}'.format(when_date, when_time))
            )

            event = Event(
                start_date=start_date,
                name=com,
                location_name=location
            )

            event.add_source(listing_url)

            event.add_participant(
                com,
                type='committee',
                note='host',
            )

            for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
                bill_link = ''
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath('string(tr[2])').strip()
                    agenda_item = event.add_agenda_item(description=agenda_line)

                    bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath('string(tr[1])').strip()
                    agenda_item = event.add_agenda_item(description=agenda_line)

            yield event
Example #29
    def scrape_chamber(self, chamber):
        url = utils.urls['events'][chamber]
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath(
                '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
            date_string = table.xpath(
                'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
            for row in table.xpath('tr'):
                time_string = row.xpath(
                    'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
                description = row.xpath(
                    'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
                )[-1].text_content().strip()
                location = row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                     )[0].text_content().strip()
                committees = row.xpath(
                    './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
                )
                bills = row.xpath('.//a[contains(@href, "billinfo")]')

                try:
                    start_time = datetime.datetime.strptime(
                        '{} {}'.format(date_string, time_string),
                        '%m/%d/%Y %I:%M %p',
                    )
                except ValueError:
                    break

                event = Event(
                    name=description,
                    start_time=self._tz.localize(start_time),
                    location_name=location,
                    timezone=self._tz.zone,
                )
                event.add_source(url)

                if bills or committees:
                    item = event.add_agenda_item(description)
                    for bill in bills:
                        parsed = urllib.parse.urlparse(bill.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        # parse_qs values are lists; take the first element.
                        item.add_bill('{}{} {}'.format(qs['body'][0], qs['type'][0],
                                                       qs['bn'][0]))
                    for committee in committees:
                        parsed = urllib.parse.urlparse(committee.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        item.add_committee(
                            re.sub(r' \((S|H)\)$', '', committee.text),
                            id=qs.get('Code', [None])[0],
                        )

                yield event
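The `[0]` indexing added above matters because `urllib.parse.parse_qs` always maps each key to a list of values:

import urllib.parse

qs = urllib.parse.parse_qs('body=H&type=B&bn=123')
print(qs)  # {'body': ['H'], 'type': ['B'], 'bn': ['123']}
print('{}{} {}'.format(qs['body'][0], qs['type'][0], qs['bn'][0]))  # HB 123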
Example #30
    def scrape(self):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            name = " ".join(
                x.strip()
                for x in doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()
            )

            # Skip events with no name
            if not name:
                continue

            event = Event(
                start_date=self._TZ.localize(
                    datetime.datetime.strptime(
                        info.xpath('span[@class="col02"]/text()')[0],
                        self._DATETIME_FORMAT,
                    )
                ),
                name=name,
                location_name=doc.xpath(
                    '//div[@class="heading-container"]/span/text()'
                )[0].title()
            )

            event.add_participant(
                info.xpath('span[@class="col01"]/text()')[0].title(),
                type='committee',
                note='host',
            )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                    document.xpath('text()')[0],
                    url=document.xpath('@href')[0]
                )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            yield event
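`event_url.replace(" ", "%20")` patches the one character known to break these URLs; `urllib.parse.quote` with a suitable `safe` set is the general version of the same escaping (hypothetical URL for illustration):

import urllib.parse

url = 'http://example.com/meeting detail'
print(urllib.parse.quote(url, safe=':/?=&'))  # http://example.com/meeting%20detail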
Example #31
    def scrape_event_page(self, event):
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        info = page.xpath(
            "//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        when = dt.datetime.utcnow()
        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub("^@", "", start).strip()
        end = end.replace("-", "").strip()

        replace = [
            ('Apr', 'April'),
        ]

        skip = ["Occurs every"]

        for k, v in replace:
            date = date.replace(k, v).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                      for x in (start, end))

        event = Event(name=title, location=ret['Where:'], when=start, end=end)
        event.add_source(url)
        yield event
Example #32
    def scrape(self, chamber=None):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(name=title,
                          start_date=self._tz.localize(datetime.datetime.strptime(when,
                                                                                  '%b %d, %Y')),
                          location_name='State Capitol'
                          )
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                        '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name, type='committee',
                                      note='host')

            documents = doc.xpath('.//td')
            for document in documents:
                url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(
                        note=document.xpath('text()')[0],
                        url=url,
                        media_type='application/pdf'
                        )
                bills = document.xpath('@onclick')
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item('Bill up for discussion')
                        item.add_bill(bill_name)
            yield event
Example #33
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Example #34
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Exemple #35
    def scrape_event_page(self, event):
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        info = page.xpath("//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        when = dt.datetime.utcnow()
        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub("^@", "", start).strip()
        end = end.replace("-", "").strip()

        replace = [
            ('Apr', 'April'),
        ]

        skip = ["Occurs every"]

        for k, v in replace:
            # word-boundary sub so an already-expanded "April" is left alone
            date = re.sub(r"\b%s\b" % k, v, date).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p") for x in (start, end))

        event = Event(name=title, location=ret['Where:'], when=start, end=end)
        event.add_source(url)
        yield event
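# A sketch of the 'When:' handling above, with an assumed page value; the
# three-line "date / @ start / - end" shape is inferred from the regex calls.
import datetime as dt
import re

ret = {'When:': "April 3, 2019\n@ 6:00 PM\n- 8:00 PM"}
date, start, end = (x.strip() for x in ret['When:'].split("\n"))
start = re.sub("^@", "", start).strip()  # "6:00 PM"
end = end.replace("-", "").strip()       # "8:00 PM"
start = dt.datetime.strptime("%s %s" % (date, start), "%B %d, %Y %I:%M %p")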
Exemple #36
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception("expected exactly one description cell")
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr, start_time=when, classification='committee-meeting',
                          description=descr, location_name=where, timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(committee, {"chamber": "unknown",
                                                               "name": committee})
                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(
                    bill['bill_id'],
                    note=bill['type']
                )
            yield event
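# How the committee cell resolves above, as a self-contained sketch; short_ids
# here is a stand-in dict, not the scraper's real lookup table.
short_ids = {"JUD": {"chamber": "upper", "name": "Judiciary"}}
committee = "JUD/FIN"
committees = committee.split("/") if "/" in committee else [committee]
for code in committees:
    meta = short_ids.get(code, {"chamber": "unknown", "name": code})
    # "JUD" -> Judiciary; the unknown "FIN" falls back to the default dict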
Exemple #37
    def parse_event(self, row, chamber):
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath("string(Sponsor)").strip()

        if committee_code in self.COMMITTEES[chamber]:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                self.COMMITTEES[chamber][committee_code]["name"],
            )
        else:
            committee_name = "{} {}".format(
                self.COMMITTEES_PRETTY[chamber],
                'MISCELLANEOUS',
            )

        name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                              row.xpath("string(Title)").strip())

        # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
        if name == "":
            name = committee_name

        location = row.xpath("string(Location)").strip()

        # events with no location all seem to be committee hearings
        if location == "":
            location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

        start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
        # todo: do i need to self._TZ.localize() ?

        event = Event(start_date=start_date, name=name, location_name=location)

        event.add_source("http://w3.akleg.gov/index.php#tab4")

        if committee_code in self.COMMITTEES[chamber]:
            event.add_participant(committee_name,
                                  type="committee",
                                  note="host")

        for item in row.xpath("Agenda/Item"):
            agenda_desc = item.xpath("string(Text)").strip()
            if agenda_desc != "":
                agenda_item = event.add_agenda_item(description=agenda_desc)
                if item.xpath("BillRoot"):
                    bill_id = item.xpath("string(BillRoot)")
                    # AK Bill ids have a bunch of extra spaces
                    bill_id = re.sub(r"\s+", " ", bill_id)
                    agenda_item.add_bill(bill_id)

        yield event
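# Re the localize question above: dateutil returns a naive datetime when the
# string carries no offset, so localizing is usually needed before building
# the Event. A sketch with an assumed schedule string and timezone:
import dateutil.parser
import pytz

start_date = dateutil.parser.parse("2020-02-04T13:30:00")
start_date = pytz.timezone("US/Alaska").localize(start_date)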
Exemple #38
    def scrape_chamber(self, chamber, session):
        cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

        print_format = "%m/%d/%Y"
        now = dt.datetime.now()

        start = now.strftime(print_format)
        end = (now + timedelta(days=30)).strftime(print_format)
        url = event_page % (cha, start, end)

        page = self.lxmlize(url)

        committees = page.xpath(
            "//a[contains(@href,'Agendas?CommitteeId')]/@href")
        for comm in committees:
            comm_page = self.lxmlize(comm)
            meetings = comm_page.xpath(
                "//li[contains(@class, 'partialagendaitems')]")
            for meeting in meetings:
                heading, content = meeting.xpath("./ul/li")
                who, when = heading.text.split(" - ")
                meeting_title = "Scheduled meeting of %s" % who.strip()
                where_lines = content.text_content().split("\r\n")
                where = "\r\n".join([l.strip() for l in where_lines[6:9]])

                when = dt.datetime.strptime(when.strip(),
                                            "%m/%d/%Y %I:%M:%S %p")

                location = (where or '').strip() or "unknown"

                event = Event(name=meeting_title,
                              start_time=self._tz.localize(when),
                              timezone=self._tz.zone,
                              location_name=location,
                              description=meeting_title)

                event.add_participant(who.strip(),
                                      type='committee',
                                      note='host')
                event.add_source(url)

                # only scraping public hearing bills for now.
                bills = meeting.xpath(
                    ".//div[text() = 'Public Hearing']/following-sibling::li"
                    "[contains(@class, 'visible-lg')]")
                for bill in bills:
                    bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                    item = event.add_agenda_item(descr.strip())
                    item.add_bill(bill_id.strip())

                yield event
Exemple #39
    def scrape_lower_item(self, page):
        # print(lxml.etree.tostring(page, pretty_print=True))
        com = self.table_row_content(page, 'Committee:')
        when_date = self.table_row_content(page, 'Date:')
        when_time = self.table_row_content(page, 'Time:')
        location = self.table_row_content(page, 'Location:')

        if 'house hearing room' in location.lower():
            location = '{}, {}'.format(
                location,
                '201 W Capitol Ave, Jefferson City, MO 65101'
            )

        # fix some broken times, e.g. '12 :00'
        when_time = when_time.replace(' :', ':')

        # some times have extra info after the AM/PM
        if 'upon' in when_time:
            when_time = when_time.split('AM', 1)[0]
            when_time = when_time.split('PM', 1)[0]

        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )

        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )

        event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

        event.add_participant(
            com,
            type='committee',
            note='host',
        )

        # different from general MO link xpath due to the <b>
        house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
            'or contains(@href, "bill.aspx")]/b/text()'

        for bill_title in page.xpath(house_link_xpath):
            bill_no = bill_title.split('--')[0].strip()
            bill_no = bill_no.replace('HCS', '').strip()

            agenda_item = event.add_agenda_item(description=bill_title)
            agenda_item.add_bill(bill_no)

        yield event
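# The time cleanup above, end to end on an assumed raw value: the stray
# "12 :00" space is repaired, then trailing "upon ..." text is dropped.
when_time = "12 :00 PM upon conclusion of morning session"
when_time = when_time.replace(' :', ':')   # "12:00 PM upon conclusion ..."
if 'upon' in when_time:
    when_time = when_time.split('AM', 1)[0]
    when_time = when_time.split('PM', 1)[0]  # "12:00 "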
Exemple #40
    def scrape_page(self, url, session, chamber):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

        event = Event(description,
                      start_date=datetime,
                      location_name=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant(ctty_name, 'organization')

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            agenda_item = event.add_agenda_item(bill_id)
            agenda_item.add_bill(bill_id)

        return event
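# The AM/PM "space shim" above on an assumed value: the site renders times
# like "10:00AM", which %p cannot parse until a space is restored.
import datetime as dt
import re

raw = "Feb  4, 2020 10:00AM"
raw = re.sub(r"\s+", " ", raw)
for r, sub in {"AM": " AM", "PM": " PM"}.items():
    raw = raw.replace(r, sub)  # "Feb 4, 2020 10:00 AM"
when = dt.datetime.strptime(raw, "%b %d, %Y %I:%M %p")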
Exemple #41
    def parse_event(self, row, chamber):
        # sample event available at http://www.akleg.gov/apptester.html
        committee_code = row.xpath('string(Sponsor)').strip()
        committee_name = '{} {}'.format(
                self.COMMITTEES_PRETTY[chamber],
                self.COMMITTEES[chamber][committee_code]['name']
            )

        name = '{} {}'.format(
            self.COMMITTEES_PRETTY[chamber],
            row.xpath('string(Title)').strip()
        )

        # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
        if name == '':
            name = committee_name

        location = row.xpath('string(Location)').strip()

        # events with no location all seem to be committee hearings
        if location == '':
            location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

        start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
        # todo: do i need to self._TZ.localize() ?

        event = Event(
            start_date=start_date,
            name=name,
            location_name=location
        )

        event.add_source('http://w3.akleg.gov/index.php#tab4')

        event.add_participant(
            committee_name,
            type='committee',
            note='host',
        )

        for item in row.xpath('Agenda/Item'):
            agenda_desc = item.xpath('string(Text)').strip()
            if agenda_desc != '':
                agenda_item = event.add_agenda_item(description=agenda_desc)
                if item.xpath('BillRoot'):
                    bill_id = item.xpath('string(BillRoot)')
                    # AK Bill ids have a bunch of extra spaces
                    bill_id = re.sub(r'\s+', ' ', bill_id)
                    agenda_item.add_bill(bill_id)

        yield event
Exemple #42
    def scrape_chamber(self, chamber):
        url = utils.urls['events'][chamber]
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
            date_string = table.xpath('ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
            for row in table.xpath('tr'):
                time_string = row.xpath('td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
                description = row.xpath(
                    'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
                )[-1].text_content().strip()
                location = row.xpath(
                    'td[@class="CMS-MeetingDetail-Location"]'
                )[0].text_content().strip()
                committees = row.xpath('.//div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
                bills = row.xpath('.//a[contains(@href, "billinfo")]')

                try:
                    start_time = datetime.datetime.strptime(
                        '{} {}'.format(date_string, time_string),
                        '%m/%d/%Y %I:%M %p',
                    )
                except ValueError:
                    break

                event = Event(
                    name=description,
                    start_time=self._tz.localize(start_time),
                    location_name=location,
                    timezone=self._tz.zone,
                )
                event.add_source(url)

                if bills or committees:
                    item = event.add_agenda_item(description)
                    for bill in bills:
                        parsed = urllib.parse.urlparse(bill.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        item.add_bill('{}{} {}'.format(qs['body'][0], qs['type'][0], qs['bn'][0]))
                    for committee in committees:
                        parsed = urllib.parse.urlparse(committee.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        item.add_committee(
                            re.sub(r' \([S|H]\)$', '', committee.text),
                            id=qs.get('Code', [None])[0],
                        )

                yield event
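# Why the [0] indexing above is needed: parse_qs returns a list for every
# query key. The query string below is an assumption for illustration.
import urllib.parse

qs = urllib.parse.parse_qs("body=H&type=B&bn=1234")
# qs == {'body': ['H'], 'type': ['B'], 'bn': ['1234']}
bill = '{}{} {}'.format(qs['body'][0], qs['type'][0], qs['bn'][0])  # "HB 1234"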
Exemple #43
    def scrape_page(self, url, session, chamber):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

        tables = doc.xpath("//table[@cellpadding='3']")
        info = tables[0]
        rows = info.xpath(".//tr")
        metainf = {}
        for row in rows:
            tds = row.xpath(".//td")
            key = tds[0].text_content().strip()
            value = tds[1].text_content().strip()
            metainf[key] = value

        where = metainf['Location:']
        subject_matter = metainf['Subject Matter:']
        description = "{}, {}".format(ctty_name, subject_matter)

        datetime = metainf['Scheduled Date:']
        datetime = re.sub(r"\s+", " ", datetime)
        repl = {
            "AM": " AM",
            "PM": " PM"  # Space shim.
        }
        for r in repl:
            datetime = datetime.replace(r, repl[r])
        datetime = self.localize(
            dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

        event = Event(description, start_date=datetime, location_name=where)
        event.add_source(url)

        if ctty_name.startswith('Hearing Notice For'):
            ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
        event.add_participant(ctty_name, 'organization')

        bills = tables[1]
        for bill in bills.xpath(".//tr")[1:]:
            tds = bill.xpath(".//td")
            if len(tds) < 4:
                continue
            # First, let's get the bill ID:
            bill_id = tds[0].text_content()
            agenda_item = event.add_agenda_item(bill_id)
            agenda_item.add_bill(bill_id)

        return event
Exemple #44
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record['CommHouse']]
            except KeyError:
                self.warning('unknown committee code %s, skipping',
                             record['CommHouse'])
                continue

            description = 'Meeting of the {}'.format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record['Location'] or 'Statehouse',
            )
            item = None
            for bill in related_bills:
                item = item or event.add_agenda_item(description)
                item.add_bill(bill['bill_id'])

            event.add_committee(
                hr_name,
                id=record['CommHouse'],
                note='host',
            )
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')

            yield event
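# The bill-reference regex above, run against an assumed Comments value; the
# optional-hyphen group is captured but discarded when rebuilding the ids.
import re

description = "Discussion of A-1234 and S5678; S567 deferred"
bills = re.findall(r"(A|S)(-)?(\d{4})", description)
# -> [('A', '-', '1234'), ('S', '', '5678')]; "S567" needs four digits
ids = ["%s %s" % (b[0], b[2]) for b in bills]  # ['A 1234', 'S 5678']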
Exemple #45
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        self.initialize_committees(year_abr)
        records = self.access_to_csv("Agendas")
        for record in records:
            if record['Status'] != "Scheduled":
                continue
            description = record['Comments']
            related_bills = []

            for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
                related_bills.append({
                    "bill_id": "%s %s" % (bill[0], bill[2]),
                    "descr": description
                })

            date_time = "%s %s" % (record['Date'], record['Time'])
            date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

            try:
                hr_name = self._committees[record['CommHouse']]
            except KeyError:
                self.warning('unknown committee code %s, skipping', record['CommHouse'])
                continue

            description = 'Meeting of the {}'.format(hr_name)

            event = Event(
                name=description,
                start_date=self._tz.localize(date_time),
                location_name=record['Location'] or 'Statehouse',
            )
            item = None
            for bill in related_bills:
                item = item or event.add_agenda_item(description)
                item.add_bill(bill['bill_id'])

            event.add_committee(
                hr_name,
                id=record['CommHouse'],
                note='host',
            )
            event.add_source('http://www.njleg.state.nj.us/downloads.asp')

            yield event
Exemple #46
    def categorize_data(self, csv_data):
        return_objs = []
        Contribution = namedtuple('Contribution', self.csv_header_row.replace(' ', '_'))
        for line in csv_data.split('\n'):  # explicitly defining the delimiter because it otherwise fails on single-line input
            if not line:
                continue

            # cur_obj will be the person or organization that made the contribution
            cur_obj = None
            contribution = Contribution(*line.split(','))
            
            if contribution.Contributor_Type in self.business_contribution_types:
                cur_obj = Organization(contribution.Contributor_Name)
            elif contribution.Contributor_Type in self.individual_contribution_types:
                cur_obj = Person(contribution.Contributor_Name)
            elif contribution.Contributor_Type == 'Unknown/Anonymous':
                if contribution.Contributor_Name:  # ignore unnamed contributors
                    # these look like catch-all business contributions
                    cur_obj = Organization(contribution.Contributor_Name)
            if cur_obj:
                # cur_obj is not set when there was an anonymous/unknown
                # contribution without a Contributor_Name, so check that it
                # exists before adding to it
                cur_obj.add_source(url=self.search_url)
                cur_obj.source_identified = True
                if contribution.Contributor_Address:
                    cur_obj.add_contact_detail(type='address', value=contribution.Contributor_Address)
                if contribution.Employer_Name:
                    cur_obj.extras['Employer'] = contribution.Employer_Name
                if contribution.Employer_Occupation:
                    cur_obj.extras['Occupation'] = contribution.Employer_Occupation
                
                # recipient_obj is the organization that received the contribution
                recipient_obj = Organization(contribution.Receiving_Committee)
                recipient_obj.extras['Office'] = contribution.Office
                recipient_obj.extras['Filing Period'] = contribution.Filing_Period
                recipient_obj.extras['Fundtype'] = contribution.Fundtype

                # transaction is the event linking the donor and recipient
                transaction = Event('Contribution', contribution.Contribution_Date, 'EST', 'Maryland')  # EST and Maryland b/c MD
                transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
                transaction.extras['Contribution Type'] = contribution.Contribution_Type
                transaction.add_source(url=self.search_url)
                # transaction.source_identified = True
                transaction.participants.append(cur_obj.as_dict())
                transaction.participants.append(recipient_obj.as_dict())
                yield (cur_obj, recipient_obj, transaction)
            else:
                yield []
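# How the Contribution namedtuple above is built, with an assumed header row;
# spaces become underscores so CSV columns read as attributes.
from collections import namedtuple

csv_header_row = "Contributor Name,Contributor Type,Contribution Amount"
Contribution = namedtuple('Contribution', csv_header_row.replace(' ', '_'))
row = Contribution(*"JANE DOE,Individual,100.00".split(','))
assert row.Contributor_Type == 'Individual'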
Exemple #47
    def scrape(self):
        curdate = None
        page = self.lxmlize(CAL_PAGE)
        for el in page.xpath("//div[@id='Section1']/*"):
            if el.tag[0] == 'h':
                when = WHEN.findall(el.text_content())
                when = when[0] if when else None
                if when is None:
                    continue
                curdate = " ".join(when)

            if el.tag == 'p':  # could also check el.attrib.get('class') == 'MsoNormal'

                els = el.xpath("./*")
                agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
                agenda = agenda[0] if agenda else None
                if agenda is None:
                    continue

                info = self.cleanup(el.text_content())
                when = DT.findall(info)
                when = when[0] if when else None
                if when is None:
                    continue

                people = el.xpath(".//personname")
                places = el.xpath(".//place")
                time, ampm = when

                if curdate is None:
                    self.warning(
                        "Can't scrape, since I don't know what date it is")
                    continue

                tbuf = " ".join([curdate, time, ampm])
                obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

                try:
                    _, where = info.rsplit(u"–", 1)
                except ValueError:
                    continue

                where = where.replace(u"\xa0", " ")
                where = re.sub(r"\s+", " ", where).strip()
                where = re.sub("agenda$", "", where).strip()

                event = Event(name=info, when=obj, location=where)
                event.add_source(CAL_PAGE)
                yield event
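# The location split above keys on an en dash (U+2013), not a hyphen; a sketch
# with an assumed cleaned-up line:
info = u"Planning Commission \u2013 Council Chambers agenda"
_, where = info.rsplit(u"\u2013", 1)  # " Council Chambers agenda"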
Exemple #48
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
        year_slug = self.jurisdiction.get_year_slug(session)

        url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
            year_slug
        )

        json_data = self.get(url).text
        events = json.loads(json_data)["data"]

        for info in events:
            # Determine when the committee meets
            if (
                info["TimeSlot"] == ""
                or info["TimeSlot"] == "1"
                or info["TimeSlot"] == 1
            ):
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"], "%A, %B %d, %Y"
                )
                all_day = True
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + info["TimeSlot"],
                        "%A, %B %d, %Y, %I:%M %p",
                    )
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        info["MeetingDate"] + ", " + info["StartTime"],
                        "%A, %B %d, %Y, %I:%M %p",
                    )
                all_day = False

            event = Event(
                start_date=self.TIMEZONE.localize(start_time),
                all_day=all_day,
                name="Meeting of the {}".format(info["LongName"]),
                description="committee meeting",
                location_name="{0}, Room {1}".format(
                    info["BuildingName"], info["RoomNbr"]
                ),
            )
            event.add_source(url)
            event.add_committee(name=info["LongName"], note="host")

            yield event
Exemple #49
    def scrape_chamber(self, chamber):
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = (self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description)

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.items():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = [
                "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
                for bill in bill_ids
            ]

            # Dereference the committee_nr number and get display name.
            msg = "More than one committee meeting at (location, date) %r"
            msg = msg % ((location, date), )
            assert len(set(hearing.committee_nr
                           for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = "Committee Meeting: " + committee_name
            event = Event(name=desc,
                          start_date=date,
                          location_name=committee_name)
            for bill_id in bills:
                if "B" in bill_id:
                    type_ = "bill"
                else:
                    type_ = "resolution"
                item = event.add_agenda_item("consideration")
                item.add_bill(bill_id, note=type_)

            event.add_person(committee_name + " Committee", note="host")
            event.add_source("https://downloads.leginfo.legislature.ca.gov/")

            yield event
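# The bill-id rewrite above on an assumed database value: a numeric session
# prefix, then the measure type, then the number.
import re

bill = "20190AB123"
bill_id = "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()  # "AB 123"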
Exemple #50
    def scrape(self, session=None, chamber=None):
        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.get(url)
        page = csv.reader(StringIO(page.text), delimiter='|')

        for row in page:
            # Deal with embedded newline characters, which cause fake new rows
            LINE_LENGTH = 11
            while len(row) < LINE_LENGTH:
                row += next(page)

            desc = row[7].strip()

            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if match:

                comm = match.group(1).strip()
                comm = re.sub(r'\s+', ' ', comm)
                location = row[5].strip() or 'Unknown'
                when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
                when = self._tz.localize(when)
                # Only assign events to a session if they are in the same year
                # Given that session metadata have some overlap and
                # missing end dates, this is the best option available
                session_year = int(session[:4])
                if session_year != when.year:
                    continue

                description = "%s MEETING" % comm
                event = Event(
                        name=description,
                        start_time=when,
                        location_name=location,
                        description=description,
                        timezone=self._tz.zone
                )
                event.add_source(url)

                event.add_participant(comm, type='committee', note='host')
                # time = row[3].strip()
                # if time in TIMECODES:
                #     event['notes'] = TIMECODES[time]

                yield event
Exemple #51
    def scrape_chamber(self, chamber):
        grouped_hearings = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description

            date = self._tz.localize(hearing.hearing_date)

            chamber_abbr = location[0:3]
            event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

            if event_chamber != chamber:
                continue

            grouped_hearings[(location, date)].append(hearing)

        for ((location, date), hearings) in grouped_hearings.items():

            # Get list of bill_ids from the database.
            bill_ids = [hearing.bill_id for hearing in hearings]
            bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                     for bill in bill_ids]

            # Dereference the committee_nr number and get display name.
            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date),)
            assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            desc = 'Committee Meeting: ' + committee_name
            event = Event(
                name=desc,
                start_date=date,
                location_name=committee_name,
            )
            for bill_id in bills:
                if 'B' in bill_id:
                    type_ = 'bill'
                else:
                    type_ = 'resolution'
                item = event.add_agenda_item('consideration')
                item.add_bill(bill_id, note=type_)

            event.add_person(committee_name + ' Committee', note='host')
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            yield event
Exemple #52
    def scrape_events(self, chamber, event_id):
        url = '%s%s' % (self.upper_url, event_id)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")
        # some ids are empty
        if len(rows):
            table_data = rows[0].find('table')[1]

            for link in table_data.iterchildren('td'):
                td = link.xpath('//td[@class="ms-formbody"]')

                description = td[18].text
                when = td[19].text
                where = td[25].text
                # type = td[27].text
                meeting_lead = td[28].text

                when = datetime.datetime.strptime(when, "%m/%d/%Y  %H:%M %p")
                when = self._tz.localize(when)

                if where is None or where == "":
                    where = 'State House'
                event = Event(name=description,
                              start_date=when,
                              location_name=where)
                if td[20].text is None:
                    participants = [meeting_lead]
                else:
                    participants = td[20].text.split(';')
                if participants:
                    for participant in participants:
                        name = participant.strip().replace('HON.', '', 1)
                        if name != "":
                            event.add_participant(name,
                                                  type='committee',
                                                  note='host')

                event.add_source(url)
                yield event
        else:
            # hack so we don't fail on the first id numbers, where there are
            # gaps between the numbers that work and those that don't.
            if event_id > 1700:
                raise Exception(
                    "Parsing is done we are on future ids that are not used yet."
                )
Exemple #53
    def scrape(self):
        curdate = None
        page = self.lxmlize(CAL_PAGE)
        for el in page.xpath("//div[@id='Section1']/*"):
            if el.tag[0] == 'h':
                when = WHEN.findall(el.text_content())
                when = when[0] if when else None
                if when is None:
                    continue
                curdate = " ".join(when)

            if el.tag == 'p':  # could also check el.attrib.get('class') == 'MsoNormal'

                els = el.xpath("./*")
                agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
                agenda = agenda[0] if agenda else None
                if agenda is None:
                    continue

                info = self.cleanup(el.text_content())
                when = DT.findall(info)
                when = when[0] if when else None
                if when is None:
                    continue

                people = el.xpath(".//personname")
                places = el.xpath(".//place")
                time, ampm = when

                if curdate is None:
                    self.warning("Can't scrape, since I don't know what date it is")
                    continue

                tbuf = " ".join([curdate, time, ampm])
                obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

                try:
                    _, where = info.rsplit(u"–", 1)
                except ValueError:
                    continue

                where = where.replace(u"\xa0", " ")
                where = re.sub(r"\s+", " ", where).strip()
                where = re.sub("agenda$", "", where).strip()

                event = Event(name=info, when=obj, location=where)
                event.add_source(CAL_PAGE)
                yield event
Exemple #54
    def scrape_chamber(self, chamber, session, start, end):
        page = self.get_xml(start, end)

        for row in xpath(page, '//wa:CommitteeMeeting'):
            event_cancelled = xpath(row, 'string(wa:Cancelled)')
            if event_cancelled == 'true':
                continue

            event_chamber = xpath(row, 'string(wa:Agency)')
            if self.chambers[event_chamber] != chamber:
                continue

            event_date = datetime.datetime.strptime(
                xpath(row, 'string(wa:Date)'), "%Y-%m-%dT%H:%M:%S")
            event_date = self._tz.localize(event_date)
            event_com = xpath(row, 'string(wa:Committees/'
                                   'wa:Committee/wa:LongName)')
            agenda_id = xpath(row, 'string(wa:AgendaId)')
            notes = xpath(row, 'string(wa:Notes)')
            room = xpath(row, 'string(wa:Room)')
            building = xpath(row, 'string(wa:Building)')
            # XML has a wa:Address but it seems useless
            city = xpath(row, 'string(wa:City)')
            state = xpath(row, 'string(wa:State)')

            location = '{}, {}, {} {}'.format(
                room,
                building,
                city,
                state
            )

            event = Event(name=event_com, start_date=event_date,
                          location_name=location,
                          description=notes)

            source_url = 'https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}'.format(
                agenda_id)
            event.add_source(source_url)

            event.add_participant(event_com, type='committee', note='host')

            event.extras['agendaId'] = agenda_id

            self.scrape_agenda_items(agenda_id, event)

            yield event
Exemple #55
    def scrape_event_page(self, session, chamber, url, datetime):
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainfo = {}
        plaintext = ""
        for p in info:
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()
        committee = metainfo['COMMITTEE']
        where = metainfo['PLACE']
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainfo['PLACE'] = where.strip()
            metainfo['CHAIR'] = chair.strip()

        chair = None
        if "CHAIR" in metainfo:
            chair = metainfo['CHAIR']

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(
            name=committee,
            start_date=self._tz.localize(datetime),
            location_name=where
        )

        event.add_source(url)
        event.add_participant(committee, type='committee', note='host')
        if chair is not None:
            event.add_participant(chair, type='legislator', note='chair')

        for bill in bills:
            chamber, type, number = bill
            bill_id = "%s%s %s" % (chamber, type, number)
            item = event.add_agenda_item('Bill up for discussion')
            item.add_bill(bill_id)

        event.add_agenda_item(plaintext)

        yield event
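# The bill regex above against an assumed agenda excerpt; chamber letter,
# measure type, and number are reassembled into "HB 102"-style ids.
import re

plaintext = "Consideration of HB 102; also SR 17"
bills = re.findall(r"(S|J|H)(B|M|R) (\d+)", plaintext)
# -> [('H', 'B', '102'), ('S', 'R', '17')]
ids = ["%s%s %s" % b for b in bills]  # ['HB 102', 'SR 17']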
Exemple #56
    def scrape_events(self, chamber, event_id):
        url = '%s%s' % (self.upper_url, event_id)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")
        # some ids are empty
        if len(rows):
            table_data = rows[0].find('table')[1]

            for link in table_data.iterchildren('td'):
                td = link.xpath('//td[@class="ms-formbody"]')

                description = td[18].text
                when = td[19].text
                where = td[25].text
                # type = td[27].text
                meeting_lead = td[28].text

                when = datetime.datetime.strptime(when, "%m/%d/%Y  %H:%M %p")
                when = self._tz.localize(when)

                if where is None or where == "":
                    where = 'State House'
                event = Event(name=description,
                              start_date=when,
                              location_name=where)
                if td[20].text is None:
                    participants = [meeting_lead]
                else:
                    participants = td[20].text.split(';')
                if participants:
                    for participant in participants:
                        name = participant.strip().replace('HON.', '', 1)
                        if name != "":
                            event.add_participant(name, type='committee',
                                                  note='host')

                event.add_source(url)
                yield event
        else:
            # hack so we don't fail on the first id numbers, where there are
            # gaps between the numbers that work and those that don't.
            if event_id > 1700:
                raise Exception("Parsing is done we are on future ids that are not used yet.")